From b2745b88c8b83c3a58fffa3c0389a08a7484610e Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Tue, 21 Apr 2026 15:26:20 -0600 Subject: [PATCH 1/8] Add agent skills --- .apm/skills/benchmark-qed-autoe/SKILL.md | 180 +++++++++ .apm/skills/benchmark-qed-autoq/SKILL.md | 164 ++++++++ .apm/skills/benchmark-qed-setup/SKILL.md | 170 +++++++++ .../references/config-reference.md | 349 ++++++++++++++++++ apm.yml | 8 + benchmark_qed/autoq/cli.py | 12 +- 6 files changed, 877 insertions(+), 6 deletions(-) create mode 100644 .apm/skills/benchmark-qed-autoe/SKILL.md create mode 100644 .apm/skills/benchmark-qed-autoq/SKILL.md create mode 100644 .apm/skills/benchmark-qed-setup/SKILL.md create mode 100644 .apm/skills/benchmark-qed-setup/references/config-reference.md create mode 100644 apm.yml diff --git a/.apm/skills/benchmark-qed-autoe/SKILL.md b/.apm/skills/benchmark-qed-autoe/SKILL.md new file mode 100644 index 0000000..1480976 --- /dev/null +++ b/.apm/skills/benchmark-qed-autoe/SKILL.md @@ -0,0 +1,180 @@ +--- +name: benchmark-qed-autoe +description: > + Evaluate RAG system outputs using benchmark-qed scoring methods. Use when: + running pairwise comparisons, reference-based scoring, assertion-based + evaluation (flat or hierarchical), retrieval metrics, or statistical + significance tests on RAG outputs. Also use when the user wants to score, + compare, or evaluate RAG methods, measure retrieval quality, or run + significance tests on benchmark results — even if they don't say "autoe" + explicitly. +--- + +# Benchmark-QED Evaluation (autoe) + +Evaluate and compare RAG system outputs using LLM-judged scoring, assertion-based evaluation, and retrieval metrics — all with built-in statistical significance testing. + +## Prerequisites + +- Generated questions/assertions from the autoq pipeline (or your own) +- RAG method answer files (JSON, one per method per question set) +- A valid `settings.yaml` for the evaluation type +- LLM API key configured + +Run all commands with: +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed +``` + +## Evaluation Methods Overview + +| Method | Command | Best for | +|--------|---------|----------| +| Pairwise comparison | `autoe pairwise-scores` | Comparing two RAG methods head-to-head | +| Reference scoring | `autoe reference-scores` | Scoring against gold-standard answers | +| Assertion scoring | `autoe assertion-scores` | Evaluating with ground-truth assertions (single or multi-RAG) | +| Hierarchical assertions | `autoe hierarchical-assertion-scores` | Global + local assertion hierarchies | +| Retrieval metrics | `autoe retrieval-scores` | Precision, recall, fidelity of retrieval | +| Significance tests | `autoe assertion-significance` | Post-hoc significance on existing scores | + +## Commands + +### 1. Pairwise Scores + +Compare RAG methods using LLM-judged pairwise comparisons. 
+ +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe pairwise-scores [OPTIONS] +``` + +**Options:** +| Option | Default | Description | +|--------|---------|-------------| +| `--alpha` | `0.05` | P-value threshold for significance | +| `--exclude-criteria` | `[]` | Criteria to exclude (repeatable) | +| `--print-model-usage` | `false` | Print LLM token usage | + +**Config requires**: `base` (reference method), `others` (methods to compare), `question_sets`, `criteria`, `trials` (must be even), `llm_config`, `prompt_config` + +Default criteria: `comprehensiveness`, `diversity`, `empowerment`, `relevance` + +**Output**: `{question_set}_{base}--{other}.csv`, `win_rates.csv`, `winrates_sig_tests.csv` + +### 2. Reference Scores + +Score generated answers against reference (gold-standard) answers. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe reference-scores [OPTIONS] +``` + +**Config requires**: `reference`, `generated` (list), `criteria`, `score_min`/`score_max`, `trials`, `llm_config` + +Default criteria: `correctness`, `completeness`. Default score range: 1–10. + +**Output**: `reference_scores-{name}.csv`, `model_usage.json` + +### 3. Assertion Scores + +Evaluate RAG methods using assertion-based scoring. Auto-detects single-RAG vs multi-RAG config. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe assertion-scores [OPTIONS] +``` + +**Options:** +| Option | Default | Description | +|--------|---------|-------------| +| `--alpha` | `0.05` | Significance threshold (multi-RAG) | +| `--print-model-usage` | `false` | Print LLM token usage | + +**Auto-detection**: If the YAML contains a `rag_methods` key, it runs in multi-RAG mode with automated significance testing. Otherwise, single-RAG mode. + +**Single-RAG output**: `assertion_scores.csv`, `assertion_summary_by_question.csv`, `eval_summary.json` + +**Multi-RAG output**: Per-method scores + significance tests in structured `output_dir/` + +### 4. Hierarchical Assertion Scores + +Score hierarchical assertions (global assertions with supporting local assertions). + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe hierarchical-assertion-scores [OPTIONS] +``` + +**Modes**: `staged` (default — evaluate local first, then global) or `joint` (evaluate together) + +**Extra field**: `detect_discovery: true` enables detection of novel findings not covered by assertions. + +Also auto-detects single vs multi-RAG config (same as assertion-scores). + +### 5. Assertion Significance + +Run statistical significance tests on existing assertion scores (no LLM calls). + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe assertion-significance +``` + +**Config requires**: `output_dir`, `rag_methods`, `question_sets`, `alpha`, `correction_method` + +**Correction methods**: `holm` (default, recommended), `bonferroni`, `fdr_bh` + +### 6. Hierarchical Assertion Significance + +Significance tests on hierarchical assertion scores. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe hierarchical-assertion-significance +``` + +**Config requires**: `scores_dir`, `rag_methods`, `scores_filename_template`, `alpha`, `correction_method`, `output_dir` + +### 7. Generate Retrieval Reference + +Generate cluster relevance reference data for retrieval evaluation (one-off prep step). 
+ +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe generate-retrieval-reference +``` + +**Config requires**: `llm_config`, `embedding_config`, question source (`questions_path` or `question_sets`), `text_units_path` + +**Key settings**: `num_clusters`, `assessor_type` (`rationale` or `bing`), `semantic_neighbors`, `centroid_neighbors` + +### 8. Retrieval Scores + +Evaluate retrieval precision, recall, and fidelity for RAG methods. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe retrieval-scores +``` + +**Config requires**: `rag_methods`, `question_sets`, `reference_dir`, `text_units_path`, `output_dir` + +**Fidelity metrics**: `js` (Jensen-Shannon divergence) or `tvd` (total variation distance) + +## Workflow + +### Quick Evaluation (Assertion-Based) + +- [ ] Step 1: Verify questions and answers exist — list the workspace and confirm a `settings.yaml` (or `config.yaml`), question JSON files (typically under `output/`), and your RAG method answer JSONs are present. +- [ ] Step 2: Initialize eval config — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init autoe_assertion ./eval_workspace` +- [ ] Step 3: Configure settings.yaml with answer paths and assertion paths +- [ ] Step 4: Run evaluation — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe assertion-scores ./eval_workspace/settings.yaml ./eval_output` +- [ ] Step 5: Summarize results — read the CSVs in `` (e.g. `assertion_scores.csv`, `assertion_summary_by_question.csv`) and `eval_summary.json` directly. + +### Multi-RAG Comparison + +For comparing multiple RAG methods, use multi-RAG config format (include `rag_methods` key in YAML). This gives you automated pairwise significance testing. + +For the full config reference with all fields, read the config reference in the `/benchmark-qed-setup` skill: [../benchmark-qed-setup/references/config-reference.md](../benchmark-qed-setup/references/config-reference.md). + +## Gotchas + +- **Config auto-detection**: `assertion-scores` and `hierarchical-assertion-scores` detect single vs multi-RAG based on the `rag_methods` key in YAML. Ensure your config matches your intent. +- **Trials must be even**: For pairwise scores, `trials` must be even (for counterbalancing). Use 4 as default. +- **Stale outputs**: Several commands skip existing output files. Use a fresh output directory or delete specific files to force re-evaluation. +- **Output is in files**: All scores are written to CSV/JSON files. Parse output files, not CLI stdout. +- **Long-running**: Evaluation with many questions and trials can take hours. Use background execution. +- **No `config init` for hierarchical/retrieval**: `config init` only supports `autoe_assertion`, `autoe_pairwise`, and `autoe_reference`. For hierarchical and retrieval configs, create YAML manually using the config reference. diff --git a/.apm/skills/benchmark-qed-autoq/SKILL.md b/.apm/skills/benchmark-qed-autoq/SKILL.md new file mode 100644 index 0000000..90fe9bd --- /dev/null +++ b/.apm/skills/benchmark-qed-autoq/SKILL.md @@ -0,0 +1,164 @@ +--- +name: benchmark-qed-autoq +description: > + Generate benchmark questions and assertions from input data using + benchmark-qed. Use when: generating local, global, linked, or activity + questions for RAG benchmarking, creating assertions for existing questions, + computing assertion statistics, or running the autoq question generation + pipeline. 
Also use when the user wants to create a benchmark question set, + build evaluation questions from a dataset, or generate ground-truth + assertions — even if they don't say "autoq" explicitly. +--- + +# Benchmark-QED Question Generation (autoq) + +Generate benchmark questions and assertions from input data for RAG evaluation. + +## Prerequisites + +- A configured workspace with valid `settings.yaml` (use the `/benchmark-qed-setup` skill first) +- Input data (CSV or JSON) in the workspace `input/` directory +- Valid LLM API key in `.env` + +Run all commands with: +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed +``` + +## Commands + +### 1. Generate Questions (`autoq`) + +The main question generation pipeline. Generates benchmark questions from input data. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq [OPTIONS] +``` + +**Options:** +| Option | Description | +|--------|-------------| +| `--generation-types` | Specific types to generate (repeatable). CLI default: all except `data_linked`, but this skill always includes `data_linked` | +| `--print-model-usage` | Print LLM token usage stats | + +**Generation types and dependencies:** + +``` +data_local ← runs first (no dependencies) + ├── data_global ← requires data_local candidates + └── data_linked ← requires data_local candidates (not in CLI default, but this skill always includes it) + +activity_local ← auto-generates activity_context first + └── activity_global ← requires activity_local +``` + +> **Important**: `data_linked` is NOT included in the CLI's default generation types, but this skill always generates it by passing all types explicitly. If running the CLI manually, you must add `--generation-types data_linked`. + +> **Gotcha**: `data_global` and `data_linked` silently return empty results if `data_local` hasn't been run first. Always run `data_local` before these types. + +**Examples:** +```bash +# Run from the workspace directory (paths resolve relative to settings.yaml location) +cd ./workspace + +# Generate all types including data_linked (skill default) +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output \ + --generation-types data_local --generation-types data_global --generation-types data_linked \ + --generation-types activity_local --generation-types activity_global + +# Generate only local questions +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output --generation-types data_local + +# Generate local + linked questions +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output \ + --generation-types data_local --generation-types data_linked +``` + +**Output structure:** +``` +output_dir/ +├── sample_texts.parquet # Intermediate: clustered text samples +├── data_local_questions/ +│ ├── selected_questions.json # Final curated questions +│ ├── selected_questions_text.json # Human-readable version +│ └── candidate_questions.json # All generated candidates +├── data_global_questions/ # Same structure +├── data_linked_questions/ # Same structure + question_stats.json +├── activity_local_questions/ # Same structure +├── activity_global_questions/ # Same structure +├── context/ +│ └── activity_context_full.json # Generated activity context +└── model_usage.json # LLM token/cost tracking +``` + +### 2. 
Generate Assertions (`generate-assertions`) + +Generate ground-truth assertions for existing questions (decoupled from question generation). This is a **top-level** command, not a subcommand of `autoq`. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed generate-assertions [OPTIONS] +``` + +**Options:** +| Option | Description | +|--------|-------------| +| `--type` / `-t` | Assertion type: `local`, `global`, or `linked` (default: `local`) | +| `--print-model-usage` | Print LLM token usage stats | + +**Examples:** +```bash +# Run from the workspace directory (paths resolve relative to settings.yaml location) +cd ./workspace + +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed generate-assertions \ + settings.yaml \ + ./output/data_local_questions/candidate_questions.json \ + ./output/data_local_questions/ \ + --type local + +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed generate-assertions \ + settings.yaml \ + ./output/data_global_questions/candidate_questions.json \ + ./output/data_global_questions/ \ + --type global +``` + +### 3. Assertion Statistics (`assertion-stats`) + +Compute quality statistics for assertion files. This is a **top-level** command, not a subcommand of `autoq`. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed assertion-stats [OPTIONS] +``` + +**Options:** +| Option | Description | +|--------|-------------| +| `--output` / `-o` | Output path for stats JSON (auto-generated if omitted) | +| `--type` / `-t` | `global`, `map`, or `local` (auto-inferred if omitted) | +| `--quiet` / `-q` | Suppress console output | + +**Examples:** +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed assertion-stats ./output/assertions.json +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed assertion-stats ./output/data_global_questions/ -q +``` + +## Workflow + +### Standard Question Generation Flow + +- [ ] Step 1: Verify workspace is ready — confirm `settings.yaml`, `.env`, and `input/` exist in `` (the CLI will fail fast if anything is misconfigured). +- [ ] Step 2: `cd ` then run question generation — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output --generation-types data_local --generation-types data_global --generation-types data_linked --generation-types activity_local --generation-types activity_global` +- [ ] Step 3: Verify output artifacts — list `` and confirm the per-type `selected_questions.json` files (see "Output structure" above) plus `model_usage.json` exist. +- [ ] Step 4: (Optional) Generate additional assertions — use `generate-assertions` +- [ ] Step 5: (Optional) Check assertion quality — use `assertion-stats` + +## Gotchas + +- **Path resolution**: The `autoq` and `generate-assertions` commands resolve `output_dir` (and other relative paths) **relative to the settings.yaml file's directory**, not the current working directory. Always `cd` into the workspace directory first, or use absolute paths. For example, running `benchmark-qed autoq workspace/settings.yaml workspace/output` from the repo root creates output at `workspace/workspace/output/` (not `workspace/output/`). +- **Stale outputs**: The pipeline skips steps if output files already exist (`sample_texts.parquet`, `activity_context_full.json`). Use a fresh output directory for clean runs, or delete specific files to re-run a step. 
+- **Long-running**: Question generation with large datasets can take hours. Use background execution and monitor via `model_usage.json` presence. +- **Output is in files, not stdout**: All results are written to JSON/CSV/Parquet files. Parse the output files, not CLI stdout. +- **Generation ordering**: `data_global` and `data_linked` depend on `data_local`. `activity_global` depends on `activity_local`. Running dependent types without their prerequisites produces silent empty results. +- **`data_linked` CLI opt-in**: The CLI excludes `data_linked` by default, but this skill always includes it. If running the CLI manually outside this skill, add `--generation-types data_linked`. diff --git a/.apm/skills/benchmark-qed-setup/SKILL.md b/.apm/skills/benchmark-qed-setup/SKILL.md new file mode 100644 index 0000000..464732c --- /dev/null +++ b/.apm/skills/benchmark-qed-setup/SKILL.md @@ -0,0 +1,170 @@ +--- +name: benchmark-qed-setup +description: > + Initialize and configure benchmark-qed workspaces for RAG benchmarking. + Use when: setting up a new benchmarking project, initializing config files + for question generation or evaluation, downloading sample datasets, + or modifying benchmark-qed settings.yaml configuration. Also use when + the user mentions "benchmark-qed config", workspace setup, or needs to + prepare a benchmarking environment — even if they don't say "setup" explicitly. +--- + +# Benchmark-QED Workspace Setup + +Initialize workspaces, generate configuration files, download datasets, and manage settings for the benchmark-qed RAG benchmarking tool. + +## Prerequisites + +benchmark-qed requires Python 3.11+ and uv. Run commands with `uvx` to avoid installing globally: + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed +``` + +Pin a specific version for reproducibility: +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed@v1.2.3" benchmark-qed +``` + +If `uvx` is unavailable, install uv first: +```bash +pip install uv && uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed +``` + +## Procedure + +### Step 1 — Initialize a Workspace + +Generate a configuration workspace for the desired workflow type: + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init +``` + +**Config types** (pick one): +| Type | Purpose | +|------|---------| +| `autoq` | Question generation (includes all prompt templates) | +| `autoe_pairwise` | Pairwise comparison evaluation | +| `autoe_reference` | Reference-based scoring | +| `autoe_assertion` | Assertion-based scoring | + +Example: +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init autoq ./my_workspace +``` + +This creates: +``` +root/ +├── .env # API key placeholder +├── input/ # Place your data here +├── settings.yaml # Main configuration file +└── prompts/ # LLM prompt templates +``` + +### Step 2 — Download Sample Data (Optional) + +Download sample datasets for testing. 
This command has an interactive confirmation prompt with no `--yes` flag — use one of these approaches to avoid hanging:

**Bash/Linux/macOS:**
```bash
echo y | uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed data download
```

**PowerShell:**
```powershell
"y" | uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed data download
```

**Available datasets**: `AP_news`, `podcast`, `example_answers`

### Step 3 — Gather Configuration Choices from the User

Before writing any values into `settings.yaml`, **prompt the user with `ask_user`** to collect the LLM / auth / endpoint settings. Do not guess — these decisions are environment-specific and getting them wrong wastes downstream LLM calls. Use enum/boolean fields whenever possible so the user picks from a known set rather than typing free-form text.

Ask in **a single `ask_user` form** (split into two if the workflow is autoq, since autoq also needs an embedding model). Tailor the follow-up fields based on the provider/auth choice — if the first answer reveals an Azure provider, ask the Azure-only fields in a second form.

#### LLM (chat) fields to collect

| Field | Type | Options / examples | Notes |
|-------|------|--------------------|-------|
| `llm_provider` | enum | `openai.chat`, `azure.openai.chat`, `azure.inference.chat` | See provider table in [references/config-reference.md](references/config-reference.md). |
| `model` | string | `gpt-4.1`, `gpt-4o`, `o3-mini`, an Azure deployment name | For Azure providers this is the **deployment name**, not the base model id. |
| `auth_type` | enum | `api_key` (default), `azure_managed_identity` | `azure_managed_identity` is only valid for `azure.*` providers. |
| `api_key_env_var` | string | `OPENAI_API_KEY` (default), `AZURE_OPENAI_API_KEY`, … | Only ask when `auth_type=api_key`. The skill writes `${VAR_NAME}` into YAML and adds the variable to `.env`. |
| `azure_endpoint` | string (uri) | e.g. `https://my-resource.openai.azure.com/` | Only ask for `azure.*` providers. |
| `api_version` | string | e.g. `2024-06-01` | Only ask for `azure.openai.*` providers. |
| `concurrent_requests` | integer | default `4` | Optional; offer the default. |

#### Embedding fields to collect (autoq only)

Ask the same shape of questions for the embedding model:

| Field | Type | Notes |
|-------|------|-------|
| `embedding_provider` | enum (`openai.embedding`, `azure.openai.embedding`, `azure.inference.embedding`) | Must be an *embedding* provider. |
| `embedding_model` | string | e.g. `text-embedding-3-large`, or an Azure deployment name. |

Reuse `auth_type` / `api_key_env_var` / `azure_endpoint` / `api_version` from the chat answers unless the user wants different values — ask a yes/no `reuse_chat_auth` boolean first.

#### Input data fields (autoq only)

| Field | Type | Notes |
|-------|------|-------|
| `dataset_path` | string | Path to CSV/JSON dataset, e.g. `./input/data.csv`. |
| `input_type` | enum (`csv`, `json`) | |
| `text_column` | string | Column/key containing the text content. |

#### Eval-config-specific fields (autoe_*)

Only ask the questions relevant to the chosen `config_type`:
- `autoe_pairwise`: `base.name` + `base.answer_base_path`, plus a list of `others` (each with `name` and `answer_base_path`), and `question_sets`.
- `autoe_reference`: `reference.name` + `reference.answer_base_path`, list of `generated`, and `question_sets`.
- `autoe_assertion`: in single-RAG mode, `generated.name` + `generated.answer_base_path` and `assertions.assertions_path`. In multi-RAG mode (`rag_methods` provided), ask for `input_dir`, `output_dir`, `rag_methods` list, and `question_sets`.

If the user declines a field, fall back to the documented default and call out the assumption in your response.

### Step 4 — Apply the Answers

Use the answers from Step 3 to edit `settings.yaml` and `.env` directly:

```yaml
# LLM configuration (template — substitute values from ask_user answers)
chat_model:
  model: <model>
  llm_provider: <llm_provider>
  auth_type: <auth_type>
  api_key: ${<api_key_env_var>} # only when auth_type=api_key
  concurrent_requests: <concurrent_requests>
  init_args: # only for azure.* providers
    azure_endpoint: <azure_endpoint>
    api_version: "<api_version>" # azure.openai.* only

# Input data (autoq only)
input:
  dataset_path: <dataset_path>
  input_type: <input_type>
  text_column: <text_column>
```

**Rules when writing the YAML:**
- Omit `api_key` entirely when `auth_type=azure_managed_identity` — do not leave `${OPENAI_API_KEY}` in place.
- Omit `init_args` for non-Azure providers.
- Quote `api_version` (it would otherwise be parsed as a date).
- For `azure_managed_identity`, do **not** add anything to `.env` for that key.
- For `api_key` auth, append `<api_key_env_var>=` to `.env` if the variable is missing, and tell the user to replace the placeholder with their real key before running any command.

For the full set of optional fields, read [references/config-reference.md](references/config-reference.md).

### Step 5 — Validate Configuration

The benchmark-qed CLI validates `settings.yaml` via pydantic at startup, so any missing or malformed fields are reported when you run a command. After applying the answers, run the actual target command (e.g. `benchmark-qed autoq …`) — config errors surface immediately, before any LLM calls.

## Gotchas

- The `data download` command blocks on `typer.confirm()`. Always use `echo y | uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed data download ...` to prevent hanging.
- Environment variables in YAML use `${VAR_NAME}` syntax (resolved at load time via python-dotenv).
- The `.env` file must be in the workspace root directory, not the project root.
- Config types `autoe_pairwise`, `autoe_reference`, and `autoe_assertion` generate different settings.yaml templates — use the correct type for your evaluation method.
- Prompts are copied as `.txt` files using Python `string.Template` syntax (`$variable` or `${variable}`).
- **`prompts_config` vs `prompt_config`**: Some generated autoe configs may use `prompts_config`, but the runtime expects `prompt_config`. If you get config validation errors, rename the key.
diff --git a/.apm/skills/benchmark-qed-setup/references/config-reference.md b/.apm/skills/benchmark-qed-setup/references/config-reference.md
new file mode 100644
index 0000000..08a34a6
--- /dev/null
+++ b/.apm/skills/benchmark-qed-setup/references/config-reference.md
@@ -0,0 +1,349 @@
+# Configuration Reference
+
+Reference for benchmark-qed configuration fields. Load this file when you need to understand or modify specific config settings. Default values shown are from the source code Pydantic models.
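
Values such as `api_key: ${OPENAI_API_KEY}` use `${VAR_NAME}` substitution, resolved from the workspace `.env` file at load time (see the setup skill's gotchas). A minimal sketch — the model name and variable name below are just the defaults from this reference:

```yaml
chat_model:
  model: gpt-4.1              # default chat model from this reference
  llm_provider: openai.chat
  auth_type: api_key
  api_key: ${OPENAI_API_KEY}  # resolved from .env at load time; never hardcode the key
```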
+ +## autoq Configuration (`QuestionGenerationConfig`) + +### Input Configuration +```yaml +input: + dataset_path: ./input/data.csv # Path to input dataset (REQUIRED) + input_type: csv # csv or json + text_column: text # Column containing text content + metadata_columns: null # Optional list of metadata columns (e.g., [headline, date]) + file_encoding: utf-8 # File encoding (template uses utf-8-sig) +``` + +### Encoding Configuration +```yaml +encoding: + model_name: o200k_base # Tokenizer model + chunk_size: 600 # Tokens per chunk + chunk_overlap: 100 # Overlap between chunks +``` + +### Sampling Configuration +```yaml +sampling: + num_clusters: 50 # Number of clusters for sampling + num_samples_per_cluster: 10 # Samples per cluster + random_seed: 42 # Reproducibility seed +``` + +### LLM Configuration (shared across all commands) +```yaml +chat_model: + model: gpt-4.1 # Model name + auth_type: api_key # api_key | azure_managed_identity + api_key: ${OPENAI_API_KEY} # Required for api_key auth + llm_provider: openai.chat # Provider (see table below) + concurrent_requests: 4 # Parallel LLM requests + init_args: {} # Extra model init args (e.g., api_version, azure_endpoint) + call_args: # Extra model call args + temperature: 0.0 + seed: 42 + custom_providers: [] # Custom provider registrations + +embedding_model: + model: text-embedding-3-large # Embedding model (template default; code default is gpt-4.1) + llm_provider: openai.embedding # Must use an embedding provider + api_key: ${OPENAI_API_KEY} +``` + +### Question Generation Types + +All question types share a base config with `num_questions` (default: `50`) and `oversample_factor` (default: `2.0`). Type-specific fields are listed below. + +```yaml +data_local: + num_questions: 50 # Number of questions to generate + oversample_factor: 2.0 # Generate oversample_factor × num_questions candidates + +data_global: # Requires data_local to be run first + num_questions: 50 + oversample_factor: 2.0 + min_questions_in_context: 2 # Min local questions required to form global context + min_claim_count: 2 # Min claims required for global question + min_relevant_reference_count: 10 # Min relevant references for global question + enable_question_validation: true # Validate generated global questions + +data_linked: # Requires data_local; opt-in (not generated by default) + num_questions: 50 + oversample_factor: 2.0 + min_questions_per_entity: 2 # Min local questions sharing an entity to form a group + max_questions_per_entity: 3 # Max local questions per entity group + type_balance_weight: 0.5 # Weight for balancing linked question types + max_questions_to_generate: 2 # Max linked questions per entity group + entity_frequency_threshold: 2 # Min entity frequency to be considered + +activity_local: # Auto-generates activity_context first + num_questions: 50 + oversample_factor: 2.0 + num_personas: 5 # Number of personas to generate + num_tasks_per_persona: 5 # Tasks per persona + num_entities_per_task: 10 # Entities per task + +activity_global: # Requires activity_local + num_questions: 50 + oversample_factor: 2.0 + num_personas: 5 + num_tasks_per_persona: 5 + num_entities_per_task: 10 +``` + +### Assertion Configuration +```yaml +assertions: + local: + max_assertions: 20 # Max assertions per question (null = unlimited, 0 = disable) + enable_validation: true # Quality filtering via LLM validation + min_validation_score: 3 # Min score (1-5) to pass validation + max_source_count: 500 # Max source chunks to consider + concurrent_llm_calls: 8 # Concurrent 
LLM calls for validation + max_concurrent_questions: 8 # Parallel questions for assertion generation + global: + max_assertions: 20 + enable_validation: true + min_validation_score: 3 + max_source_count: 500 + batch_size: 100 # Batch size for map-reduce processing + map_data_tokens: 8000 # Max tokens per cluster in map step + reduce_data_tokens: 32000 # Max input tokens for reduce step + enable_semantic_grouping: true # Group similar claims before map step + validate_map_assertions: true # Validate map assertions before reduce + validate_reduce_assertions: true # Validate final assertions after reduce + concurrent_llm_calls: 8 + max_concurrent_questions: 2 + linked: + max_assertions: 20 + enable_validation: true + min_validation_score: 3 + max_source_count: 500 + concurrent_llm_calls: 8 + max_concurrent_questions: 2 + +concurrent_requests: 8 # Top-level concurrency for autoq pipeline +``` + +## autoe Pairwise Configuration (`PairwiseConfig`) +```yaml +base: + name: method_a # REQUIRED + answer_base_path: ./answers/method_a/ # REQUIRED + +others: + - name: method_b + answer_base_path: ./answers/method_b/ + +question_sets: + - data_local_questions + - data_global_questions + +criteria: # Default: comprehensiveness, diversity, empowerment, relevance + - name: comprehensiveness # Each criterion requires both name and description + description: "..." + - name: diversity + description: "..." + +trials: 4 # Must be even (counterbalancing) +llm_config: ... # Same LLM config structure as above +prompt_config: + user_prompt: prompts/pairwise_user.txt + system_prompt: prompts/pairwise_system.txt +``` + +## autoe Reference Configuration (`ReferenceConfig`) +```yaml +reference: + name: gold_standard # REQUIRED + answer_base_path: ./answers/reference/ # REQUIRED + +generated: + - name: method_a + answer_base_path: ./answers/method_a/ + +criteria: # Default: correctness, completeness + - name: correctness + description: "..." 
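  - name: completeness                   # second default criterion, added to complete the example
    description: "..."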
+ +score_min: 1 +score_max: 10 +trials: 4 # Default is 4 (not 3) +``` + +## autoe Assertion Configuration + +### Single-RAG (`AssertionConfig`) +```yaml +generated: + name: method_a # REQUIRED + answer_base_path: ./answers/method_a/ # REQUIRED + +assertions: + assertions_path: ./questions/assertions.json # REQUIRED + +pass_threshold: 0.5 +trials: 4 # Default is 4 (not 3) +``` + +### Multi-RAG (`MultiRAGAssertionConfig`) +```yaml +input_dir: ./data # REQUIRED +output_dir: ./eval_output # REQUIRED +rag_methods: # REQUIRED + - method_a + - method_b + +question_sets: # REQUIRED + - data_local_questions + +assertions_filename_template: "{question_set}_assertions.json" +answers_path_template: "{input_dir}/{rag_method}/{question_set}.json" +question_text_key: question_text # Key for question text in JSON +answer_text_key: answer # Key for answer text in JSON + +pass_threshold: 0.5 +top_k_assertions: null # null = use all +trials: 4 + +run_significance_test: true +significance_alpha: 0.05 +significance_correction: holm # holm | bonferroni | fdr_bh + +run_clustered_permutation: false +n_permutations: 10000 # Number of permutations for clustered test +permutation_seed: null # null = random seed +``` + +## autoe Hierarchical Assertion Configuration + +### Single-RAG (`HierarchicalAssertionConfig`) +```yaml +generated: + name: method_a # REQUIRED + answer_base_path: ./answers/method_a/ # REQUIRED + +assertions: + assertions_path: ./assertions.json # REQUIRED + +mode: staged # staged (default) or joint +detect_discovery: true # Detect novel findings not in assertions +pass_threshold: 0.5 +trials: 4 +``` + +### Multi-RAG (`MultiRAGHierarchicalAssertionConfig`) +```yaml +input_dir: ./data # REQUIRED +output_dir: ./eval_output # REQUIRED +rag_methods: # REQUIRED + - method_a + - method_b +assertions_file: assertions.json # REQUIRED — assertions filename + +answers_path_template: "{input_dir}/{rag_method}/data_global.json" +question_id_key: question_id +question_text_key: question_text +answer_text_key: answer +supporting_assertions_key: supporting_assertions + +mode: staged # staged | joint +pass_threshold: 0.5 +trials: 4 + +run_significance_test: true +significance_alpha: 0.05 +significance_correction: holm + +run_clustered_permutation: false +n_permutations: 10000 +permutation_seed: null +``` + +## autoe Retrieval Reference Configuration (`RetrievalReferenceConfig`) +```yaml +# Provide EITHER questions_path OR question_sets (not both) +questions_path: ./questions/selected_questions.json +# OR for multiple question sets: +question_sets: + - name: data_local + questions_path: ./questions/data_local/selected_questions.json + +text_units_path: ./data/text_units.parquet # REQUIRED +output_dir: ./retrieval_reference # REQUIRED +clusters_path: null # Optional pre-computed clusters +num_clusters: null # int, list of ints, or null (auto) +save_clusters: true + +semantic_neighbors: 10 +centroid_neighbors: 5 +relevance_threshold: 2 # Min relevance score for a text unit +assessor_type: rationale # rationale or bing +concurrent_requests: 16 +max_questions: null # null = process all questions +cache_dir: null # Optional cache directory + +embedding_config: ... 
# LLM config for generating embeddings (if needed) + +text_unit_fields: + id_col: id + text_col: text + embedding_col: text_embedding # Set to null to auto-generate embeddings + short_id_col: short_id # Set to null to auto-generate from index +``` + +## autoe Retrieval Scores Configuration (`RetrievalScoresConfig`) +```yaml +rag_methods: + - name: method_a + retrieval_results_path: ./results/method_a/ + +question_sets: + - data_local_questions + +reference_dir: ./retrieval_reference # REQUIRED +reference_filename: reference.json # Filename within reference_dir subdirectories +clusters_path: ./clusters.parquet # REQUIRED +text_units_path: ./text_units.parquet # REQUIRED +output_dir: ./retrieval_eval # REQUIRED + +relevance_threshold: 2 +assessor_type: rationale # rationale or bing +fidelity_metric: js # js (Jensen-Shannon) or tvd +context_id_key: chunk_id # Key for chunk ID in retrieval results +context_text_key: text # Key for chunk text in retrieval results +cluster_match_by: text # Field to match clusters + +cache_dir: null # Optional cache directory + +run_significance_test: true +significance_alpha: 0.05 +significance_correction: holm +``` + +## LLM Providers Reference + +| Provider | Value | Use for | +|----------|-------|---------| +| OpenAI Chat | `openai.chat` | Chat/generation models | +| OpenAI Embedding | `openai.embedding` | Embedding models | +| Azure OpenAI Chat | `azure.openai.chat` | Azure-hosted chat models | +| Azure OpenAI Embedding | `azure.openai.embedding` | Azure-hosted embeddings | +| Azure Inference Chat | `azure.inference.chat` | Azure AI Inference chat | +| Azure Inference Embedding | `azure.inference.embedding` | Azure AI Inference embeddings | + +## Custom LLM Providers +```yaml +custom_providers: + - model_type: chat # chat or embedding + name: custom.chat # Matches llm_provider value + module: my_module.provider # Python module path + model_class: MyCustomChatModel # Class name +``` + +## Significance Test Options +| Correction | Description | +|------------|-------------| +| `holm` | Holm-Bonferroni (default, recommended) | +| `bonferroni` | Bonferroni (conservative) | +| `fdr_bh` | Benjamini-Hochberg FDR | diff --git a/apm.yml b/apm.yml new file mode 100644 index 0000000..27a7e3d --- /dev/null +++ b/apm.yml @@ -0,0 +1,8 @@ +name: benchmark-qed +version: 0.1.0 +description: APM project for benchmark-qed +author: Andres Morales Esquivel +dependencies: + apm: [] + mcp: [] +scripts: {} diff --git a/benchmark_qed/autoq/cli.py b/benchmark_qed/autoq/cli.py index 2c3f8ea..04f468d 100644 --- a/benchmark_qed/autoq/cli.py +++ b/benchmark_qed/autoq/cli.py @@ -741,13 +741,13 @@ def assertion_stats( Examples -------- # Generate stats for a single assertion file - benchmark-qed autoq assertion-stats output/assertions.json + benchmark-qed assertion-stats output/assertions.json # Generate stats for all assertion files in a directory - benchmark-qed autoq assertion-stats output/data_global_questions/ + benchmark-qed assertion-stats output/data_global_questions/ # Specify output path - benchmark-qed autoq assertion-stats assertions.json -o stats/my_stats.json + benchmark-qed assertion-stats assertions.json -o stats/my_stats.json """ from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.stats import ( generate_stats_for_assertion_file, @@ -940,17 +940,17 @@ def generate_assertions( Examples -------- # Generate local assertions for candidate questions - benchmark-qed autoq generate-assertions settings.yaml \ + benchmark-qed generate-assertions settings.yaml \ 
        output/data_local_questions/candidate_questions.json \
        output/data_local_questions/ --type local

    # Generate global assertions
-    benchmark-qed autoq generate-assertions settings.yaml \
+    benchmark-qed generate-assertions settings.yaml \
        output/data_global_questions/candidate_questions.json \
        output/data_global_questions/ --type global

    # Generate linked assertions
-    benchmark-qed autoq generate-assertions settings.yaml \
+    benchmark-qed generate-assertions settings.yaml \
        output/data_linked_questions/candidate_questions.json \
        output/data_linked_questions/ --type linked
    """

From d747707397c15af0b0efde21d1568742cd51b161 Mon Sep 17 00:00:00 2001
From: Andres Morales Esquivel
Date: Wed, 22 Apr 2026 15:45:26 -0600
Subject: [PATCH 2/8] Update skills + interactive setup

---
 .apm/skills/benchmark-qed-autoe/SKILL.md |   8 +-
 .apm/skills/benchmark-qed-autoq/SKILL.md |   3 +-
 .apm/skills/benchmark-qed-setup/SKILL.md |  29 +-
 .../references/config-reference.md       |  37 +
 .../minor-20260422214721711139.json      |   4 +
 benchmark_qed/__main__.py                |   4 +
 benchmark_qed/autoe/retrieval/scores.py  |   2 +-
 benchmark_qed/autoq/config.py            |   2 +-
 benchmark_qed/cli/init_config.py         |  30 +-
 benchmark_qed/cli/interactive.py         | 792 ++++++++++++++++++
 benchmark_qed/cli/scaffold.py            |  40 +
 benchmark_qed/cli/yaml_renderer.py       | 545 ++++++++++++
 tests/autoe/assertion/pipeline_test.py   |   4 +-
 tests/test_interactive_init.py           | 693 +++++++++++++++
 14 files changed, 2160 insertions(+), 33 deletions(-)
 create mode 100644 .semversioner/next-release/minor-20260422214721711139.json
 create mode 100644 benchmark_qed/cli/interactive.py
 create mode 100644 benchmark_qed/cli/scaffold.py
 create mode 100644 benchmark_qed/cli/yaml_renderer.py
 create mode 100644 tests/test_interactive_init.py

diff --git a/.apm/skills/benchmark-qed-autoe/SKILL.md b/.apm/skills/benchmark-qed-autoe/SKILL.md
index 1480976..432cf67 100644
--- a/.apm/skills/benchmark-qed-autoe/SKILL.md
+++ b/.apm/skills/benchmark-qed-autoe/SKILL.md
@@ -19,6 +19,6 @@ Evaluate and compare RAG system outputs using LLM-judged scoring, assertion-base
 - Generated questions/assertions from the autoq pipeline (or your own)
 - RAG method answer files (JSON, one per method per question set)
-- A valid `settings.yaml` for the evaluation type
+- A valid `settings.yaml` for the evaluation type (use the `benchmark-qed-setup` skill to initialize and configure the workspace)
 - LLM API key configured
 
 Run all commands with:
@@ -159,7 +160,7 @@ uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe
 ### Quick Evaluation (Assertion-Based)
 
 - [ ] Step 1: Verify questions and answers exist — list the workspace and confirm a `settings.yaml` (or `config.yaml`), question JSON files (typically under `output/`), and your RAG method answer JSONs are present.
-- [ ] Step 2: Initialize eval config — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init autoe_assertion ./eval_workspace`
+- [ ] Step 2: Initialize eval config — use the `benchmark-qed-setup` skill to create and configure an assertion evaluation workspace.
 - [ ] Step 3: Configure settings.yaml with answer paths and assertion paths
 - [ ] Step 4: Run evaluation — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe assertion-scores ./eval_workspace/settings.yaml ./eval_output`
 - [ ] Step 5: Summarize results — read the CSVs in `` (e.g. `assertion_scores.csv`, `assertion_summary_by_question.csv`) and `eval_summary.json` directly.
@@ -168,8 +169,6 @@ uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe
 
 ### Multi-RAG Comparison
 
 For comparing multiple RAG methods, use multi-RAG config format (include `rag_methods` key in YAML). This gives you automated pairwise significance testing.
 
-For the full config reference with all fields, read the config reference in the `/benchmark-qed-setup` skill: [../benchmark-qed-setup/references/config-reference.md](../benchmark-qed-setup/references/config-reference.md).
-
 ## Gotchas
 
 - **Config auto-detection**: `assertion-scores` and `hierarchical-assertion-scores` detect single vs multi-RAG based on the `rag_methods` key in YAML. Ensure your config matches your intent.
@@ -177,4 +176,4 @@ For the full config reference with all fields, read the config reference in the
 - **Stale outputs**: Several commands skip existing output files. Use a fresh output directory or delete specific files to force re-evaluation.
 - **Output is in files**: All scores are written to CSV/JSON files. Parse output files, not CLI stdout.
 - **Long-running**: Evaluation with many questions and trials can take hours. Use background execution.
-- **No `config init` for hierarchical/retrieval**: `config init` only supports `autoe_assertion`, `autoe_pairwise`, and `autoe_reference`. For hierarchical and retrieval configs, create YAML manually using the config reference.
+- **No `config init` for hierarchical/retrieval**: The `benchmark-qed-setup` skill only supports `autoe_assertion`, `autoe_pairwise`, and `autoe_reference`. For hierarchical, multi-RAG, and other advanced config types, create YAML manually, consulting the `benchmark-qed-setup` skill for configuration guidance.
diff --git a/.apm/skills/benchmark-qed-autoq/SKILL.md b/.apm/skills/benchmark-qed-autoq/SKILL.md
index 90fe9bd..4443213 100644
--- a/.apm/skills/benchmark-qed-autoq/SKILL.md
+++ b/.apm/skills/benchmark-qed-autoq/SKILL.md
@@ -17,6 +17,6 @@ Generate benchmark questions and assertions from input data for RAG evaluation.
 ## Prerequisites
 
-- A configured workspace with valid `settings.yaml` (use the `/benchmark-qed-setup` skill first)
+- A configured workspace with valid `settings.yaml` (use the `benchmark-qed-setup` skill to initialize and configure)
 - Input data (CSV or JSON) in the workspace `input/` directory
 - Valid LLM API key in `.env`
 
@@ -148,7 +149,7 @@ uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed assert
 ### Standard Question Generation Flow
 
-- [ ] Step 1: Verify workspace is ready — confirm `settings.yaml`, `.env`, and `input/` exist in `` (the CLI will fail fast if anything is misconfigured).
+- [ ] Step 1: Initialize workspace if needed — use the `benchmark-qed-setup` skill to create and configure the workspace. Verify `settings.yaml`, `.env`, and `input/` exist.
 - [ ] Step 2: `cd ` then run question generation — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output --generation-types data_local --generation-types data_global --generation-types data_linked --generation-types activity_local --generation-types activity_global`
 - [ ] Step 3: Verify output artifacts — list `` and confirm the per-type `selected_questions.json` files (see "Output structure" above) plus `model_usage.json` exist.
- [ ] Step 4: (Optional) Generate additional assertions — use `generate-assertions` diff --git a/.apm/skills/benchmark-qed-setup/SKILL.md b/.apm/skills/benchmark-qed-setup/SKILL.md index 464732c..e2c382b 100644 --- a/.apm/skills/benchmark-qed-setup/SKILL.md +++ b/.apm/skills/benchmark-qed-setup/SKILL.md @@ -35,7 +35,23 @@ pip install uv && uvx --from "git+https://github.com/microsoft/benchmark-qed" be ### Step 1 — Initialize a Workspace -Generate a configuration workspace for the desired workflow type: +**Option A (Recommended): Interactive wizard** + +The interactive wizard guides you through configuration with sensible defaults: + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed init +``` + +This walks through: +- Config type selection (autoq, autoe_pairwise, autoe_reference, autoe_assertion) +- LLM provider selection with Azure-specific prompts (endpoint, API version) +- Section-by-section customization (press Enter to accept defaults) +- Automatic YAML validation before writing + +**Option B: Non-interactive (template-based)** + +Generate a static template and edit manually: ```bash uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init @@ -160,6 +176,15 @@ For the full set of optional fields, read [references/config-reference.md](refer The benchmark-qed CLI validates `settings.yaml` via pydantic at startup, so any missing or malformed fields are reported when you run a command. After applying the answers, run the actual target command (e.g. `benchmark-qed autoq …`) — config errors surface immediately, before any LLM calls. +## Best Practices + +See [references/config-reference.md](references/config-reference.md) for detailed best practices covering LLM configuration, prompts, question generation, assertion generation, evaluation, and retrieval. + +Key highlights: +- Use `${OPENAI_API_KEY}` env var substitution — never hardcode secrets +- Use `benchmark-qed init` (interactive wizard) to avoid manual YAML errors +- Pin a specific version of benchmark-qed for reproducibility in CI/CD + ## Gotchas - The `data download` command blocks on `typer.confirm()`. Always use `echo y | uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed data download ...` to prevent hanging. @@ -167,4 +192,4 @@ The benchmark-qed CLI validates `settings.yaml` via pydantic at startup, so any - The `.env` file must be in the workspace root directory, not the project root. - Config types `autoe_pairwise`, `autoe_reference`, and `autoe_assertion` generate different settings.yaml templates — use the correct type for your evaluation method. - Prompts are copied as `.txt` files using Python `string.Template` syntax (`$variable` or `${variable}`). -- **`prompts_config` vs `prompt_config`**: Some generated autoe configs may use `prompts_config`, but the runtime expects `prompt_config`. If you get config validation errors, rename the key. +- **`prompts_config` vs `prompt_config`**: The non-interactive `config init` for some autoe types generates `prompts_config`, but the runtime expects `prompt_config`. The interactive `benchmark-qed init` wizard avoids this issue. If using `config init`, rename the key if you get validation errors. 
diff --git a/.apm/skills/benchmark-qed-setup/references/config-reference.md b/.apm/skills/benchmark-qed-setup/references/config-reference.md index 08a34a6..e1f6ff2 100644 --- a/.apm/skills/benchmark-qed-setup/references/config-reference.md +++ b/.apm/skills/benchmark-qed-setup/references/config-reference.md @@ -347,3 +347,40 @@ custom_providers: | `holm` | Holm-Bonferroni (default, recommended) | | `bonferroni` | Bonferroni (conservative) | | `fdr_bh` | Benjamini-Hochberg FDR | + +## Best Practices + +### LLM Configuration +- Use `${OPENAI_API_KEY}` environment variable substitution — never hardcode secrets in YAML +- Use `azure_managed_identity` for production Azure deployments (omit `api_key` entirely) +- Set `temperature: 0.0` and `seed: 42` for reproducible LLM outputs +- Start with `concurrent_requests: 4`; increase based on your rate limit budget +- For Azure providers, always set `azure_endpoint` and `api_version` in `init_args` +- Quote `api_version` values: `"2024-12-01-preview"` (YAML would otherwise parse as a date) + +### Question Generation (autoq) +- **Wizard defaults vs model defaults**: The interactive wizard uses curated starter values (e.g., `num_questions: 10`, `num_clusters: 20`) suitable for initial exploration. The Pydantic model defaults (e.g., `num_questions: 50`, `num_clusters: 50`) are for production runs. Adjust based on your dataset size and budget. +- Keep `chunk_overlap` at 15–20% of `chunk_size` (default: 100/600 ≈ 17%) +- Use `oversample_factor: 2.0` to generate 2× candidates before filtering — lower values risk insufficient quality diversity +- Enable `enable_semantic_grouping: true` for global assertions to improve claim consolidation +- Set `max_concurrent_questions` lower for global (2) than local (8) — global processing is heavier per question + +### Assertion Generation +- Keep `max_assertions: 20` as a reasonable limit per question +- Enable validation (`enable_validation: true`) for production benchmarks — it filters low-quality assertions +- `min_validation_score: 3` (scale 1–5) provides a good baseline quality threshold +- Setting `max_assertions: 0` disables assertion generation entirely for that question type +- `max_source_count: 500` drops entire questions when exceeded — monitor for unexpected question drops + +### Evaluation (autoe) +- Trials must be **even** for pairwise and reference evaluation (counterbalancing) — the config validator rejects odd values +- Assertion and hierarchical evaluation do NOT require even trials +- Use `staged` mode for hierarchical assertions (more accurate); `joint` mode is cheaper but risks anchoring bias +- Use `holm` correction for significance testing (default) — balances power and error control +- Set `pass_threshold: 0.5` as the default quality bar; adjust based on assertion strictness + +### Retrieval Evaluation +- `assessor_type: rationale` (default) provides structured JSON with reasoning; `bing` uses the UMBRELA DNA prompt +- Match the assessor type between `generate-retrieval-reference` and `retrieval-scores` to share the cache +- `relevance_threshold: 2` on a 0–3 scale is a reasonable default — lower values include marginal matches +- Use `cache_dir` for iterative development to avoid redundant LLM calls across runs diff --git a/.semversioner/next-release/minor-20260422214721711139.json b/.semversioner/next-release/minor-20260422214721711139.json new file mode 100644 index 0000000..30272b6 --- /dev/null +++ b/.semversioner/next-release/minor-20260422214721711139.json @@ -0,0 +1,4 @@ +{ + "type": 
"minor", + "description": "Add benchmark-qed agentic skills with interactive configuration wizard and best practices" +} diff --git a/benchmark_qed/__main__.py b/benchmark_qed/__main__.py index d517689..3f5f9c5 100644 --- a/benchmark_qed/__main__.py +++ b/benchmark_qed/__main__.py @@ -9,6 +9,7 @@ from benchmark_qed.autoe.cli import app as autoe_cli from benchmark_qed.autoq.cli import app as autoq_cli from benchmark_qed.cli.init_config import app as init_cli +from benchmark_qed.cli.interactive import interactive_init from benchmark_qed.data.cli import app as data_cli app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) @@ -16,6 +17,9 @@ app.add_typer(autoe_cli, name="autoe", help="Relative scores CLI.") app.add_typer(autoq_cli, help="Question generation CLI.") app.add_typer(init_cli, name="config", help="Configuration initialization CLI.") +app.command(name="init", help="Interactively create a benchmark-qed configuration.")( + interactive_init +) app.add_typer(data_cli, name="data", help="Dataset downloader CLI.") diff --git a/benchmark_qed/autoe/retrieval/scores.py b/benchmark_qed/autoe/retrieval/scores.py index 42d92f4..647e174 100644 --- a/benchmark_qed/autoe/retrieval/scores.py +++ b/benchmark_qed/autoe/retrieval/scores.py @@ -654,7 +654,7 @@ async def run_retrieval_evaluation( retrieval_path = Path(rag_method["retrieval_results_path"]) # Check if path includes question_set placeholder - if "{question_set}" in str(retrieval_path): # noqa: RUF027 + if "{question_set}" in str(retrieval_path): retrieval_path = Path( str(retrieval_path).format(question_set=question_set) ) diff --git a/benchmark_qed/autoq/config.py b/benchmark_qed/autoq/config.py index d5fedfb..8d04722 100644 --- a/benchmark_qed/autoq/config.py +++ b/benchmark_qed/autoq/config.py @@ -682,7 +682,7 @@ class QuestionGenerationConfig(BaseModel): ) -class QuestionType(str): # noqa: FURB189 +class QuestionType(str): """Enumeration for question types that support assertion regeneration.""" __slots__ = () diff --git a/benchmark_qed/cli/init_config.py b/benchmark_qed/cli/init_config.py index 32f7884..a113ea3 100644 --- a/benchmark_qed/cli/init_config.py +++ b/benchmark_qed/cli/init_config.py @@ -33,6 +33,7 @@ from benchmark_qed.autoq.prompts.data_questions import ( local_questions as data_local_prompts, ) +from benchmark_qed.cli.scaffold import copy_prompts, ensure_input_folder, write_env_file app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) @@ -314,15 +315,11 @@ class ConfigType(StrEnum): def __copy_prompts(prompts_path: Path, output_path: Path) -> None: - """Copy prompts from the prompts directory to the output directory.""" - if not output_path.exists(): - output_path.mkdir(parents=True, exist_ok=True) - for prompt_file in prompts_path.iterdir(): - if prompt_file.is_file() and prompt_file.suffix == ".txt": - target_file = output_path / prompt_file.name - target_file.write_text( - prompt_file.read_text(encoding="utf-8"), encoding="utf-8" - ) + """Copy prompts from the prompts directory to the output directory. + + Delegates to the shared scaffold utility. + """ + copy_prompts(prompts_path, output_path) @app.command() @@ -338,13 +335,7 @@ def init( ], ) -> None: """Generate settings file.""" - input_folder = root / "input" - if not input_folder.exists(): - input_folder.mkdir(parents=True, exist_ok=True) - typer.echo(f"Input folder created at {input_folder}") - typer.echo( - "Please place your input files in the 'input' folder before running, or modify the settings.yaml to point to your input files." 
- ) + ensure_input_folder(root) settings = root / "settings.yaml" prompts_folder = root / "prompts" @@ -399,9 +390,4 @@ def init( typer.echo(f"Configuration file created at {settings}") - env_file = root / ".env" - if not env_file.exists(): - env_file.write_text("OPENAI_API_KEY=", encoding="utf-8") - typer.echo( - f"Change the OPENAI_API_KEY placeholder at {env_file} with your actual OPENAI_API_KEY." - ) + write_env_file(root) diff --git a/benchmark_qed/cli/interactive.py b/benchmark_qed/cli/interactive.py new file mode 100644 index 0000000..a9f00e1 --- /dev/null +++ b/benchmark_qed/cli/interactive.py @@ -0,0 +1,792 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""Interactive configuration wizard for benchmark-qed.""" + +from __future__ import annotations + +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Annotated, Any + +import typer +from rich import print as rich_print +from rich.panel import Panel +from rich.table import Table + +app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) + + +# --------------------------------------------------------------------------- +# Data types +# --------------------------------------------------------------------------- + + +@dataclass +class FieldDef: + """Definition of a configurable field shown to the user.""" + + name: str + description: str + default: Any + field_type: type = str + choices: list[str] | None = None + + +@dataclass +class ProviderResult: + """Result from the provider selection flow.""" + + llm_provider: str + model: str + auth_type: str + init_args: dict[str, Any] = field(default_factory=dict) + + +# --------------------------------------------------------------------------- +# Provider metadata +# --------------------------------------------------------------------------- + +CHAT_PROVIDERS: list[tuple[str, str, str]] = [ + ("openai.chat", "OpenAI", "OpenAI API (default)"), + ("azure.openai.chat", "Azure OpenAI", "Azure-hosted OpenAI models"), + ("azure.inference.chat", "Azure Inference", "Azure AI Inference endpoint"), +] + +EMBEDDING_PROVIDERS: list[tuple[str, str, str]] = [ + ("openai.embedding", "OpenAI", "OpenAI API (default)"), + ("azure.openai.embedding", "Azure OpenAI", "Azure-hosted OpenAI embeddings"), + ( + "azure.inference.embedding", + "Azure Inference", + "Azure AI Inference endpoint", + ), +] + +AUTH_TYPES: list[tuple[str, str]] = [ + ("api_key", "API Key"), + ("azure_managed_identity", "Azure Managed Identity"), +] + +DEFAULT_CHAT_MODEL = "gpt-4.1" +DEFAULT_EMBEDDING_MODEL = "text-embedding-3-large" +DEFAULT_API_VERSION = "2024-12-01-preview" + + +# --------------------------------------------------------------------------- +# Guard helpers +# --------------------------------------------------------------------------- + + +def check_tty() -> None: + """Abort if stdin is not a terminal (non-interactive context).""" + if not sys.stdin.isatty(): + typer.echo( + "Error: Interactive mode requires a terminal. " + "Use 'benchmark-qed config init' for non-interactive setup.", + err=True, + ) + raise typer.Exit(code=1) + + +def confirm_overwrite(path: typer.Path | Any) -> None: + """Ask for confirmation before overwriting an existing settings file.""" + from pathlib import Path as _Path + + p = _Path(str(path)) + if p.exists(): + typer.confirm( + f"{p} already exists. 
Overwrite?", + abort=True, + ) + + +# --------------------------------------------------------------------------- +# Selection / display primitives +# --------------------------------------------------------------------------- + + +def select_option( + title: str, + options: list[tuple[str, str]], +) -> str: + """Display numbered options and return the selected value. + + Parameters + ---------- + title: + Prompt title shown to the user. + options: + List of ``(value, label)`` tuples. + + Returns + ------- + The *value* string of the chosen option. + """ + rich_print(f"\n[bold]{title}[/bold]") + for idx, (_value, label) in enumerate(options, 1): + rich_print(f" [cyan][{idx}][/cyan] {label}") + + choice = typer.prompt( + "Select", + type=int, + default=1, + ) + if choice < 1 or choice > len(options): + typer.echo("Invalid choice. Defaulting to 1.") + choice = 1 + return options[choice - 1][0] + + +def show_section_defaults(title: str, fields: list[FieldDef]) -> None: + """Render a Rich table showing current default values for a section.""" + table = Table(title=title, show_header=False, padding=(0, 2)) + table.add_column("Field", style="cyan", min_width=28) + table.add_column("Default", style="green") + for f in fields: + table.add_row(f.name, str(f.default)) + rich_print(table) + + +def prompt_section( + title: str, + fields: list[FieldDef], +) -> dict[str, Any]: + """Show section defaults and optionally let the user customise them. + + Returns a dict mapping field names to their (possibly user-overridden) values. + """ + show_section_defaults(title, fields) + + if not typer.confirm("Customize this section?", default=False): + return {f.name: f.default for f in fields} + + result: dict[str, Any] = {} + for f in fields: + if f.choices: + value = select_option(f.description, [(c, c) for c in f.choices]) + else: + raw = typer.prompt( + f.name, + default=f.default, + type=f.field_type, + ) + value = raw + result[f.name] = value + return result + + +# --------------------------------------------------------------------------- +# Provider selection +# --------------------------------------------------------------------------- + + +def _prompt_azure_init_args(provider_value: str) -> dict[str, Any]: + """Prompt for Azure-specific init_args based on provider type.""" + init_args: dict[str, Any] = {} + + if "azure" in provider_value: + endpoint = typer.prompt("Azure endpoint URL") + init_args["azure_endpoint"] = endpoint + + if "azure.openai" in provider_value: + api_version = typer.prompt("API version", default=DEFAULT_API_VERSION) + init_args["api_version"] = api_version + + return init_args + + +def prompt_provider( + purpose: str = "chat", + *, + default_model: str | None = None, +) -> ProviderResult: + """Guide the user through LLM provider selection. + + Parameters + ---------- + purpose: + ``"chat"`` or ``"embedding"`` — determines available providers and default model. + default_model: + Override the default model name. If *None*, uses the standard default for the purpose. 
+ """ + providers = CHAT_PROVIDERS if purpose == "chat" else EMBEDDING_PROVIDERS + model_default = default_model or ( + DEFAULT_CHAT_MODEL if purpose == "chat" else DEFAULT_EMBEDDING_MODEL + ) + + provider_value = select_option( + f"Select {purpose} LLM provider", + [(val, label) for val, label, _desc in providers], + ) + + # Auth type + auth_type = select_option("Authentication type", AUTH_TYPES) + + # Provider-specific init args + init_args = _prompt_azure_init_args(provider_value) + + # Model name + model = typer.prompt("Model name", default=model_default) + + return ProviderResult( + llm_provider=provider_value, + model=model, + auth_type=auth_type, + init_args=init_args, + ) + + +def prompt_embedding_provider( + chat_result: ProviderResult, +) -> ProviderResult: + """Ask whether to reuse the chat provider for embeddings, or configure separately.""" + if typer.confirm("Use the same provider for embeddings?", default=True): + # Derive the embedding provider from the chat provider + mapping = { + "openai.chat": "openai.embedding", + "azure.openai.chat": "azure.openai.embedding", + "azure.inference.chat": "azure.inference.embedding", + } + emb_provider = mapping.get(chat_result.llm_provider, "openai.embedding") + return ProviderResult( + llm_provider=emb_provider, + model=DEFAULT_EMBEDDING_MODEL, + auth_type=chat_result.auth_type, + init_args=dict(chat_result.init_args), + ) + return prompt_provider("embedding") + + +# --------------------------------------------------------------------------- +# List collection +# --------------------------------------------------------------------------- + + +def prompt_list_items( + item_name: str, + field_defs: list[FieldDef], + *, + min_items: int = 1, +) -> list[dict[str, Any]]: + """Collect a list of items by prompting the user in a loop. + + Each iteration prompts for each field in *field_defs*, then asks + "Add another ?". + """ + items: list[dict[str, Any]] = [] + while True: + rich_print(f"\n[bold] {item_name} #{len(items) + 1}[/bold]") + item: dict[str, Any] = {} + for f in field_defs: + raw = typer.prompt(f" {f.name}", default=f.default, type=f.field_type) + item[f.name] = raw + items.append(item) + + if len(items) >= min_items: + if not typer.confirm(f"Add another {item_name}?", default=False): + break + else: + typer.echo(f" (need at least {min_items})") + return items + + +def prompt_comma_list(prompt_text: str, default: str = "") -> list[str]: + """Prompt for a comma-separated list and return split values.""" + raw = typer.prompt(prompt_text, default=default) + return [s.strip() for s in raw.split(",") if s.strip()] + + +# --------------------------------------------------------------------------- +# AutoQ interactive configuration +# --------------------------------------------------------------------------- + +_QUESTION_TYPES = [ + "data_local", + "data_global", + "data_linked", + "activity_local", + "activity_global", +] + + +def build_autoq_config() -> dict[str, Any]: + """Walk the user through AutoQ configuration and return a render-ready dict. + + The returned dictionary contains every value needed to render the AutoQ + YAML settings file. Keys are organised into logical sections that mirror + the wizard steps shown to the user. + """ + rich_print(Panel("[bold]AutoQ — Question Generation[/bold]", expand=False)) + + # ── 1. Chat LLM provider ────────────────────────────────────────────── + chat_result = prompt_provider("chat") + + # ── 2. 
Embedding LLM ────────────────────────────────────────────────── + embedding_result = prompt_embedding_provider(chat_result) + + # ── 3. Input section ────────────────────────────────────────────────── + input_fields = [ + FieldDef("dataset_path", "Path to input dataset", "./input"), + FieldDef("input_type", "Input file type", "json", choices=["csv", "json"]), + FieldDef("text_column", "Column containing text", "text"), + FieldDef( + "metadata_columns", + "Metadata columns (comma-separated)", + "", + ), + FieldDef("file_encoding", "File encoding", "utf-8-sig"), + ] + input_values = prompt_section("Input", input_fields) + + # Normalise metadata_columns to a list or None + raw_meta = input_values.get("metadata_columns", "") + if isinstance(raw_meta, str): + parts = [s.strip() for s in raw_meta.split(",") if s.strip()] + input_values["metadata_columns"] = parts or None + + # ── 4. Encoding section ─────────────────────────────────────────────── + encoding_fields = [ + FieldDef("model_name", "Tokeniser model name", "o200k_base"), + FieldDef("chunk_size", "Chunk size (tokens)", 600, field_type=int), + FieldDef("chunk_overlap", "Chunk overlap (tokens)", 100, field_type=int), + ] + encoding_values = prompt_section("Encoding", encoding_fields) + + # ── 5. Sampling section ─────────────────────────────────────────────── + sampling_fields = [ + FieldDef("num_clusters", "Number of clusters", 20, field_type=int), + FieldDef( + "num_samples_per_cluster", + "Samples per cluster", + 10, + field_type=int, + ), + FieldDef("random_seed", "Random seed", 42, field_type=int), + ] + sampling_values = prompt_section("Sampling", sampling_fields) + + # ── 6. Question Types section ───────────────────────────────────────── + qt_fields: list[FieldDef] = [] + for qt in _QUESTION_TYPES: + qt_fields.extend([ + FieldDef( + f"{qt}_num_questions", + f"{qt} — number of questions", + 10, + field_type=int, + ), + FieldDef( + f"{qt}_oversample_factor", + f"{qt} — oversample factor", + 2.0, + field_type=float, + ), + ]) + + qt_values = prompt_section("Question Types", qt_fields) + + # Reshape flat values into nested per-type dicts + question_types: dict[str, dict[str, Any]] = {} + customised_qt = qt_values != {f.name: f.default for f in qt_fields} + for qt in _QUESTION_TYPES: + question_types[qt] = { + "num_questions": qt_values[f"{qt}_num_questions"], + "oversample_factor": qt_values[f"{qt}_oversample_factor"], + } + + # ── 7. Activity question params (only when QT section was customised) ─ + activity_defaults = { + "num_personas": 5, + "num_tasks_per_persona": 2, + "num_entities_per_task": 5, + } + if customised_qt: + activity_fields = [ + FieldDef("num_personas", "Number of personas", 5, field_type=int), + FieldDef( + "num_tasks_per_persona", + "Tasks per persona", + 2, + field_type=int, + ), + FieldDef( + "num_entities_per_task", + "Entities per task", + 5, + field_type=int, + ), + ] + activity_values = prompt_section("Activity Question Params", activity_fields) + else: + activity_values = dict(activity_defaults) + + # ── 8. Assertions section ───────────────────────────────────────────── + assertions_fields = [ + FieldDef("max_assertions", "Max assertions per question", 20, field_type=int), + FieldDef( + "enable_validation", + "Enable assertion validation", + default=True, + field_type=bool, + ), + FieldDef( + "min_validation_score", + "Minimum validation score", + 3, + field_type=int, + ), + ] + + # Display defaults, then offer customisation. + # We handle the bool field (enable_validation) specially via typer.confirm. 
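+    # With the defaults accepted this returns, e.g.,
+    # {"max_assertions": 20, "enable_validation": True, "min_validation_score": 3}.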
+ show_section_defaults("Assertions", assertions_fields) + if not typer.confirm("Customize this section?", default=False): + assertions_values = {f.name: f.default for f in assertions_fields} + else: + assertions_values: dict[str, Any] = {} + for f in assertions_fields: + if f.field_type is bool: + assertions_values[f.name] = typer.confirm(f.name, default=f.default) + elif f.choices: + assertions_values[f.name] = select_option( + f.description, [(c, c) for c in f.choices] + ) + else: + assertions_values[f.name] = typer.prompt( + f.name, default=f.default, type=f.field_type + ) + + # ── 9. Concurrency ──────────────────────────────────────────────────── + concurrent_requests = typer.prompt("Concurrent requests", default=8, type=int) + + # ── Build final config dict ─────────────────────────────────────────── + return { + "chat_provider": chat_result, + "embedding_provider": embedding_result, + "input": input_values, + "encoding": encoding_values, + "sampling": sampling_values, + "question_types": question_types, + "activity_params": activity_values, + "assertions": assertions_values, + "concurrent_requests": concurrent_requests, + } + + +# --------------------------------------------------------------------------- +# AutoE interactive configuration flows +# --------------------------------------------------------------------------- + +_CONDITION_FIELDS = [ + FieldDef( + name="name", + description="Condition name", + default="", + field_type=str, + ), + FieldDef( + name="answer_base_path", + description="Path to answer files", + default="input/method_name", + field_type=str, + ), +] + + +def _prompt_condition(label: str) -> dict[str, Any]: + """Prompt for a single condition (name + answer_base_path).""" + rich_print(f"\n[bold] {label}[/bold]") + name = typer.prompt(" name", default="") + answer_base_path = typer.prompt(" answer_base_path", default="input/method_name") + return {"name": name, "answer_base_path": answer_base_path} + + +def _prompt_even_trials(default: int = 4) -> int: + """Prompt for a trial count and ensure it is even.""" + trials = typer.prompt("Number of trials (must be even)", default=default, type=int) + if trials % 2 != 0: + trials += 1 + typer.echo(f" Trials must be even — rounded up to {trials}.") + return trials + + +def build_autoe_pairwise_config() -> dict[str, Any]: + """Interactive flow for AutoE pairwise evaluation configuration.""" + rich_print(Panel("AutoE — Pairwise Evaluation")) + + # LLM provider + chat_provider = prompt_provider("chat") + + # Base condition + base = _prompt_condition("Base condition") + + # Other conditions + rich_print("\n[bold]Other conditions to compare against the base:[/bold]") + others = prompt_list_items("condition", _CONDITION_FIELDS, min_items=1) + + # Question sets + question_sets = prompt_comma_list( + "Question sets (comma-separated)", "activity_global, activity_local" + ) + + # Trials + trials = _prompt_even_trials() + + # Custom criteria + criteria: list[dict[str, Any]] | None = None + if typer.confirm("Add custom scoring criteria?", default=False): + criteria = prompt_list_items( + "criterion", + [ + FieldDef( + name="name", + description="Criterion name", + default="", + field_type=str, + ), + FieldDef( + name="description", + description="Criterion description", + default="", + field_type=str, + ), + ], + ) + + return { + "chat_provider": chat_provider, + "base": base, + "others": others, + "question_sets": question_sets, + "trials": trials, + "criteria": criteria, + } + + +def build_autoe_reference_config() -> dict[str, 
Any]: + """Interactive flow for AutoE reference evaluation configuration.""" + rich_print(Panel("AutoE — Reference Evaluation")) + + # LLM provider + chat_provider = prompt_provider("chat") + + # Reference condition + reference = _prompt_condition("Reference condition") + + # Generated conditions + rich_print("\n[bold]Generated conditions to evaluate:[/bold]") + generated = prompt_list_items("generated condition", _CONDITION_FIELDS, min_items=1) + + # Score range + score_min = typer.prompt("Score minimum", default=1, type=int) + score_max = typer.prompt("Score maximum", default=10, type=int) + + # Trials + trials = _prompt_even_trials() + + return { + "chat_provider": chat_provider, + "reference": reference, + "generated": generated, + "score_min": score_min, + "score_max": score_max, + "trials": trials, + } + + +def build_autoe_assertion_config() -> dict[str, Any]: + """Interactive flow for AutoE assertion evaluation configuration.""" + rich_print(Panel("AutoE — Assertion Evaluation")) + + # LLM provider + chat_provider = prompt_provider("chat") + + # Generated condition + generated = _prompt_condition("Generated condition") + + # Assertions path + assertions_path = typer.prompt( + "Path to assertions file", default="input/assertions.json" + ) + + # Pass threshold + pass_threshold = typer.prompt("Pass threshold", default=0.5, type=float) + + # Trials + trials = typer.prompt("Number of trials", default=4, type=int) + + return { + "chat_provider": chat_provider, + "generated": generated, + "assertions": {"assertions_path": assertions_path}, + "pass_threshold": pass_threshold, + "trials": trials, + } + + +# --------------------------------------------------------------------------- +# Config type metadata +# --------------------------------------------------------------------------- + +CONFIG_TYPE_OPTIONS: list[tuple[str, str]] = [ + ("autoq", "AutoQ — Question Generation"), + ("autoe_pairwise", "AutoE — Pairwise Evaluation"), + ("autoe_reference", "AutoE — Reference Evaluation"), + ("autoe_assertion", "AutoE — Assertion Evaluation"), +] + + +# --------------------------------------------------------------------------- +# Prompt copying orchestration +# --------------------------------------------------------------------------- + + +def _copy_prompts_for_config(config_type: str, prompts_folder: Path) -> None: + """Copy the appropriate prompt templates for the given config type.""" + from benchmark_qed.autod.prompts import summarization + from benchmark_qed.autoe.prompts import assertion as assertion_prompts + from benchmark_qed.autoe.prompts import pairwise as pairwise_prompts + from benchmark_qed.autoe.prompts import reference as reference_prompts + from benchmark_qed.autoq.prompts import data_questions as data_questions_prompts + from benchmark_qed.autoq.prompts.activity_questions import ( + activity_context as activity_context_prompts, + ) + from benchmark_qed.autoq.prompts.activity_questions import ( + global_questions as activity_global_prompts, + ) + from benchmark_qed.autoq.prompts.activity_questions import ( + local_questions as activity_local_prompts, + ) + from benchmark_qed.autoq.prompts.data_questions import ( + assertions as autoq_assertion_prompts, + ) + from benchmark_qed.autoq.prompts.data_questions import ( + global_questions as data_global_prompts, + ) + from benchmark_qed.autoq.prompts.data_questions import ( + linked_questions as data_linked_prompts, + ) + from benchmark_qed.autoq.prompts.data_questions import ( + local_questions as data_local_prompts, + ) + from 
benchmark_qed.cli.scaffold import copy_prompts + + match config_type: + case "autoq": + copy_prompts( + Path(summarization.__file__).parent, + prompts_folder / "summarization", + ) + copy_prompts( + Path(activity_context_prompts.__file__).parent, + prompts_folder / "activity_questions" / "activity_context", + ) + copy_prompts( + Path(activity_global_prompts.__file__).parent, + prompts_folder / "activity_questions" / "activity_global", + ) + copy_prompts( + Path(activity_local_prompts.__file__).parent, + prompts_folder / "activity_questions" / "activity_local", + ) + copy_prompts( + Path(data_global_prompts.__file__).parent, + prompts_folder / "data_questions" / "data_global", + ) + copy_prompts( + Path(data_local_prompts.__file__).parent, + prompts_folder / "data_questions" / "data_local", + ) + copy_prompts( + Path(data_linked_prompts.__file__).parent, + prompts_folder / "data_questions" / "data_linked", + ) + copy_prompts( + Path(data_questions_prompts.__file__).parent, + prompts_folder / "data_questions", + ) + copy_prompts( + Path(autoq_assertion_prompts.__file__).parent, + prompts_folder / "data_questions" / "assertions", + ) + case "autoe_pairwise": + copy_prompts(Path(pairwise_prompts.__file__).parent, prompts_folder) + case "autoe_reference": + copy_prompts(Path(reference_prompts.__file__).parent, prompts_folder) + case "autoe_assertion": + copy_prompts(Path(assertion_prompts.__file__).parent, prompts_folder) + + +# --------------------------------------------------------------------------- +# Main init command +# --------------------------------------------------------------------------- + + +@app.command() +def interactive_init( + root: Annotated[ + Path, + typer.Argument(help="The root directory for the new benchmark project."), + ], +) -> None: + """Interactively create a benchmark-qed configuration.""" + from benchmark_qed.cli.scaffold import ensure_input_folder, write_env_file + from benchmark_qed.cli.yaml_renderer import ( + render_autoe_assertion_yaml, + render_autoe_pairwise_yaml, + render_autoe_reference_yaml, + render_autoq_yaml, + validate_config, + ) + + check_tty() + + rich_print( + Panel( + "[bold]benchmark-qed[/bold] — Interactive Configuration Wizard", + subtitle="Press Enter to accept defaults", + ) + ) + + # 1. Select config type + config_type = select_option("Select configuration type", CONFIG_TYPE_OPTIONS) + + # 2. Run the appropriate builder + builders = { + "autoq": build_autoq_config, + "autoe_pairwise": build_autoe_pairwise_config, + "autoe_reference": build_autoe_reference_config, + "autoe_assertion": build_autoe_assertion_config, + } + config_dict = builders[config_type]() + + # 3. Render YAML + renderers = { + "autoq": render_autoq_yaml, + "autoe_pairwise": render_autoe_pairwise_yaml, + "autoe_reference": render_autoe_reference_yaml, + "autoe_assertion": render_autoe_assertion_yaml, + } + yaml_content = renderers[config_type](config_dict) + + # 4. Validate against Pydantic model + validate_config(yaml_content, config_type) + + # 5. Write files + root.mkdir(parents=True, exist_ok=True) + settings_path = root / "settings.yaml" + confirm_overwrite(settings_path) + settings_path.write_text(yaml_content, encoding="utf-8") + + prompts_folder = root / "prompts" + _copy_prompts_for_config(config_type, prompts_folder) + + ensure_input_folder(root) + write_env_file(root) + + # 6. 
Success summary + rich_print(f"\n[green]✅ Configuration created at {settings_path}[/green]") + rich_print(f"[green]✅ Prompt templates copied to {prompts_folder}/[/green]") + rich_print( + "[green]✅ .env file created — update OPENAI_API_KEY before running[/green]" + ) diff --git a/benchmark_qed/cli/scaffold.py b/benchmark_qed/cli/scaffold.py new file mode 100644 index 0000000..794a67e --- /dev/null +++ b/benchmark_qed/cli/scaffold.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""Shared scaffolding utilities for config initialization.""" + +from pathlib import Path + +import typer + + +def copy_prompts(prompts_path: Path, output_path: Path) -> None: + """Copy prompt template files from a source directory to an output directory.""" + if not output_path.exists(): + output_path.mkdir(parents=True, exist_ok=True) + for prompt_file in prompts_path.iterdir(): + if prompt_file.is_file() and prompt_file.suffix == ".txt": + target_file = output_path / prompt_file.name + target_file.write_text( + prompt_file.read_text(encoding="utf-8"), encoding="utf-8" + ) + + +def write_env_file(root: Path) -> None: + """Create a .env file with placeholder API key if it doesn't exist.""" + env_file = root / ".env" + if not env_file.exists(): + env_file.write_text("OPENAI_API_KEY=", encoding="utf-8") + typer.echo( + f"Change the OPENAI_API_KEY placeholder at {env_file} with your actual OPENAI_API_KEY." + ) + + +def ensure_input_folder(root: Path) -> None: + """Create the input folder if it doesn't exist.""" + input_folder = root / "input" + if not input_folder.exists(): + input_folder.mkdir(parents=True, exist_ok=True) + typer.echo(f"Input folder created at {input_folder}") + typer.echo( + "Please place your input files in the 'input' folder before running, " + "or modify the settings.yaml to point to your input files." + ) diff --git a/benchmark_qed/cli/yaml_renderer.py b/benchmark_qed/cli/yaml_renderer.py new file mode 100644 index 0000000..00166fa --- /dev/null +++ b/benchmark_qed/cli/yaml_renderer.py @@ -0,0 +1,545 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""YAML renderer for the interactive config wizard. + +Transforms structured dicts (from the interactive wizard) into well-formatted, +commented YAML strings using a template-based approach to preserve inline +comments and consistent formatting. +""" + +from __future__ import annotations + +from typing import Any + +import typer +import yaml + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + + +def _render_llm_section(provider_dict: dict[str, Any], indent: int = 2) -> str: + """Render an LLM configuration section as a YAML fragment. + + Parameters + ---------- + provider_dict: + Dict (or dataclass) with keys ``llm_provider``, ``model``, + ``auth_type``, ``init_args`` (from :class:`ProviderResult`). + indent: + Number of leading spaces for each line. 
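+
+    Examples
+    --------
+    A minimal sketch assuming an OpenAI ``api_key`` provider (Azure providers
+    additionally emit an ``init_args`` block):
+
+    >>> print(_render_llm_section({
+    ...     "llm_provider": "openai.chat",
+    ...     "model": "gpt-4.1",
+    ...     "auth_type": "api_key",
+    ...     "init_args": {},
+    ... }))
+      model: gpt-4.1
+      auth_type: api_key
+      api_key: ${OPENAI_API_KEY}
+      llm_provider: openai.chat
+      concurrent_requests: 4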
+ """ + import dataclasses + + if dataclasses.is_dataclass(provider_dict) and not isinstance(provider_dict, type): + provider_dict = dataclasses.asdict(provider_dict) + + pad = " " * indent + lines: list[str] = [] + + lines.extend([ + f"{pad}model: {provider_dict['model']}", + f"{pad}auth_type: {provider_dict['auth_type']}", + ]) + + if provider_dict["auth_type"] == "api_key": + lines.append(f"{pad}api_key: ${{OPENAI_API_KEY}}") + + lines.extend([ + f"{pad}llm_provider: {provider_dict['llm_provider']}", + f"{pad}concurrent_requests: 4", + ]) + + init_args = provider_dict.get("init_args") or {} + if init_args: + lines.append(f"{pad}init_args:") + for key, value in init_args.items(): + lines.append(f"{pad} {key}: {value}") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# AutoQ +# --------------------------------------------------------------------------- + + +def render_autoq_yaml(config: dict[str, Any]) -> str: + """Render a complete AutoQ ``settings.yaml`` from wizard configuration. + + Parameters + ---------- + config: + Dict with keys ``chat_provider``, ``embedding_provider``, ``input``, + ``encoding``, ``sampling``, ``question_types``, ``activity_params``, + ``assertions``, ``concurrent_requests``. + """ + inp = config["input"] + enc = config["encoding"] + samp = config["sampling"] + qt = config["question_types"] + ap = config["activity_params"] + assrt = config["assertions"] + concurrent = config["concurrent_requests"] + + # Metadata columns + meta = inp.get("metadata_columns") + if meta is not None and isinstance(meta, list) and len(meta) > 0: + meta_line = f" metadata_columns: [{', '.join(meta)}]" + else: + meta_line = "" + + chat_section = _render_llm_section(config["chat_provider"]) + embedding_section = _render_llm_section(config["embedding_provider"]) + + # Build metadata_columns block (include line only if present) + metadata_block = f"\n{meta_line}" if meta_line else "" + + return f"""\ +## Input Configuration +input: + dataset_path: {inp["dataset_path"]} + input_type: {inp["input_type"]} + text_column: {inp["text_column"]}{metadata_block} + file_encoding: {inp["file_encoding"]} + +## Encoder configuration +encoding: + model_name: {enc["model_name"]} + chunk_size: {enc["chunk_size"]} + chunk_overlap: {enc["chunk_overlap"]} + +## Sampling Configuration +sampling: + num_clusters: {samp["num_clusters"]} + num_samples_per_cluster: {samp["num_samples_per_cluster"]} + random_seed: {samp["random_seed"]} + +## LLM Configuration +chat_model: +{chat_section} + +embedding_model: +{embedding_section} + +## Question Generation Configuration +data_local: + num_questions: {qt["data_local"]["num_questions"]} + oversample_factor: {_fmt_float(qt["data_local"]["oversample_factor"])} +data_global: + num_questions: {qt["data_global"]["num_questions"]} + oversample_factor: {_fmt_float(qt["data_global"]["oversample_factor"])} +data_linked: + num_questions: {qt["data_linked"]["num_questions"]} + oversample_factor: {_fmt_float(qt["data_linked"]["oversample_factor"])} + min_questions_per_entity: 2 + max_questions_per_entity: 10 +activity_local: + num_questions: {qt["activity_local"]["num_questions"]} + oversample_factor: {_fmt_float(qt["activity_local"]["oversample_factor"])} + num_personas: {ap["num_personas"]} + num_tasks_per_persona: {ap["num_tasks_per_persona"]} + num_entities_per_task: {ap["num_entities_per_task"]} +activity_global: + num_questions: {qt["activity_global"]["num_questions"]} + oversample_factor: 
{_fmt_float(qt["activity_global"]["oversample_factor"])} + num_personas: {ap["num_personas"]} + num_tasks_per_persona: {ap["num_tasks_per_persona"]} + num_entities_per_task: {ap["num_entities_per_task"]} + +concurrent_requests: {concurrent} + +activity_questions_prompt_config: + activity_context_prompt_config: + data_summary_prompt_config: + summary_map_system_prompt: + prompt: prompts/summarization/summary_map_system_prompt.txt + summary_map_user_prompt: + prompt: prompts/summarization/summary_map_user_prompt.txt + summary_reduce_system_prompt: + prompt: prompts/summarization/summary_reduce_system_prompt.txt + summary_reduce_user_prompt: + prompt: prompts/summarization/summary_reduce_user_prompt.txt + activity_identification_prompt: + prompt: prompts/activity_questions/activity_context/activity_identification_prompt.txt + entity_extraction_map_system_prompt: + prompt: prompts/activity_questions/activity_context/entity_extraction_map_system_prompt.txt + entity_extraction_map_user_prompt: + prompt: prompts/activity_questions/activity_context/entity_extraction_map_user_prompt.txt + entity_extraction_reduce_system_prompt: + prompt: prompts/activity_questions/activity_context/entity_extraction_reduce_system_prompt.txt + entity_extraction_reduce_user_prompt: + prompt: prompts/activity_questions/activity_context/entity_extraction_reduce_user_prompt.txt + activity_global_prompt_config: + activity_global_gen_system_prompt: + prompt: prompts/activity_questions/activity_global/activity_global_gen_system_prompt.txt + activity_global_gen_user_prompt: + prompt: prompts/activity_questions/activity_global/activity_global_gen_user_prompt.txt + activity_local_prompt_config: + activity_local_gen_system_prompt: + prompt: prompts/activity_questions/activity_local/activity_local_gen_system_prompt.txt + activity_local_gen_user_prompt: + prompt: prompts/activity_questions/activity_local/activity_local_gen_user_prompt.txt + +data_questions_prompt_config: + claim_extraction_system_prompt: + prompt: prompts/data_questions/claim_extraction_system_prompt.txt + data_global_prompt_config: + data_global_gen_user_prompt: + prompt: prompts/data_questions/data_global/data_global_gen_user_prompt.txt + data_global_gen_system_prompt: + prompt: prompts/data_questions/data_global/data_global_gen_system_prompt.txt + data_local_prompt_config: + data_local_gen_system_prompt: + prompt: prompts/data_questions/data_local/data_local_gen_system_prompt.txt + data_local_expansion_system_prompt: + prompt: prompts/data_questions/data_local/data_local_expansion_system_prompt.txt + data_local_gen_user_prompt: + prompt: prompts/data_questions/data_local/data_local_gen_user_prompt.txt + data_linked_prompt_config: + bridge_question_system_prompt: + prompt: prompts/data_questions/data_linked/bridge_question_system_prompt.txt + comparison_question_system_prompt: + prompt: prompts/data_questions/data_linked/comparison_question_system_prompt.txt + intersection_question_system_prompt: + prompt: prompts/data_questions/data_linked/intersection_question_system_prompt.txt + linked_question_user_prompt: + prompt: prompts/data_questions/data_linked/linked_question_user_prompt.txt + batch_validation_prompt: + prompt: prompts/data_questions/data_linked/batch_validation_prompt.txt + +## Assertion Generation Configuration +assertions: + local: + max_assertions: {assrt["max_assertions"]} + enable_validation: {_fmt_bool(assrt["enable_validation"])} + min_validation_score: {assrt["min_validation_score"]} + concurrent_llm_calls: 8 + max_concurrent_questions: 8 + 
global: + max_assertions: {assrt["max_assertions"]} + enable_validation: {_fmt_bool(assrt["enable_validation"])} + min_validation_score: {assrt["min_validation_score"]} + batch_size: 100 + map_data_tokens: 8000 + reduce_data_tokens: 32000 + enable_semantic_grouping: true + validate_map_assertions: true + validate_reduce_assertions: true + concurrent_llm_calls: 8 + max_concurrent_questions: 2 + linked: + max_assertions: {assrt["max_assertions"]} + enable_validation: {_fmt_bool(assrt["enable_validation"])} + min_validation_score: {assrt["min_validation_score"]} + concurrent_llm_calls: 8 + max_concurrent_questions: 2 + +assertion_prompts: + local_assertion_gen_prompt: + prompt: prompts/data_questions/assertions/local_claim_assertion_gen_prompt.txt + global_assertion_map_prompt: + prompt: prompts/data_questions/assertions/global_claim_assertion_map_prompt.txt + global_assertion_reduce_prompt: + prompt: prompts/data_questions/assertions/global_claim_assertion_reduce_prompt.txt + local_validation_prompt: + prompt: prompts/data_questions/assertions/local_validation_prompt.txt + global_validation_prompt: + prompt: prompts/data_questions/assertions/global_validation_prompt.txt +""" + + +# --------------------------------------------------------------------------- +# AutoE - Pairwise +# --------------------------------------------------------------------------- + + +def render_autoe_pairwise_yaml(config: dict[str, Any]) -> str: + """Render a pairwise evaluation ``settings.yaml`` from wizard configuration. + + Parameters + ---------- + config: + Dict with keys ``chat_provider``, ``base``, ``others``, + ``question_sets``, ``trials``, ``criteria``. + """ + base = config["base"] + others = config["others"] + question_sets = config["question_sets"] + trials = config["trials"] + criteria = config.get("criteria") + + llm_section = _render_llm_section(config["chat_provider"]) + + # others entries + others_lines = "\n".join( + f" - name: {o['name']}\n answer_base_path: {o['answer_base_path']}" + for o in others + ) + + # question sets + qsets_lines = "\n".join(f" - {qs}" for qs in question_sets) + + # criteria block + if criteria is not None: + criteria_lines = "criteria:\n" + "\n".join( + f' - name: "{c["name"]}"\n description: "{c["description"]}"' + for c in criteria + ) + else: + criteria_lines = ( + "# criteria:\n" + '# - name: "criteria name"\n' + '# description: "criteria description"' + ) + + return f"""\ +## Input Configuration +base: + name: {base["name"]} + answer_base_path: {base["answer_base_path"]} +others: +{others_lines} +question_sets: +{qsets_lines} + +## Scoring Configuration +{criteria_lines} +trials: {trials} + +## LLM Configuration +llm_config: +{llm_section} + +prompts_config: + user_prompt: + prompt: prompts/pairwise_user_prompt.txt + system_prompt: + prompt: prompts/pairwise_system_prompt.txt +""" + + +# --------------------------------------------------------------------------- +# AutoE - Reference +# --------------------------------------------------------------------------- + + +def render_autoe_reference_yaml(config: dict[str, Any]) -> str: + """Render a reference evaluation ``settings.yaml`` from wizard configuration. + + Parameters + ---------- + config: + Dict with keys ``chat_provider``, ``reference``, ``generated``, + ``score_min``, ``score_max``, ``trials``. 
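+
+    Examples
+    --------
+    Illustrative only; the names and paths below are placeholders:
+
+    >>> cfg = {
+    ...     "chat_provider": {"llm_provider": "openai.chat", "model": "gpt-4.1",
+    ...                       "auth_type": "api_key", "init_args": {}},
+    ...     "reference": {"name": "golden", "answer_base_path": "input/golden"},
+    ...     "generated": [{"name": "method_a", "answer_base_path": "input/method_a"}],
+    ...     "score_min": 1,
+    ...     "score_max": 10,
+    ...     "trials": 4,
+    ... }
+    >>> yaml.safe_load(render_autoe_reference_yaml(cfg))["score_max"]
+    10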
+ """ + ref = config["reference"] + generated = config["generated"] + trials = config["trials"] + score_min = config.get("score_min", 1) + score_max = config.get("score_max", 10) + + llm_section = _render_llm_section(config["chat_provider"]) + + generated_lines = "\n".join( + f" - name: {g['name']}\n answer_base_path: {g['answer_base_path']}" + for g in generated + ) + + return f"""\ +## Input Configuration +reference: + name: {ref["name"]} + answer_base_path: {ref["answer_base_path"]} +generated: +{generated_lines} + +## Scoring Configuration +score_min: {score_min} +score_max: {score_max} +trials: {trials} + +## LLM Configuration +llm_config: +{llm_section} + +prompts_config: + user_prompt: + prompt: prompts/reference_user_prompt.txt + system_prompt: + prompt: prompts/reference_system_prompt.txt +""" + + +# --------------------------------------------------------------------------- +# AutoE - Assertion +# --------------------------------------------------------------------------- + + +def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: + """Render an assertion evaluation ``settings.yaml`` from wizard configuration. + + Parameters + ---------- + config: + Dict with keys ``chat_provider``, ``generated``, ``assertions``, + ``pass_threshold``, ``trials``. + """ + gen = config["generated"] + assertions = config["assertions"] + pass_threshold = config.get("pass_threshold", 0.5) + trials = config["trials"] + + llm_section = _render_llm_section(config["chat_provider"]) + + return f"""\ +## Input Configuration +generated: + name: {gen["name"]} + answer_base_path: {gen["answer_base_path"]} +assertions: + assertions_path: {assertions["assertions_path"]} + +pass_threshold: {pass_threshold} +trials: {trials} + +## LLM Configuration +llm_config: +{llm_section} + +prompts_config: + user_prompt: + prompt: prompts/assertion_user_prompt.txt + system_prompt: + prompt: prompts/assertion_system_prompt.txt +""" + + +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- + +_REQUIRED_KEYS: dict[str, list[str]] = { + "autoq": [ + "input", + "encoding", + "sampling", + "chat_model", + "embedding_model", + "data_local", + "data_global", + "data_linked", + "activity_local", + "activity_global", + "assertions", + ], + "autoe_pairwise": [ + "base", + "others", + "question_sets", + "trials", + "llm_config", + ], + "autoe_reference": [ + "reference", + "generated", + "score_min", + "score_max", + "trials", + "llm_config", + ], + "autoe_assertion": [ + "generated", + "assertions", + "pass_threshold", + "trials", + "llm_config", + ], +} + + +def validate_config(yaml_content: str, config_type: str) -> None: + """Validate generated YAML against expected structure. + + Parses the YAML and checks that required top-level keys exist and have + the correct types. Prompt file paths are **not** validated because they + are written to disk *after* the settings file is generated. + + Raises :class:`typer.BadParameter` on validation failure. + """ + try: + data = yaml.safe_load(yaml_content) + except yaml.YAMLError as exc: + msg = f"Generated YAML is not valid: {exc}" + raise typer.BadParameter(msg) from exc + + if not isinstance(data, dict): + msg = "Generated YAML root must be a mapping." 
+ raise typer.BadParameter(msg) + + required = _REQUIRED_KEYS.get(config_type) + if required is None: + msg = f"Unknown config type: {config_type!r}" + raise typer.BadParameter(msg) + + missing = [k for k in required if k not in data] + if missing: + msg = f"Missing required keys for {config_type}: {', '.join(missing)}" + raise typer.BadParameter(msg) + + # Type-check a few critical fields + try: + if config_type == "autoq": + _check_type(data, "input", dict) + _check_type(data, "encoding", dict) + _check_type(data, "sampling", dict) + _check_type(data, "chat_model", dict) + _check_type(data, "embedding_model", dict) + _check_type(data, "assertions", dict) + elif config_type == "autoe_pairwise": + _check_type(data, "base", dict) + _check_type(data, "others", list) + _check_type(data, "question_sets", list) + _check_type(data, "trials", int) + _check_type(data, "llm_config", dict) + elif config_type == "autoe_reference": + _check_type(data, "reference", dict) + _check_type(data, "generated", list) + _check_type(data, "trials", int) + _check_type(data, "llm_config", dict) + elif config_type == "autoe_assertion": + _check_type(data, "generated", dict) + _check_type(data, "assertions", dict) + _check_type(data, "trials", int) + _check_type(data, "llm_config", dict) + except typer.BadParameter: + raise + except Exception as exc: + msg = f"Validation error for {config_type}: {exc}" + raise typer.BadParameter(msg) from exc + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _check_type(data: dict[str, Any], key: str, expected: type) -> None: + """Raise :class:`typer.BadParameter` if *data[key]* is not *expected* type.""" + value = data[key] + if not isinstance(value, expected): + msg = f"Key '{key}' should be {expected.__name__}, got {type(value).__name__}" + raise typer.BadParameter(msg) + + +def _fmt_bool(value: Any) -> str: + """Format a Python boolean as a YAML boolean literal.""" + return "true" if value else "false" + + +def _fmt_float(value: Any) -> str: + """Format a number, ensuring floats keep a decimal point.""" + if isinstance(value, float): + return str(value) + # Integers that should display as floats (e.g. 2 -> 2.0) + return f"{float(value)}" diff --git a/tests/autoe/assertion/pipeline_test.py b/tests/autoe/assertion/pipeline_test.py index 774e51e..b129ad6 100644 --- a/tests/autoe/assertion/pipeline_test.py +++ b/tests/autoe/assertion/pipeline_test.py @@ -177,14 +177,14 @@ def test_non_dict_assertions_renamed(self, tmp_path: Path) -> None: """Non-dict assertions use the assertions_key rename path.""" # Build a DataFrame where assertions are plain strings but # supporting_assertions is a separate column - df = pd.DataFrame({ + test_df = pd.DataFrame({ "question_id": ["q1"], "question_text": ["What?"], "assertions": ["A plain assertion."], "supporting_assertions": [["SA1"]], }) path = tmp_path / "assertions.json" - df.to_json(path, orient="records") + test_df.to_json(path, orient="records") result = load_and_normalize_hierarchical_assertions(path) diff --git a/tests/test_interactive_init.py b/tests/test_interactive_init.py new file mode 100644 index 0000000..f515d4a --- /dev/null +++ b/tests/test_interactive_init.py @@ -0,0 +1,693 @@ +# Copyright (c) 2025 Microsoft Corporation. 
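+# These tests drive the wizard through scripted stdin using
+# typer.testing.CliRunner; check_tty is monkeypatched where needed.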
+"""Tests for the interactive init wizard and YAML renderers.""" + +from __future__ import annotations + +from typing import Any + +import pytest +import typer +import yaml +from typer.testing import CliRunner + +from benchmark_qed.__main__ import app +from benchmark_qed.cli.interactive import ( + prompt_comma_list, + select_option, +) +from benchmark_qed.cli.yaml_renderer import ( + _render_llm_section, + render_autoe_assertion_yaml, + render_autoe_pairwise_yaml, + render_autoe_reference_yaml, + render_autoq_yaml, + validate_config, +) + +# --------------------------------------------------------------------------- +# Shared factory helpers +# --------------------------------------------------------------------------- + + +def _openai_chat_provider() -> dict[str, Any]: + return { + "llm_provider": "openai.chat", + "model": "gpt-4.1", + "auth_type": "api_key", + "init_args": {}, + } + + +def _openai_embedding_provider() -> dict[str, Any]: + return { + "llm_provider": "openai.embedding", + "model": "text-embedding-3-large", + "auth_type": "api_key", + "init_args": {}, + } + + +def _azure_chat_provider() -> dict[str, Any]: + return { + "llm_provider": "azure.openai.chat", + "model": "gpt-4.1", + "auth_type": "api_key", + "init_args": { + "azure_endpoint": "https://example.openai.azure.com", + "api_version": "2024-12-01-preview", + }, + } + + +def _azure_managed_identity_provider() -> dict[str, Any]: + return { + "llm_provider": "azure.openai.chat", + "model": "gpt-4.1", + "auth_type": "azure_managed_identity", + "init_args": { + "azure_endpoint": "https://example.openai.azure.com", + "api_version": "2024-12-01-preview", + }, + } + + +def _default_autoq_config() -> dict[str, Any]: + return { + "chat_provider": _openai_chat_provider(), + "embedding_provider": _openai_embedding_provider(), + "input": { + "dataset_path": "./input", + "input_type": "json", + "text_column": "text", + "metadata_columns": None, + "file_encoding": "utf-8", + }, + "encoding": { + "model_name": "o200k_base", + "chunk_size": 600, + "chunk_overlap": 100, + }, + "sampling": { + "num_clusters": 20, + "num_samples_per_cluster": 10, + "random_seed": 42, + }, + "question_types": { + qt: {"num_questions": 10, "oversample_factor": 2.0} + for qt in [ + "data_local", + "data_global", + "data_linked", + "activity_local", + "activity_global", + ] + }, + "activity_params": { + "num_personas": 5, + "num_tasks_per_persona": 2, + "num_entities_per_task": 5, + }, + "assertions": { + "max_assertions": 20, + "enable_validation": True, + "min_validation_score": 3, + }, + "concurrent_requests": 8, + } + + +def _default_pairwise_config() -> dict[str, Any]: + return { + "chat_provider": _openai_chat_provider(), + "base": {"name": "baseline", "answer_base_path": "input/baseline"}, + "others": [ + {"name": "method_a", "answer_base_path": "input/method_a"}, + ], + "question_sets": ["activity_global", "activity_local"], + "trials": 4, + "criteria": None, + } + + +def _default_reference_config() -> dict[str, Any]: + return { + "chat_provider": _openai_chat_provider(), + "reference": {"name": "golden", "answer_base_path": "input/golden"}, + "generated": [ + {"name": "method_a", "answer_base_path": "input/method_a"}, + ], + "score_min": 1, + "score_max": 10, + "trials": 4, + } + + +def _default_assertion_config() -> dict[str, Any]: + return { + "chat_provider": _openai_chat_provider(), + "generated": {"name": "method_a", "answer_base_path": "input/method_a"}, + "assertions": {"assertions_path": "input/assertions.json"}, + "pass_threshold": 0.5, + 
"trials": 4, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# 1. YAML Renderer Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestYamlRenderers: + """Verify each renderer produces valid, parseable YAML.""" + + def test_render_autoq_yaml_produces_valid_yaml(self): + """render_autoq_yaml returns parseable YAML with all expected sections.""" + config = _default_autoq_config() + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed is not None + assert "input" in parsed + assert "chat_model" in parsed + assert "embedding_model" in parsed + assert "sampling" in parsed + assert parsed["concurrent_requests"] == 8 + + def test_render_autoq_yaml_includes_question_types(self): + """AutoQ YAML includes all five question type sections.""" + config = _default_autoq_config() + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + for qt in [ + "data_local", + "data_global", + "data_linked", + "activity_local", + "activity_global", + ]: + assert qt in parsed, f"Missing question type section: {qt}" + assert "num_questions" in parsed[qt] + + def test_render_autoq_yaml_includes_assertions(self): + """AutoQ YAML includes assertions section with local/global/linked.""" + config = _default_autoq_config() + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert "assertions" in parsed + for section in ["local", "global", "linked"]: + assert section in parsed["assertions"] + + def test_render_autoq_yaml_with_metadata_columns(self): + """Metadata columns are included when provided.""" + config = _default_autoq_config() + config["input"]["metadata_columns"] = ["source", "date"] + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed["input"]["metadata_columns"] == ["source", "date"] + + def test_render_autoq_yaml_without_metadata_columns(self): + """No metadata_columns key when None is provided.""" + config = _default_autoq_config() + config["input"]["metadata_columns"] = None + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert "metadata_columns" not in parsed["input"] + + def test_render_autoq_yaml_includes_prompt_configs(self): + """AutoQ YAML includes prompt configuration sections.""" + config = _default_autoq_config() + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert "activity_questions_prompt_config" in parsed + assert "data_questions_prompt_config" in parsed + assert "assertion_prompts" in parsed + + def test_render_autoe_pairwise_yaml(self): + """Pairwise YAML includes base, others, question_sets.""" + config = _default_pairwise_config() + yaml_content = render_autoe_pairwise_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed is not None + assert "base" in parsed + assert "others" in parsed + assert "question_sets" in parsed + assert parsed["trials"] == 4 + assert "llm_config" in parsed + + def test_render_autoe_pairwise_yaml_with_criteria(self): + """Pairwise YAML with custom criteria includes them.""" + config = _default_pairwise_config() + config["criteria"] = [ + {"name": "accuracy", "description": "Is the answer correct?"}, + ] + yaml_content = render_autoe_pairwise_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert "criteria" in parsed + assert len(parsed["criteria"]) == 1 + + def test_render_autoe_pairwise_yaml_no_criteria_is_commented(self): + """Pairwise 
YAML without criteria has commented-out criteria block.""" + config = _default_pairwise_config() + config["criteria"] = None + yaml_content = render_autoe_pairwise_yaml(config) + assert "# criteria:" in yaml_content + + def test_render_autoe_reference_yaml(self): + """Reference YAML includes score range.""" + config = _default_reference_config() + yaml_content = render_autoe_reference_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed is not None + assert "reference" in parsed + assert "generated" in parsed + assert parsed["score_min"] == 1 + assert parsed["score_max"] == 10 + assert "llm_config" in parsed + + def test_render_autoe_reference_yaml_multiple_generated(self): + """Reference YAML with multiple generated conditions.""" + config = _default_reference_config() + config["generated"].append({ + "name": "method_b", + "answer_base_path": "input/method_b", + }) + yaml_content = render_autoe_reference_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert len(parsed["generated"]) == 2 + + def test_render_autoe_assertion_yaml(self): + """Assertion YAML includes pass_threshold.""" + config = _default_assertion_config() + yaml_content = render_autoe_assertion_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed is not None + assert "generated" in parsed + assert "assertions" in parsed + assert parsed["pass_threshold"] == 0.5 + assert parsed["trials"] == 4 + assert "llm_config" in parsed + + +# ═══════════════════════════════════════════════════════════════════════════ +# 2. LLM Section Rendering Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestLlmRendering: + """Verify _render_llm_section output for different provider configs.""" + + def test_openai_provider_includes_api_key(self): + """OpenAI provider with api_key auth includes ${OPENAI_API_KEY}.""" + provider = _openai_chat_provider() + section = _render_llm_section(provider) + assert "${OPENAI_API_KEY}" in section + assert "model: gpt-4.1" in section + assert "llm_provider: openai.chat" in section + + def test_azure_provider_includes_init_args(self): + """Azure provider renders azure_endpoint and api_version in init_args.""" + provider = _azure_chat_provider() + section = _render_llm_section(provider) + assert "init_args:" in section + assert "azure_endpoint: https://example.openai.azure.com" in section + assert "api_version: 2024-12-01-preview" in section + + def test_managed_identity_omits_api_key(self): + """azure_managed_identity auth type does NOT include api_key line.""" + provider = _azure_managed_identity_provider() + section = _render_llm_section(provider) + assert "api_key:" not in section + assert "auth_type: azure_managed_identity" in section + + def test_openai_provider_no_init_args_block(self): + """OpenAI provider with empty init_args omits init_args block.""" + provider = _openai_chat_provider() + section = _render_llm_section(provider) + assert "init_args:" not in section + + def test_section_includes_concurrent_requests(self): + """Every LLM section includes concurrent_requests.""" + provider = _openai_chat_provider() + section = _render_llm_section(provider) + assert "concurrent_requests: 4" in section + + def test_custom_indent(self): + """Custom indent produces properly indented output.""" + provider = _openai_chat_provider() + section = _render_llm_section(provider, indent=4) + for line in section.split("\n"): + assert line.startswith(" ") + + +# ═══════════════════════════════════════════════════════════════════════════ +# 
3. Validation Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestValidation: + """Verify validate_config accepts valid YAML and rejects invalid YAML.""" + + def test_validate_config_passes_valid_autoq(self): + """Valid AutoQ YAML passes validation.""" + yaml_content = render_autoq_yaml(_default_autoq_config()) + validate_config(yaml_content, "autoq") + + def test_validate_config_passes_valid_pairwise(self): + """Valid pairwise YAML passes validation.""" + yaml_content = render_autoe_pairwise_yaml(_default_pairwise_config()) + validate_config(yaml_content, "autoe_pairwise") + + def test_validate_config_passes_valid_reference(self): + """Valid reference YAML passes validation.""" + yaml_content = render_autoe_reference_yaml(_default_reference_config()) + validate_config(yaml_content, "autoe_reference") + + def test_validate_config_passes_valid_assertion(self): + """Valid assertion YAML passes validation.""" + yaml_content = render_autoe_assertion_yaml(_default_assertion_config()) + validate_config(yaml_content, "autoe_assertion") + + def test_validate_config_rejects_invalid_yaml(self): + """Malformed YAML is rejected.""" + with pytest.raises(typer.BadParameter): + validate_config(": :\n bad: [", "autoq") + + def test_validate_config_rejects_missing_keys(self): + """YAML missing required keys is rejected.""" + with pytest.raises(typer.BadParameter, match="Missing required keys"): + validate_config("foo: bar\n", "autoq") + + def test_validate_config_rejects_wrong_type(self): + """YAML with wrong types is rejected.""" + yaml_content = render_autoq_yaml(_default_autoq_config()) + parsed = yaml.safe_load(yaml_content) + parsed["input"] = "not_a_dict" + bad_yaml = yaml.dump(parsed) + with pytest.raises(typer.BadParameter, match="should be dict"): + validate_config(bad_yaml, "autoq") + + def test_validate_config_rejects_unknown_type(self): + """Unknown config type is rejected.""" + with pytest.raises(typer.BadParameter, match="Unknown config type"): + validate_config("foo: bar\n", "unknown_type") + + def test_validate_config_rejects_non_mapping_root(self): + """YAML whose root is not a mapping is rejected.""" + with pytest.raises(typer.BadParameter, match="root must be a mapping"): + validate_config("- item1\n- item2\n", "autoq") + + +# ═══════════════════════════════════════════════════════════════════════════ +# 4. CLI Integration Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestCliIntegration: + """End-to-end CLI tests using typer.testing.CliRunner.""" + + @pytest.fixture(autouse=True) + def _patch_tty_check(self, monkeypatch): + """Disable the TTY check so CliRunner can drive the wizard.""" + monkeypatch.setattr("benchmark_qed.cli.interactive.check_tty", lambda: None) + + def test_init_autoq_creates_files(self, tmp_path): + """benchmark-qed init creates settings.yaml, prompts/, .env.""" + runner = CliRunner() + # Input sequence: see build_autoq_config for prompt order. 
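+        # Order: config type, chat provider, auth type, model, the
+        # embeddings-reuse confirm, one "Customize this section?" confirm
+        # per section, then the concurrent-requests prompt.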
+ input_lines = [ + "1", # config type: autoq + "1", # chat provider: OpenAI + "1", # auth type: api_key + "", # model: accept default gpt-4.1 + "Y", # use same provider for embeddings + "N", # customize input section + "N", # customize encoding section + "N", # customize sampling section + "N", # customize question types section + "N", # customize assertions section + "8", # concurrent requests + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, ( + f"CLI failed (code={result.exit_code}):\n{result.output}" + ) + assert (tmp_path / "settings.yaml").exists() + assert (tmp_path / "prompts").exists() + assert (tmp_path / ".env").exists() + + def test_init_autoq_settings_yaml_is_valid(self, tmp_path): + """The generated settings.yaml is parseable and contains expected keys.""" + runner = CliRunner() + input_lines = [ + "1", + "1", + "1", + "", + "Y", + "N", + "N", + "N", + "N", + "N", + "8", + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, result.output + + settings = yaml.safe_load( + (tmp_path / "settings.yaml").read_text(encoding="utf-8") + ) + assert "chat_model" in settings + assert "embedding_model" in settings + assert "input" in settings + + def test_init_autoe_pairwise_creates_files(self, tmp_path): + """Pairwise init creates correct files.""" + runner = CliRunner() + input_lines = [ + "2", # config type: autoe_pairwise + "1", # chat provider: OpenAI + "1", # auth type: api_key + "", # model: default + "baseline", # base condition name + "input/baseline", # base answer_base_path + "method_a", # other condition #1 name + "input/method_a", # other condition #1 answer_base_path + "N", # add another condition? no + "", # question sets: accept default + "4", # trials (even) + "N", # add custom criteria? no + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, f"CLI failed:\n{result.output}" + assert (tmp_path / "settings.yaml").exists() + assert (tmp_path / "prompts").exists() + + def test_init_autoe_reference_creates_files(self, tmp_path): + """Reference init creates correct files.""" + runner = CliRunner() + input_lines = [ + "3", # config type: autoe_reference + "1", # chat provider: OpenAI + "1", # auth type: api_key + "", # model: default + "golden", # reference condition name + "input/golden", # reference answer_base_path + "method_a", # generated condition #1 name + "input/method_a", # generated condition #1 answer_base_path + "N", # add another generated? 
no + "1", # score min + "10", # score max + "4", # trials + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, f"CLI failed:\n{result.output}" + assert (tmp_path / "settings.yaml").exists() + + def test_init_autoe_assertion_creates_files(self, tmp_path): + """Assertion init creates correct files.""" + runner = CliRunner() + input_lines = [ + "4", # config type: autoe_assertion + "1", # chat provider: OpenAI + "1", # auth type: api_key + "", # model: default + "method_a", # generated condition name + "input/method_a", # generated answer_base_path + "", # assertions path: accept default + "0.5", # pass threshold + "4", # trials + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, f"CLI failed:\n{result.output}" + assert (tmp_path / "settings.yaml").exists() + + +# ═══════════════════════════════════════════════════════════════════════════ +# 5. Overwrite Protection Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestOverwriteProtection: + """Verify overwrite protection for existing settings files.""" + + @pytest.fixture(autouse=True) + def _patch_tty_check(self, monkeypatch): + monkeypatch.setattr("benchmark_qed.cli.interactive.check_tty", lambda: None) + + def _autoq_default_input(self) -> str: + """Input sequence that walks through autoq with all defaults.""" + lines = ["1", "1", "1", "", "Y", "N", "N", "N", "N", "N", "8"] + return "\n".join(lines) + "\n" + + def test_init_warns_on_existing_settings(self, tmp_path): + """If settings.yaml exists, overwrite confirmation is prompted.""" + (tmp_path / "settings.yaml").write_text("existing", encoding="utf-8") + runner = CliRunner() + # Same autoq defaults + "y" for overwrite confirmation + input_text = self._autoq_default_input() + "y\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, f"CLI failed:\n{result.output}" + content = (tmp_path / "settings.yaml").read_text(encoding="utf-8") + assert content != "existing" + + def test_init_aborts_on_overwrite_decline(self, tmp_path): + """Declining overwrite aborts the command.""" + (tmp_path / "settings.yaml").write_text("existing", encoding="utf-8") + runner = CliRunner() + input_text = self._autoq_default_input() + "N\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code != 0 + content = (tmp_path / "settings.yaml").read_text(encoding="utf-8") + assert content == "existing" + + +# ═══════════════════════════════════════════════════════════════════════════ +# 6. 
Helper Function Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestHelpers: + """Tests for individual interactive helper functions.""" + + def test_prompt_comma_list_splits(self): + """prompt_comma_list splits comma-separated input.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + result_holder.extend(prompt_comma_list("Enter items", default="")) + + runner = CliRunner() + runner.invoke(test_app, input="alpha, beta, gamma\n") + assert result_holder == ["alpha", "beta", "gamma"] + + def test_prompt_comma_list_uses_default(self): + """prompt_comma_list uses default when input is empty.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + result_holder.extend(prompt_comma_list("Enter items", default="a, b")) + + runner = CliRunner() + runner.invoke(test_app, input="\n") + assert result_holder == ["a", "b"] + + def test_prompt_comma_list_strips_whitespace(self): + """prompt_comma_list strips whitespace from items.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + result_holder.extend(prompt_comma_list("Enter", default="")) + + runner = CliRunner() + runner.invoke(test_app, input=" x , y , z \n") + assert result_holder == ["x", "y", "z"] + + def test_select_option_returns_correct_value(self): + """select_option returns the value of the chosen option.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + val = select_option( + "Pick one", + [("val_a", "Label A"), ("val_b", "Label B")], + ) + result_holder.append(val) + + runner = CliRunner() + runner.invoke(test_app, input="2\n") + assert result_holder == ["val_b"] + + def test_select_option_out_of_range_defaults(self): + """Out-of-range selection defaults to option 1.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + val = select_option( + "Pick one", + [("first", "First"), ("second", "Second")], + ) + result_holder.append(val) + + runner = CliRunner() + runner.invoke(test_app, input="99\n") + assert result_holder == ["first"] + + def test_select_option_default_is_one(self): + """Empty input (Enter) defaults to option 1.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + val = select_option( + "Pick one", + [("default_val", "Default"), ("other", "Other")], + ) + result_holder.append(val) + + runner = CliRunner() + runner.invoke(test_app, input="\n") + assert result_holder == ["default_val"] From bb5ab56e0da3adfea8817e9dba54d1c5379a624c Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Wed, 22 Apr 2026 15:56:20 -0600 Subject: [PATCH 3/8] Undo change --- benchmark_qed/autoe/retrieval/scores.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark_qed/autoe/retrieval/scores.py b/benchmark_qed/autoe/retrieval/scores.py index 647e174..42d92f4 100644 --- a/benchmark_qed/autoe/retrieval/scores.py +++ b/benchmark_qed/autoe/retrieval/scores.py @@ -654,7 +654,7 @@ async def run_retrieval_evaluation( retrieval_path = Path(rag_method["retrieval_results_path"]) # Check if path includes question_set placeholder - if "{question_set}" in str(retrieval_path): + if "{question_set}" in str(retrieval_path): # noqa: RUF027 retrieval_path = Path( str(retrieval_path).format(question_set=question_set) ) From 
1c4382461b0ff4ae55a4e8d15c6b566cead69b80 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Fri, 24 Apr 2026 11:54:09 -0600 Subject: [PATCH 4/8] Ask for confirmation settings --- .apm/skills/benchmark-qed-setup/SKILL.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.apm/skills/benchmark-qed-setup/SKILL.md b/.apm/skills/benchmark-qed-setup/SKILL.md index e2c382b..f6f76f6 100644 --- a/.apm/skills/benchmark-qed-setup/SKILL.md +++ b/.apm/skills/benchmark-qed-setup/SKILL.md @@ -172,7 +172,26 @@ input: For the full set of optional fields, read [references/config-reference.md](references/config-reference.md). -### Step 5 — Validate Configuration +### Step 5 — Review Settings with the User + +After writing `settings.yaml`, **show the user the generated configuration** and ask if they want to customize anything. This is critical — the generated config uses sensible defaults, but users often need to tune dataset-specific or environment-specific values. + +1. Read the generated `settings.yaml` and display its contents to the user (use `show_file`). +2. Use `ask_user` with a boolean field: *"Would you like to customize any settings before proceeding?"* +3. If the user wants changes, use `ask_user` with a **free-text string field**: *"Describe what you'd like to change"* — let them say it in their own words (e.g., "increase num_questions to 50 for all types", "change the model to gpt-4o", "set trials to 6 and add a custom criterion"). Then apply the requested changes to `settings.yaml`. +4. After applying changes, show the updated file and ask again: *"Any other changes?"* (boolean). Repeat until the user says no. + +Do **not** limit the user to predefined sections — they should be able to modify any field in `settings.yaml` by describing what they want. + +**Sections the user is most likely to customize** (call these out): +- **autoq**: `num_questions` per type, `num_clusters`, `chunk_size`, assertion settings, `concurrent_requests` +- **autoe_pairwise**: `trials`, `criteria`, `question_sets` +- **autoe_reference**: `score_min`/`score_max`, `trials` +- **autoe_assertion**: `pass_threshold`, `trials` + +For the full set of optional fields and best practices, read [references/config-reference.md](references/config-reference.md). + +### Step 6 — Validate Configuration The benchmark-qed CLI validates `settings.yaml` via pydantic at startup, so any missing or malformed fields are reported when you run a command. After applying the answers, run the actual target command (e.g. `benchmark-qed autoq …`) — config errors surface immediately, before any LLM calls. 
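To make Step 6 concrete: a minimal smoke run, sketched under the assumption that the autoq command accepts the generated config path and an output directory as positional arguments (the exact signature is not confirmed here; check `--help`). Pydantic rejects missing or malformed fields before any LLM call is made.

```bash
# Hypothetical smoke run after `benchmark-qed init` (arguments are
# illustrative; confirm with `benchmark-qed autoq --help`).
uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed \
  autoq ./settings.yaml ./output
```

If the config is valid, the run proceeds to question generation; if not, the validation error names the offending field, so you can loop back to the review step above.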
From 3115fa326311fea04912472e81d5bc765c25e5c5 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Fri, 24 Apr 2026 16:18:05 -0600 Subject: [PATCH 5/8] Update skills to match config --- .../references/config-reference.md | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.apm/skills/benchmark-qed-setup/references/config-reference.md b/.apm/skills/benchmark-qed-setup/references/config-reference.md index e1f6ff2..b1d693e 100644 --- a/.apm/skills/benchmark-qed-setup/references/config-reference.md +++ b/.apm/skills/benchmark-qed-setup/references/config-reference.md @@ -38,6 +38,8 @@ chat_model: api_key: ${OPENAI_API_KEY} # Required for api_key auth llm_provider: openai.chat # Provider (see table below) concurrent_requests: 4 # Parallel LLM requests + azure_identity_scopes: # Azure identity scopes (azure_managed_identity only) + - https://cognitiveservices.azure.com/.default init_args: {} # Extra model init args (e.g., api_version, azure_endpoint) call_args: # Extra model call args temperature: 0.0 @@ -50,6 +52,24 @@ embedding_model: api_key: ${OPENAI_API_KEY} ``` +### Azure Identity Scopes + +When using `auth_type: azure_managed_identity`, the `azure_identity_scopes` field controls which OAuth scopes are requested from Azure Active Directory via `get_bearer_token_provider`. + +```yaml +azure_identity_scopes: + - https://cognitiveservices.azure.com/.default # Default — Azure Cognitive Services +``` + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `azure_identity_scopes` | `list[str]` | `["https://cognitiveservices.azure.com/.default"]` | OAuth scopes passed to `get_bearer_token_provider`. Only used when `auth_type` is `azure_managed_identity`. | + +**When to change this:** +- The default scope (`https://cognitiveservices.azure.com/.default`) works for standard Azure OpenAI deployments +- Use a custom scope if your Azure resource requires a different audience (e.g., private endpoints, sovereign clouds) +- Multiple scopes can be listed if your deployment requires more than one + ### Question Generation Types All question types share a base config with `num_questions` (default: `50`) and `oversample_factor` (default: `2.0`). Type-specific fields are listed below. From 520ad5567ed692478fade74c8a2be46c511ffb26 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Mon, 27 Apr 2026 16:04:18 -0600 Subject: [PATCH 6/8] Update skills to match config --- .../references/config-reference.md | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/.apm/skills/benchmark-qed-setup/references/config-reference.md b/.apm/skills/benchmark-qed-setup/references/config-reference.md index b1d693e..83282c3 100644 --- a/.apm/skills/benchmark-qed-setup/references/config-reference.md +++ b/.apm/skills/benchmark-qed-setup/references/config-reference.md @@ -7,11 +7,12 @@ Reference for benchmark-qed configuration fields. 
Load this file when you need t

### Input Configuration

```yaml
input:
-  dataset_path: ./input/data.csv # Path to input dataset (REQUIRED)
+  dataset_path: ./input/data.csv # Path to input dataset (when storage is configured, path within the container)
  input_type: csv # csv or json
  text_column: text # Column containing text content
  metadata_columns: null # Optional list of metadata columns (e.g., [headline, date])
  file_encoding: utf-8 # File encoding (template uses utf-8-sig)
+  storage: null # Optional StorageConfig for cloud storage (Azure Blob or Cosmos DB)
```

### Encoding Configuration
@@ -70,6 +71,47 @@ azure_identity_scopes:
- Use a custom scope if your Azure resource requires a different audience (e.g., private endpoints, sovereign clouds)
- Multiple scopes can be listed if your deployment requires more than one

+### Storage Configuration (Optional)
+
+All config types support optional cloud storage backends for reading input and writing output. When omitted, the local filesystem is used (default behavior).
+
+```yaml
+# AutoQ — input storage (inside the 'input' block)
+input:
+  dataset_path: ./input # When storage is set, this is the path within the container
+  storage: # Optional: read input from Azure Blob Storage
+    type: blob
+    container_name: my-datasets
+    connection_string: ${AZURE_STORAGE_CONNECTION_STRING} # Or use account_url for managed identity
+    # account_url: https://<account>.blob.core.windows.net
+    # base_dir: path/within/container
+
+# AutoQ/AutoE — output storage (top-level)
+output_storage: # Optional: write output to Azure Blob Storage
+  type: blob
+  container_name: my-output
+  connection_string: ${AZURE_STORAGE_CONNECTION_STRING}
+  # base_dir: experiments/run1
+
+# AutoE — input storage (top-level, for reading answers/assertions)
+input_storage: # Optional: read input from Azure Blob Storage
+  type: blob
+  container_name: my-datasets
+  account_url: https://<account>.blob.core.windows.net
+```
+
+#### StorageConfig Fields
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `type` | `str` | `"file"` | Storage backend: `file` (local), `blob` (Azure Blob Storage), `cosmosdb` (Azure Cosmos DB) |
+| `container_name` | `str \| null` | `null` | Azure Blob container or Cosmos DB container name |
+| `connection_string` | `str \| null` | `null` | Connection string for Azure (auth option 1) |
+| `account_url` | `str \| null` | `null` | Account URL for Azure managed identity (auth option 2) |
+| `base_dir` | `str \| null` | `null` | Base directory/prefix within the container |
+| `database_name` | `str \| null` | `null` | Database name (Cosmos DB only) |
+| `encoding` | `str \| null` | `null` | File encoding (file storage only) |
+
### Question Generation Types

All question types share a base config with `num_questions` (default: `50`) and `oversample_factor` (default: `2.0`). Type-specific fields are listed below.
@@ -143,6 +185,7 @@ assertions: max_concurrent_questions: 2 concurrent_requests: 8 # Top-level concurrency for autoq pipeline +output_storage: null # Optional StorageConfig for writing output to cloud storage ``` ## autoe Pairwise Configuration (`PairwiseConfig`) @@ -404,3 +447,9 @@ custom_providers: - Match the assessor type between `generate-retrieval-reference` and `retrieval-scores` to share the cache - `relevance_threshold: 2` on a 0–3 scale is a reasonable default — lower values include marginal matches - Use `cache_dir` for iterative development to avoid redundant LLM calls across runs + +### Storage Configuration +- Use `connection_string` with `${AZURE_STORAGE_CONNECTION_STRING}` for development; use `account_url` with managed identity for production +- `base_dir` is optional — use it to organize multiple experiments within a single container +- When `storage` is set on `input`, `dataset_path` becomes relative to the container/base_dir, not the local filesystem +- Cosmos DB storage requires `database_name` in addition to `container_name` From 9b3bb35da1062fe95c20da198721522c6054fd35 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Mon, 27 Apr 2026 16:13:49 -0600 Subject: [PATCH 7/8] Formatting --- benchmark_qed/cli/interactive.py | 78 ++++++++++++++---------------- benchmark_qed/cli/yaml_renderer.py | 3 +- 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/benchmark_qed/cli/interactive.py b/benchmark_qed/cli/interactive.py index a9f00e1..2afc5ec 100644 --- a/benchmark_qed/cli/interactive.py +++ b/benchmark_qed/cli/interactive.py @@ -13,6 +13,41 @@ from rich.panel import Panel from rich.table import Table +from benchmark_qed.autod.prompts import summarization +from benchmark_qed.autoe.prompts import assertion as assertion_prompts +from benchmark_qed.autoe.prompts import pairwise as pairwise_prompts +from benchmark_qed.autoe.prompts import reference as reference_prompts +from benchmark_qed.autoq.prompts import data_questions as data_questions_prompts +from benchmark_qed.autoq.prompts.activity_questions import ( + activity_context as activity_context_prompts, +) +from benchmark_qed.autoq.prompts.activity_questions import ( + global_questions as activity_global_prompts, +) +from benchmark_qed.autoq.prompts.activity_questions import ( + local_questions as activity_local_prompts, +) +from benchmark_qed.autoq.prompts.data_questions import ( + assertions as autoq_assertion_prompts, +) +from benchmark_qed.autoq.prompts.data_questions import ( + global_questions as data_global_prompts, +) +from benchmark_qed.autoq.prompts.data_questions import ( + linked_questions as data_linked_prompts, +) +from benchmark_qed.autoq.prompts.data_questions import ( + local_questions as data_local_prompts, +) +from benchmark_qed.cli.scaffold import copy_prompts, ensure_input_folder, write_env_file +from benchmark_qed.cli.yaml_renderer import ( + render_autoe_assertion_yaml, + render_autoe_pairwise_yaml, + render_autoe_reference_yaml, + render_autoq_yaml, + validate_config, +) + app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) @@ -88,11 +123,9 @@ def check_tty() -> None: raise typer.Exit(code=1) -def confirm_overwrite(path: typer.Path | Any) -> None: +def confirm_overwrite(path: Path | str) -> None: """Ask for confirmation before overwriting an existing settings file.""" - from pathlib import Path as _Path - - p = _Path(str(path)) + p = Path(path) if not isinstance(path, Path) else path if p.exists(): typer.confirm( f"{p} already exists. 
Overwrite?", @@ -643,34 +676,6 @@ def build_autoe_assertion_config() -> dict[str, Any]: def _copy_prompts_for_config(config_type: str, prompts_folder: Path) -> None: """Copy the appropriate prompt templates for the given config type.""" - from benchmark_qed.autod.prompts import summarization - from benchmark_qed.autoe.prompts import assertion as assertion_prompts - from benchmark_qed.autoe.prompts import pairwise as pairwise_prompts - from benchmark_qed.autoe.prompts import reference as reference_prompts - from benchmark_qed.autoq.prompts import data_questions as data_questions_prompts - from benchmark_qed.autoq.prompts.activity_questions import ( - activity_context as activity_context_prompts, - ) - from benchmark_qed.autoq.prompts.activity_questions import ( - global_questions as activity_global_prompts, - ) - from benchmark_qed.autoq.prompts.activity_questions import ( - local_questions as activity_local_prompts, - ) - from benchmark_qed.autoq.prompts.data_questions import ( - assertions as autoq_assertion_prompts, - ) - from benchmark_qed.autoq.prompts.data_questions import ( - global_questions as data_global_prompts, - ) - from benchmark_qed.autoq.prompts.data_questions import ( - linked_questions as data_linked_prompts, - ) - from benchmark_qed.autoq.prompts.data_questions import ( - local_questions as data_local_prompts, - ) - from benchmark_qed.cli.scaffold import copy_prompts - match config_type: case "autoq": copy_prompts( @@ -730,15 +735,6 @@ def interactive_init( ], ) -> None: """Interactively create a benchmark-qed configuration.""" - from benchmark_qed.cli.scaffold import ensure_input_folder, write_env_file - from benchmark_qed.cli.yaml_renderer import ( - render_autoe_assertion_yaml, - render_autoe_pairwise_yaml, - render_autoe_reference_yaml, - render_autoq_yaml, - validate_config, - ) - check_tty() rich_print( diff --git a/benchmark_qed/cli/yaml_renderer.py b/benchmark_qed/cli/yaml_renderer.py index 00166fa..fb1f09f 100644 --- a/benchmark_qed/cli/yaml_renderer.py +++ b/benchmark_qed/cli/yaml_renderer.py @@ -8,6 +8,7 @@ from __future__ import annotations +import dataclasses from typing import Any import typer @@ -29,8 +30,6 @@ def _render_llm_section(provider_dict: dict[str, Any], indent: int = 2) -> str: indent: Number of leading spaces for each line. """ - import dataclasses - if dataclasses.is_dataclass(provider_dict) and not isinstance(provider_dict, type): provider_dict = dataclasses.asdict(provider_dict) From 7f404629e17a9c94f6118994bad18a492932d3e0 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Mon, 27 Apr 2026 16:53:17 -0600 Subject: [PATCH 8/8] Address comments --- .apm/skills/benchmark-qed-setup/SKILL.md | 2 +- benchmark_qed/cli/interactive.py | 6 +----- benchmark_qed/cli/yaml_renderer.py | 15 +++++++++++---- tests/test_interactive_init.py | 12 +++++++++--- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/.apm/skills/benchmark-qed-setup/SKILL.md b/.apm/skills/benchmark-qed-setup/SKILL.md index f6f76f6..b1b8443 100644 --- a/.apm/skills/benchmark-qed-setup/SKILL.md +++ b/.apm/skills/benchmark-qed-setup/SKILL.md @@ -211,4 +211,4 @@ Key highlights: - The `.env` file must be in the workspace root directory, not the project root. - Config types `autoe_pairwise`, `autoe_reference`, and `autoe_assertion` generate different settings.yaml templates — use the correct type for your evaluation method. - Prompts are copied as `.txt` files using Python `string.Template` syntax (`$variable` or `${variable}`). 
-- **`prompts_config` vs `prompt_config`**: The non-interactive `config init` for some autoe types generates `prompts_config`, but the runtime expects `prompt_config`. The interactive `benchmark-qed init` wizard avoids this issue. If using `config init`, rename the key if you get validation errors. +- **`prompt_config` key**: The runtime expects `prompt_config` (singular) for all autoe config types. Both `benchmark-qed init` and `config init` now generate the correct key. If you hand-edit YAML, ensure you use `prompt_config`, not `prompts_config`. diff --git a/benchmark_qed/cli/interactive.py b/benchmark_qed/cli/interactive.py index 2afc5ec..fb8f254 100644 --- a/benchmark_qed/cli/interactive.py +++ b/benchmark_qed/cli/interactive.py @@ -48,9 +48,6 @@ validate_config, ) -app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) - - # --------------------------------------------------------------------------- # Data types # --------------------------------------------------------------------------- @@ -727,7 +724,6 @@ def _copy_prompts_for_config(config_type: str, prompts_folder: Path) -> None: # --------------------------------------------------------------------------- -@app.command() def interactive_init( root: Annotated[ Path, @@ -765,7 +761,7 @@ def interactive_init( } yaml_content = renderers[config_type](config_dict) - # 4. Validate against Pydantic model + # 4. Validate rendered YAML structure and basic config fields validate_config(yaml_content, config_type) # 5. Write files diff --git a/benchmark_qed/cli/yaml_renderer.py b/benchmark_qed/cli/yaml_renderer.py index fb1f09f..4b75cbc 100644 --- a/benchmark_qed/cli/yaml_renderer.py +++ b/benchmark_qed/cli/yaml_renderer.py @@ -53,7 +53,11 @@ def _render_llm_section(provider_dict: dict[str, Any], indent: int = 2) -> str: if init_args: lines.append(f"{pad}init_args:") for key, value in init_args.items(): - lines.append(f"{pad} {key}: {value}") + # Quote string values to prevent YAML coercion (e.g., api_version dates) + if isinstance(value, str): + lines.append(f'{pad} {key}: "{value}"') + else: + lines.append(f"{pad} {key}: {value}") return "\n".join(lines) @@ -310,7 +314,7 @@ def render_autoe_pairwise_yaml(config: dict[str, Any]) -> str: llm_config: {llm_section} -prompts_config: +prompt_config: user_prompt: prompt: prompts/pairwise_user_prompt.txt system_prompt: @@ -362,7 +366,7 @@ def render_autoe_reference_yaml(config: dict[str, Any]) -> str: llm_config: {llm_section} -prompts_config: +prompt_config: user_prompt: prompt: prompts/reference_user_prompt.txt system_prompt: @@ -406,7 +410,7 @@ def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: llm_config: {llm_section} -prompts_config: +prompt_config: user_prompt: prompt: prompts/assertion_user_prompt.txt system_prompt: @@ -438,6 +442,7 @@ def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: "question_sets", "trials", "llm_config", + "prompt_config", ], "autoe_reference": [ "reference", @@ -446,6 +451,7 @@ def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: "score_max", "trials", "llm_config", + "prompt_config", ], "autoe_assertion": [ "generated", @@ -453,6 +459,7 @@ def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: "pass_threshold", "trials", "llm_config", + "prompt_config", ], } diff --git a/tests/test_interactive_init.py b/tests/test_interactive_init.py index f515d4a..dc37f10 100644 --- a/tests/test_interactive_init.py +++ b/tests/test_interactive_init.py @@ -232,6 +232,8 @@ def test_render_autoe_pairwise_yaml(self): assert 
"question_sets" in parsed assert parsed["trials"] == 4 assert "llm_config" in parsed + assert "prompt_config" in parsed + assert isinstance(parsed["prompt_config"], dict) def test_render_autoe_pairwise_yaml_with_criteria(self): """Pairwise YAML with custom criteria includes them.""" @@ -262,6 +264,8 @@ def test_render_autoe_reference_yaml(self): assert parsed["score_min"] == 1 assert parsed["score_max"] == 10 assert "llm_config" in parsed + assert "prompt_config" in parsed + assert isinstance(parsed["prompt_config"], dict) def test_render_autoe_reference_yaml_multiple_generated(self): """Reference YAML with multiple generated conditions.""" @@ -282,9 +286,11 @@ def test_render_autoe_assertion_yaml(self): assert parsed is not None assert "generated" in parsed assert "assertions" in parsed - assert parsed["pass_threshold"] == 0.5 + assert parsed["pass_threshold"] == pytest.approx(0.5) assert parsed["trials"] == 4 assert "llm_config" in parsed + assert "prompt_config" in parsed + assert isinstance(parsed["prompt_config"], dict) # ═══════════════════════════════════════════════════════════════════════════ @@ -308,8 +314,8 @@ def test_azure_provider_includes_init_args(self): provider = _azure_chat_provider() section = _render_llm_section(provider) assert "init_args:" in section - assert "azure_endpoint: https://example.openai.azure.com" in section - assert "api_version: 2024-12-01-preview" in section + assert 'azure_endpoint: "https://example.openai.azure.com"' in section + assert 'api_version: "2024-12-01-preview"' in section def test_managed_identity_omits_api_key(self): """azure_managed_identity auth type does NOT include api_key line."""