diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index de48078b7..c68613e62 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -35,8 +35,9 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.intelligent_routing import classify_and_route from benchmarks.utils.litellm_proxy import build_eval_llm -from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -385,7 +386,24 @@ def evaluate_instance( if is_acp_agent(self.metadata.agent_type): agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model) else: - agent_llm = build_eval_llm(self.metadata.llm) + primary_llm = self.metadata.llm + if self.metadata.routing is not None: + decision = classify_and_route( + benchmark="commit0", + instance_data=instance.data, + router=self.metadata.routing, + ) + logger.info( + "intelligent-routing instance=%s category=%s model=%s " + "vision_fallback=%s raw=%r", + instance.id, + decision.category, + decision.chosen_model_id, + decision.forced_vision_fallback, + decision.raw_classifier_output[:120], + ) + primary_llm = decision.chosen_llm + agent_llm = build_eval_llm(primary_llm) tools = get_tools_for_preset( self.metadata.tool_preset, enable_browser=False ) @@ -394,7 +412,7 @@ def evaluate_instance( condenser = None if self.metadata.enable_condenser: condenser = LLMSummarizingCondenser( - llm=build_eval_llm(self.metadata.llm, usage_id="condenser"), + llm=build_eval_llm(primary_llm, usage_id="condenser"), max_size=self.metadata.condenser_max_size, keep_first=self.metadata.condenser_keep_first, ) @@ -638,6 +656,14 @@ def main() -> None: raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}") llm = load_llm_config(args.llm_config_path) + routing_spec = 
maybe_load_router_spec(args.llm_config_path) + if routing_spec is not None: + logger.info( + "Using intelligent routing: classifier=%s tiers=%s fallback=%s", + routing_spec.classifier_llm.model, + sorted(routing_spec.tiers.keys()), + routing_spec.fallback_model_id, + ) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -658,6 +684,7 @@ def main() -> None: metadata = EvalMetadata( llm=llm, + routing=routing_spec, dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index bb967d27a..e1ecbffea 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -37,8 +37,9 @@ ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.intelligent_routing import classify_and_route from benchmarks.utils.litellm_proxy import build_eval_llm -from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from benchmarks.utils.tool_presets import get_tools_for_preset from benchmarks.utils.version import IMAGE_TAG_PREFIX @@ -325,7 +326,24 @@ def evaluate_instance( if is_acp_agent(self.metadata.agent_type): agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model) else: - agent_llm = build_eval_llm(self.metadata.llm) + primary_llm = self.metadata.llm + if self.metadata.routing is not None: + decision = classify_and_route( + benchmark="gaia", + instance_data=instance.data, + router=self.metadata.routing, + ) + logger.info( + "intelligent-routing instance=%s category=%s model=%s " + "vision_fallback=%s raw=%r", + instance.id, + decision.category, + decision.chosen_model_id, + decision.forced_vision_fallback, + 
decision.raw_classifier_output[:120], + ) + primary_llm = decision.chosen_llm + agent_llm = build_eval_llm(primary_llm) tools = get_tools_for_preset(self.metadata.tool_preset, enable_browser=True) if self.metadata.enable_delegation: tools.append(Tool(name=TaskToolSet.name)) @@ -334,7 +352,7 @@ def evaluate_instance( condenser = None if self.metadata.enable_condenser: condenser = LLMSummarizingCondenser( - llm=build_eval_llm(self.metadata.llm, usage_id="condenser"), + llm=build_eval_llm(primary_llm, usage_id="condenser"), max_size=self.metadata.condenser_max_size, keep_first=self.metadata.condenser_keep_first, ) @@ -618,6 +636,14 @@ def main() -> None: raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}") llm = load_llm_config(args.llm_config_path) + routing_spec = maybe_load_router_spec(args.llm_config_path) + if routing_spec is not None: + logger.info( + "Using intelligent routing: classifier=%s tiers=%s fallback=%s", + routing_spec.classifier_llm.model, + sorted(routing_spec.tiers.keys()), + routing_spec.fallback_model_id, + ) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct dataset description @@ -635,6 +661,7 @@ def main() -> None: # Create metadata metadata = EvalMetadata( llm=llm, + routing=routing_spec, dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 33f802373..3dd4392f7 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -35,8 +35,9 @@ ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.intelligent_routing import classify_and_route from benchmarks.utils.litellm_proxy import build_eval_llm -from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec from 
benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -262,7 +263,24 @@ def evaluate_instance( if is_acp_agent(self.metadata.agent_type): agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model) else: - agent_llm = build_eval_llm(self.metadata.llm) + primary_llm = self.metadata.llm + if self.metadata.routing is not None: + decision = classify_and_route( + benchmark="swebench", + instance_data=instance.data, + router=self.metadata.routing, + ) + logger.info( + "intelligent-routing instance=%s category=%s model=%s " + "vision_fallback=%s raw=%r", + instance.id, + decision.category, + decision.chosen_model_id, + decision.forced_vision_fallback, + decision.raw_classifier_output[:120], + ) + primary_llm = decision.chosen_llm + agent_llm = build_eval_llm(primary_llm) tools = get_tools_for_preset( preset=self.metadata.tool_preset, # Disable browser tools in CLI mode @@ -273,7 +291,7 @@ def evaluate_instance( condenser = None if self.metadata.enable_condenser: condenser = LLMSummarizingCondenser( - llm=build_eval_llm(self.metadata.llm, usage_id="condenser"), + llm=build_eval_llm(primary_llm, usage_id="condenser"), max_size=self.metadata.condenser_max_size, keep_first=self.metadata.condenser_keep_first, ) @@ -395,7 +413,16 @@ def main() -> None: raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}") llm = load_llm_config(args.llm_config_path) - logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) + routing_spec = maybe_load_router_spec(args.llm_config_path) + if routing_spec is not None: + logger.info( + "Using intelligent routing: classifier=%s tiers=%s fallback=%s", + routing_spec.classifier_llm.model, + sorted(routing_spec.tiers.keys()), + routing_spec.fallback_model_id, + ) + else: + logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( args.dataset.replace("/", "__") + "-" + args.split.replace("/", "__") @@ -422,6 +449,7 @@ def main() -> None: metadata = EvalMetadata( 
llm=llm, + routing=routing_spec, dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 5f61b6d13..6ec06f2ee 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -33,8 +33,9 @@ ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.intelligent_routing import classify_and_route from benchmarks.utils.litellm_proxy import build_eval_llm -from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -242,7 +243,24 @@ def evaluate_instance( if is_acp_agent(self.metadata.agent_type): agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model) else: - agent_llm = build_eval_llm(self.metadata.llm) + primary_llm = self.metadata.llm + if self.metadata.routing is not None: + decision = classify_and_route( + benchmark="swebenchmultimodal", + instance_data=instance.data, + router=self.metadata.routing, + ) + logger.info( + "intelligent-routing instance=%s category=%s model=%s " + "vision_fallback=%s raw=%r", + instance.id, + decision.category, + decision.chosen_model_id, + decision.forced_vision_fallback, + decision.raw_classifier_output[:120], + ) + primary_llm = decision.chosen_llm + agent_llm = build_eval_llm(primary_llm) tools = get_tools_for_preset( self.metadata.tool_preset, # Enable browser tools for frontend development tasks @@ -253,7 +271,7 @@ def evaluate_instance( condenser = None if self.metadata.enable_condenser: condenser = LLMSummarizingCondenser( - llm=build_eval_llm(self.metadata.llm, usage_id="condenser"), + llm=build_eval_llm(primary_llm, usage_id="condenser"), 
max_size=self.metadata.condenser_max_size, keep_first=self.metadata.condenser_keep_first, ) @@ -441,6 +459,14 @@ def main() -> None: raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}") llm = load_llm_config(args.llm_config_path) + routing_spec = maybe_load_router_spec(args.llm_config_path) + if routing_spec is not None: + logger.info( + "Using intelligent routing: classifier=%s tiers=%s fallback=%s", + routing_spec.classifier_llm.model, + sorted(routing_spec.tiers.keys()), + routing_spec.fallback_model_id, + ) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -468,6 +494,7 @@ def main() -> None: metadata = EvalMetadata( llm=llm, + routing=routing_spec, dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index b06555d90..350d0c6ef 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -30,8 +30,9 @@ create_docker_workspace, remote_image_exists, ) +from benchmarks.utils.intelligent_routing import classify_and_route from benchmarks.utils.litellm_proxy import build_eval_llm -from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.llm_config import load_llm_config, maybe_load_router_spec from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -251,7 +252,24 @@ def evaluate_instance( if is_acp_agent(self.metadata.agent_type): agent = build_acp_agent(self.metadata.agent_type, self.metadata.llm.model) else: - agent_llm = build_eval_llm(self.metadata.llm) + primary_llm = self.metadata.llm + if self.metadata.routing is not None: + decision = classify_and_route( + benchmark="swtbench", + instance_data=instance.data, + router=self.metadata.routing, + ) + logger.info( + "intelligent-routing instance=%s category=%s model=%s " + "vision_fallback=%s raw=%r", + instance.id, + decision.category, + decision.chosen_model_id, + 
decision.forced_vision_fallback, + decision.raw_classifier_output[:120], + ) + primary_llm = decision.chosen_llm + agent_llm = build_eval_llm(primary_llm) tools = get_tools_for_preset( self.metadata.tool_preset, # Disable browser tools in CLI mode @@ -262,7 +280,7 @@ def evaluate_instance( condenser = None if self.metadata.enable_condenser: condenser = LLMSummarizingCondenser( - llm=build_eval_llm(self.metadata.llm, usage_id="condenser"), + llm=build_eval_llm(primary_llm, usage_id="condenser"), max_size=self.metadata.condenser_max_size, keep_first=self.metadata.condenser_keep_first, ) @@ -382,6 +400,14 @@ def main() -> None: raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}") llm = load_llm_config(args.llm_config_path) + routing_spec = maybe_load_router_spec(args.llm_config_path) + if routing_spec is not None: + logger.info( + "Using intelligent routing: classifier=%s tiers=%s fallback=%s", + routing_spec.classifier_llm.model, + sorted(routing_spec.tiers.keys()), + routing_spec.fallback_model_id, + ) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -408,6 +434,7 @@ def main() -> None: metadata = EvalMetadata( llm=llm, + routing=routing_spec, dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, diff --git a/benchmarks/utils/intelligent_routing.py b/benchmarks/utils/intelligent_routing.py new file mode 100644 index 000000000..3629897a5 --- /dev/null +++ b/benchmarks/utils/intelligent_routing.py @@ -0,0 +1,453 @@ +"""Intelligent per-instance model routing for benchmarks. + +A *router config* is a JSON file shaped like:: + + { + "kind": "intelligent-router-v0", + "classifier_model_id": "minimax-m2.7", + "fallback_model_id": "gpt-5.5", + "tiers": { + "kimi-k2.6": { ...standard LLM config... }, + "minimax-m2.7": { ...standard LLM config... }, + "gpt-5.5": { ...standard LLM config... 
} + }, + "routing": { + "Frontend": "kimi-k2.6", + "Issue Resolution (other)": "minimax-m2.7", + "Greenfield": "gpt-5.5", + "Testing": "gpt-5.5", + "Information Gathering": "gpt-5.5" + } + } + +When a benchmark sees this shape, it calls the classifier LLM once per +instance against ``CLASSIFIER_PROMPT`` to pick one of the categories above, +then runs the conversation on the matching tier LLM. The decision is logged +per instance for offline analysis. + +This module does **not** depend on any benchmark-specific code; benchmarks +only call ``classify_and_route`` with their own task-text extractor. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Literal + +from pydantic import BaseModel, ConfigDict, Field + +from openhands.sdk import LLM, get_logger +from openhands.sdk.llm import Message, TextContent + + +logger = get_logger(__name__) + + +# --------------------------------------------------------------------------- # +# Classifier prompt (iter5, frozen verbatim from research repo) # +# --------------------------------------------------------------------------- # + +# Source: OpenHands/research/juan/intelligent-model-selection/ +# optimization_200_4cat/classifier_prompt_iter5.txt +CLASSIFIER_PROMPT = """\ +You are a task classification expert. Your job is to classify software development task instructions into exactly one of the following categories: + +**Categories:** + +1. **Greenfield** - Tasks that involve creating new projects, repositories, or applications from scratch. These tasks typically start with nothing and build something new. + +2. **Frontend** - Tasks focused on user interface, visual rendering, styling, or frontend component behavior issues. Key indicators: UI components, forms, visual display, CSS, rendering problems, dynamic UI behavior, frontend frameworks, visual elements, or browser-specific issues. 
Must explicitly mention UI/visual/display/rendering/styling concerns. + +3. **Testing** - Tasks involving problems with test logic, test computations, test utilities, or testing infrastructure. Key indicators: test functions computing wrong values, test methods producing incorrect results, test utilities with bugs, testing framework malfunctions, or phrases describing how tests themselves are broken or compute incorrectly. + +4. **Information Gathering** - Tasks that involve research, questions, or gathering information WITHOUT any implementation, fixing, or changes. Pure information requests only. Key phrases: "Should we...", "Consider whether...", "What if...", "Is it better to..." when asking for opinions rather than reporting bugs. + +5. **Issue Resolution (other)** - All other bug fixes, issue resolution, and debugging tasks including: backend logic bugs, algorithms, data processing, APIs, core functionality, model computations, library functions, data structures, configuration issues, development tool bugs, computational problems, incorrect calculations, wrong outputs, data handling errors, parser issues, linter bugs, and any functional problems. + +**Classification Rules:** + +1. **Bug reports are Issue Resolution by default** - If a task describes something not working correctly, producing wrong results, or behaving incorrectly, classify as Issue Resolution (other) UNLESS it explicitly describes UI/visual problems (Frontend) or test infrastructure problems (Testing). + +2. **Testing**: Only for broken test infrastructure itself. If underlying code produces wrong results and tests detect it, that's Issue Resolution. + +3. **Frontend**: Requires explicit mention of UI, visual, display, rendering, styling, or frontend framework components. Generic "bug" or "error" without UI context is Issue Resolution. + +4. 
**Tool/library bugs are Issue Resolution** - ESLint issues, parser problems, linter bugs, configuration errors, and development tool malfunctions are Issue Resolution (other), not Frontend. + +5. **Questions about design decisions** - Phrases like "Should we", "Consider whether", "What if" asking for opinions are Information Gathering only if no bug is being reported. + +Respond with ONLY the category name, exactly as written above. +""" + +# Canonical category strings the classifier may emit. Order matters for the +# substring matcher below: longer / more specific keys appear first. +CATEGORIES: list[str] = [ + "Issue Resolution (other)", + "Information Gathering", + "Greenfield", + "Frontend", + "Testing", +] + + +# --------------------------------------------------------------------------- # +# Default routing table (the user-specified 3-tier mapping) # +# --------------------------------------------------------------------------- # + +DEFAULT_ROUTING: dict[str, str] = { + "Frontend": "kimi-k2.6", + "Issue Resolution (other)": "minimax-m2.7", + "Greenfield": "gpt-5.5", + "Testing": "gpt-5.5", + "Information Gathering": "gpt-5.5", +} + +# Model IDs known to accept image inputs. Used by ``classify_and_route`` to +# fall back to a vision-capable tier when an instance has images but the +# classified tier is text-only (e.g. swebenchmultimodal frontend instance +# routed to a text-only tier). Extend as new tiers are added. 
+DEFAULT_VISION_CAPABLE: frozenset[str] = frozenset({"kimi-k2.6", "gpt-5.5"}) + + +# --------------------------------------------------------------------------- # +# Per-benchmark task-text extractors # +# --------------------------------------------------------------------------- # + +BenchmarkName = Literal[ + "swebench", + "swebenchmultimodal", + "swtbench", + "gaia", + "commit0", +] + +TaskTextExtractor = Callable[[dict], str] + + +def _swebench_task_text(data: dict) -> str: + return str(data.get("problem_statement", "") or "") + + +def _gaia_task_text(data: dict) -> str: + return str(data.get("Question", "") or "") + + +def _commit0_task_text(data: dict) -> str: + # commit0 instances render the spec into the agent prompt; the spec text + # itself isn't always denormalized onto ``data``. Fall back to + # ``problem_statement`` and then to the repo name so the classifier has + # *something* to work with rather than emitting EMPTY_TASK across the + # whole benchmark. + for key in ("spec", "problem_statement", "instruction", "issue"): + value = data.get(key) + if value: + return str(value) + repo = data.get("repo", "") + return f"Implement the project from its spec: {repo}" if repo else "" + + +BENCHMARK_TASK_EXTRACTORS: dict[BenchmarkName, TaskTextExtractor] = { + "swebench": _swebench_task_text, + "swebenchmultimodal": _swebench_task_text, + "swtbench": _swebench_task_text, + "gaia": _gaia_task_text, + "commit0": _commit0_task_text, +} + + +def _instance_has_images(data: dict) -> bool: + """Best-effort detection of image-bearing instances (swebenchmultimodal).""" + assets = data.get("image_assets") + if not assets: + return False + if isinstance(assets, str): + try: + assets = json.loads(assets) + except (TypeError, ValueError): + return False + if not isinstance(assets, dict): + return False + return bool(assets.get("problem_statement")) + + +# --------------------------------------------------------------------------- # +# Router spec + decision types # +# 
--------------------------------------------------------------------------- # + + +class RouterSpec(BaseModel): + """Parsed intelligent-router-v0 configuration.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + classifier_llm: LLM = Field( + description=( + "LLM used to classify each instance into a category. " + "Typically one of the tier LLMs (e.g. minimax-m2.7)." + ), + ) + tiers: dict[str, LLM] = Field( + description="Map of model_id -> LLM instance for each routing tier.", + ) + routing: dict[str, str] = Field( + default_factory=lambda: dict(DEFAULT_ROUTING), + description="Map of classifier category -> tier model_id.", + ) + fallback_model_id: str = Field( + description=( + "Model ID used when classification is empty, unparseable, or the " + "chosen tier is text-only but the instance has images." + ), + ) + vision_capable_model_ids: set[str] = Field( + default_factory=lambda: set(DEFAULT_VISION_CAPABLE), + description="Model IDs known to accept image inputs.", + ) + + def tier_or_fallback(self, model_id: str) -> tuple[str, LLM]: + """Return ``(model_id, llm)``, falling back if the ID is unknown.""" + if model_id in self.tiers: + return model_id, self.tiers[model_id] + return self.fallback_model_id, self.tiers[self.fallback_model_id] + + +@dataclass(frozen=True) +class RoutingDecision: + """Result of classifying and routing a single instance.""" + + chosen_llm: LLM + chosen_model_id: str + category: str + raw_classifier_output: str + forced_vision_fallback: bool + + +# --------------------------------------------------------------------------- # +# Public API # +# --------------------------------------------------------------------------- # + + +_ROUTER_KIND = "intelligent-router-v0" + + +def is_router_config_payload(payload: dict) -> bool: + """Return True if the given parsed JSON object is a router config.""" + return isinstance(payload, dict) and payload.get("kind") == _ROUTER_KIND + + +def maybe_load_router_spec(config_path: str | Path) -> 
RouterSpec | None: + """Load a router config from ``config_path`` if it matches the v0 shape. + + Returns ``None`` for plain LLM configs (so the caller can fall back to + ``load_llm_config``). Raises ``ValueError`` if the payload claims to be a + router config but is missing required fields. + """ + path = Path(config_path) + if not path.is_file(): + raise ValueError(f"LLM config file {path} does not exist") + + text = path.read_text(encoding="utf-8") + try: + payload = json.loads(text) + except json.JSONDecodeError: + # Not JSON or not a router payload; let the plain loader produce its + # own error. + return None + + if not is_router_config_payload(payload): + return None + + return _build_router_spec(payload) + + +def _build_router_spec(payload: dict) -> RouterSpec: + tiers_raw = payload.get("tiers") + if not isinstance(tiers_raw, dict) or not tiers_raw: + raise ValueError("router config: 'tiers' must be a non-empty object") + + classifier_model_id = payload.get("classifier_model_id") + if not classifier_model_id or classifier_model_id not in tiers_raw: + raise ValueError( + "router config: 'classifier_model_id' must reference a key in 'tiers'" + ) + + fallback_model_id = payload.get("fallback_model_id") or classifier_model_id + if fallback_model_id not in tiers_raw: + raise ValueError( + "router config: 'fallback_model_id' must reference a key in 'tiers'" + ) + + tiers: dict[str, LLM] = {} + for model_id, llm_cfg in tiers_raw.items(): + if not isinstance(llm_cfg, dict): + raise ValueError(f"router config: tier '{model_id}' must be an object") + # Attach a stable usage_id so per-tier metrics are distinguishable in + # traces and per-instance cost reports. 
+ cfg = {**llm_cfg, "usage_id": llm_cfg.get("usage_id") or f"agent:{model_id}"} + tiers[model_id] = LLM.model_validate(cfg) + + classifier_cfg = { + **tiers_raw[classifier_model_id], + "usage_id": "router:classifier", + } + classifier_llm = LLM.model_validate(classifier_cfg) + + routing = payload.get("routing") or DEFAULT_ROUTING + if not isinstance(routing, dict) or not routing: + raise ValueError("router config: 'routing' must be a non-empty object") + unknown_targets = {v for v in routing.values() if v not in tiers} + if unknown_targets: + raise ValueError( + f"router config: routing targets not present in 'tiers': {unknown_targets}" + ) + + vision = payload.get("vision_capable_model_ids") + vision_set: set[str] = ( + set(vision) if isinstance(vision, list) else set(DEFAULT_VISION_CAPABLE) + ) + + return RouterSpec( + classifier_llm=classifier_llm, + tiers=tiers, + routing=routing, + fallback_model_id=fallback_model_id, + vision_capable_model_ids=vision_set, + ) + + +def parse_classifier_output(raw: str, routing: dict[str, str]) -> str | None: + """Map a raw classifier response to a routing category, or ``None``. + + Substring-matches against the canonical category strings in priority order + (most specific first). Case-insensitive. Tolerates models that wrap their + answer in markdown or commentary. + """ + if not raw: + return None + lowered = raw.lower() + for category in CATEGORIES: + if category.lower() in lowered: + return category if category in routing else None + # Loose fallback: bare keywords without the "(other)" suffix. 
+ for keyword, canonical in ( + ("issue resolution", "Issue Resolution (other)"), + ("information gathering", "Information Gathering"), + ("greenfield", "Greenfield"), + ("frontend", "Frontend"), + ("testing", "Testing"), + ): + if keyword in lowered and canonical in routing: + return canonical + return None + + +def classify_and_route( + benchmark: BenchmarkName | str, + instance_data: dict, + router: RouterSpec, +) -> RoutingDecision: + """Classify a single instance and pick the matching tier LLM. + + Falls back to ``router.fallback_model_id`` when: + + * the task text is empty, + * the classifier output cannot be parsed into a known category, + * the instance carries images but the chosen tier isn't vision-capable. + + All cost-relevant LLM construction happens here (via the LLM objects + stored on the router); per-instance virtual-key injection should still be + applied by the caller via :func:`benchmarks.utils.litellm_proxy.build_eval_llm`. + """ + extractor = BENCHMARK_TASK_EXTRACTORS.get(benchmark) # type: ignore[arg-type] + if extractor is None: + chosen_id, chosen_llm = router.tier_or_fallback(router.fallback_model_id) + return RoutingDecision( + chosen_llm=chosen_llm, + chosen_model_id=chosen_id, + category="NO_EXTRACTOR", + raw_classifier_output="", + forced_vision_fallback=False, + ) + + task_text = extractor(instance_data).strip() + if not task_text: + chosen_id, chosen_llm = router.tier_or_fallback(router.fallback_model_id) + return RoutingDecision( + chosen_llm=chosen_llm, + chosen_model_id=chosen_id, + category="EMPTY_TASK", + raw_classifier_output="", + forced_vision_fallback=False, + ) + + raw = _run_classifier(router.classifier_llm, task_text) + category = parse_classifier_output(raw, router.routing) + + if category is None: + chosen_id, chosen_llm = router.tier_or_fallback(router.fallback_model_id) + return RoutingDecision( + chosen_llm=chosen_llm, + chosen_model_id=chosen_id, + category="UNPARSED", + raw_classifier_output=raw, + 
forced_vision_fallback=False, + ) + + target_id = router.routing[category] + chosen_id, chosen_llm = router.tier_or_fallback(target_id) + + forced = False + if ( + _instance_has_images(instance_data) + and chosen_id not in router.vision_capable_model_ids + ): + forced = True + chosen_id, chosen_llm = router.tier_or_fallback(router.fallback_model_id) + if chosen_id not in router.vision_capable_model_ids: + # Fallback lacks vision too: take any vision tier (set order is arbitrary). + for candidate in router.vision_capable_model_ids: + if candidate in router.tiers: + chosen_id = candidate + chosen_llm = router.tiers[candidate] + break + + return RoutingDecision( + chosen_llm=chosen_llm, + chosen_model_id=chosen_id, + category=category, + raw_classifier_output=raw, + forced_vision_fallback=forced, + ) + + +def _run_classifier(classifier_llm: LLM, task_text: str) -> str: + """Run the classifier prompt against a single task; return raw text.""" + try: + response = classifier_llm.completion( + messages=[ + Message(role="system", content=[TextContent(text=CLASSIFIER_PROMPT)]), + Message(role="user", content=[TextContent(text=task_text)]), + ] + ) + except Exception as exc: # noqa: BLE001 — best-effort classification + logger.warning("Classifier call failed: %s", exc, exc_info=True) + return "" + + message = getattr(response, "message", None) + if message is None: + return "" + content = getattr(message, "content", None) + if isinstance(content, str): + return content.strip() + if isinstance(content, list): + parts: list[str] = [] + for item in content: + text = getattr(item, "text", None) + if text is None and isinstance(item, dict): + text = item.get("text") + if text: + parts.append(str(text)) + return "".join(parts).strip() + return "" diff --git a/benchmarks/utils/llm_config.py b/benchmarks/utils/llm_config.py index ca89ea1c3..eb510ea12 100644 --- a/benchmarks/utils/llm_config.py +++ b/benchmarks/utils/llm_config.py @@ -1,16 +1,48 @@ from __future__ import annotations +import
json from pathlib import Path +from benchmarks.utils.intelligent_routing import ( + RouterSpec, + is_router_config_payload, + maybe_load_router_spec, +) from openhands.sdk import LLM def load_llm_config(config_path: str | Path) -> LLM: + """Load an SDK :class:`LLM` from a JSON config file. + + For backwards compatibility, this function also accepts an intelligent + router config (``kind: intelligent-router-v0``); in that case it returns + the classifier LLM, which downstream code uses as the "primary" LLM (e.g. + for ACP agents). Routed runs build the agent and condenser LLMs from the + per-instance tier instead. Use :func:`maybe_load_router_spec` to + additionally retrieve the routing configuration. + """ config_path = Path(config_path) if not config_path.is_file(): raise ValueError(f"LLM config file {config_path} does not exist") - with config_path.open("r", encoding="utf-8") as f: - llm_config = f.read() + text = config_path.read_text(encoding="utf-8") + + # Parse once: plain LLM configs (the overwhelmingly common case) are + # validated straight from the parsed payload; only payloads carrying the + # router discriminator are re-read via ``maybe_load_router_spec``. + try: + payload = json.loads(text) + except json.JSONDecodeError: + # Let pydantic produce its existing ValidationError on malformed JSON. + return LLM.model_validate_json(text) + + if is_router_config_payload(payload): + spec: RouterSpec = maybe_load_router_spec(config_path) # type: ignore[assignment] + # Surface the classifier LLM as the "primary" LLM so callers that + # only need an LLM (e.g. ACP wiring) continue to work.
+ return spec.classifier_llm + + return LLM.model_validate(payload) + - return LLM.model_validate_json(llm_config) +__all__ = ["load_llm_config", "maybe_load_router_spec"] diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index 9dd471383..ac039081d 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, Field, model_validator +from benchmarks.utils.intelligent_routing import RouterSpec from benchmarks.utils.laminar import LaminarEvalMetadata from openhands.sdk import LLM, Event, get_logger from openhands.sdk.critic import CriticBase @@ -19,6 +20,16 @@ class EvalMetadata(BaseModel): llm: LLM + routing: RouterSpec | None = Field( + default=None, + description=( + "Optional intelligent-routing spec. When set and agent_type is " + "'default', the evaluator classifies each instance via the " + "router's classifier LLM and routes the agent conversation to " + "the matching tier LLM. 'llm' is still used as the fallback " + "(e.g. for ACP agents, condensers, or when classification fails)." + ), + ) dataset: str dataset_split: str = Field(default="test") max_iterations: int diff --git a/benchmarks/utils/sample_configs/intelligent_router_3tier.example.json b/benchmarks/utils/sample_configs/intelligent_router_3tier.example.json new file mode 100644 index 000000000..3ef30c1d3 --- /dev/null +++ b/benchmarks/utils/sample_configs/intelligent_router_3tier.example.json @@ -0,0 +1,44 @@ +{ + "_comment": [ + "Sample intelligent-router-v0 config for testing per-instance model routing.", + "Pass to any benchmark's run_infer.py via --llm-config-path.", + "Classifier = minimax-m2.7. Tier mapping = Frontend -> kimi, Issue Resolution", + "(other) -> minimax, all other categories -> gpt-5.5. 
Vision-capable tiers", + "are kimi-k2.6 and gpt-5.5; image-bearing instances classified into the", + "minimax tier are auto-rerouted to the vision-capable fallback (gpt-5.5).", + "Fill in api_key/base_url before use; these are intentionally blank." + ], + "kind": "intelligent-router-v0", + "classifier_model_id": "minimax-m2.7", + "fallback_model_id": "gpt-5.5", + "tiers": { + "kimi-k2.6": { + "model": "litellm_proxy/moonshot/kimi-k2.6", + "base_url": "", + "api_key": "", + "temperature": 1.0, + "inline_image_urls": true + }, + "minimax-m2.7": { + "model": "litellm_proxy/minimax/MiniMax-M2.7", + "base_url": "", + "api_key": "", + "temperature": 1.0, + "top_p": 0.95 + }, + "gpt-5.5": { + "model": "litellm_proxy/openai/gpt-5.5", + "base_url": "", + "api_key": "", + "reasoning_effort": "high" + } + }, + "routing": { + "Frontend": "kimi-k2.6", + "Issue Resolution (other)": "minimax-m2.7", + "Greenfield": "gpt-5.5", + "Testing": "gpt-5.5", + "Information Gathering": "gpt-5.5" + }, + "vision_capable_model_ids": ["kimi-k2.6", "gpt-5.5"] +} diff --git a/tests/test_intelligent_routing.py b/tests/test_intelligent_routing.py new file mode 100644 index 000000000..762149599 --- /dev/null +++ b/tests/test_intelligent_routing.py @@ -0,0 +1,474 @@ +"""Tests for benchmarks.utils.intelligent_routing. + +These tests cover the routing helpers directly without making real LLM +calls. The classifier is exercised end-to-end with a tiny in-memory LLM +stub so that ``classify_and_route`` exercises real parsing/dispatch code +paths rather than a mock of itself. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest + +from benchmarks.utils.intelligent_routing import ( + BENCHMARK_TASK_EXTRACTORS, + DEFAULT_ROUTING, + RouterSpec, + RoutingDecision, + classify_and_route, + is_router_config_payload, + maybe_load_router_spec, + parse_classifier_output, +) +from openhands.sdk import LLM + + +# --------------------------------------------------------------------------- # +# Helpers # +# --------------------------------------------------------------------------- # + + +def _llm(model: str, **extra: Any) -> LLM: + return LLM.model_validate({"model": model, **extra}) + + +def _router_spec(**overrides: Any) -> RouterSpec: + tiers = { + "kimi-k2.6": _llm("litellm_proxy/moonshot/kimi-k2.6"), + "minimax-m2.7": _llm("litellm_proxy/minimax/MiniMax-M2.7"), + "gpt-5.5": _llm("litellm_proxy/openai/gpt-5.5"), + } + spec = RouterSpec( + classifier_llm=tiers["minimax-m2.7"], + tiers=tiers, + routing=dict(DEFAULT_ROUTING), + fallback_model_id="gpt-5.5", + vision_capable_model_ids={"kimi-k2.6", "gpt-5.5"}, + ) + if overrides: + spec = spec.model_copy(update=overrides) + return spec + + +class _StubClassifier: + """Captures classifier calls and returns a predetermined response.""" + + def __init__(self, response: str) -> None: + self.response = response + self.calls: list[list[Any]] = [] + + def completion(self, messages: list[Any], **_: Any) -> Any: # noqa: ANN401 + self.calls.append(messages) + + class _Part: + def __init__(self, text: str) -> None: + self.text = text + + class _Msg: + def __init__(self, text: str) -> None: + self.content = [_Part(text)] + + class _Resp: + def __init__(self, text: str) -> None: + self.message = _Msg(text) + + return _Resp(self.response) + + +def _attach_stub(spec: RouterSpec, response: str) -> _StubClassifier: + stub = _StubClassifier(response) + # Pydantic models are frozen-by-validation, but ``classifier_llm`` is a + # plain attribute; 
mutate it directly for the test stub. + object.__setattr__(spec, "classifier_llm", stub) + return stub + + +# --------------------------------------------------------------------------- # +# Output parsing # +# --------------------------------------------------------------------------- # + + +class TestParseClassifierOutput: + def test_exact_match(self) -> None: + assert parse_classifier_output("Frontend", DEFAULT_ROUTING) == "Frontend" + + def test_exact_match_issue_resolution_other(self) -> None: + assert ( + parse_classifier_output("Issue Resolution (other)", DEFAULT_ROUTING) + == "Issue Resolution (other)" + ) + + def test_case_insensitive(self) -> None: + assert parse_classifier_output("FRONTEND", DEFAULT_ROUTING) == "Frontend" + + def test_chatty_response_returns_category(self) -> None: + raw = "**Frontend** — this clearly involves CSS rendering." + assert parse_classifier_output(raw, DEFAULT_ROUTING) == "Frontend" + + def test_bare_keyword_resolves_to_canonical_with_suffix(self) -> None: + # Model emitted the bare keyword without the "(other)" suffix. + assert ( + parse_classifier_output("Issue Resolution", DEFAULT_ROUTING) + == "Issue Resolution (other)" + ) + + def test_unknown_category_returns_none(self) -> None: + assert parse_classifier_output("Backend Refactor", DEFAULT_ROUTING) is None + + def test_empty_string_returns_none(self) -> None: + assert parse_classifier_output("", DEFAULT_ROUTING) is None + + def test_most_specific_match_wins(self) -> None: + # "Information Gathering" must beat "Greenfield" if both appear, + # because they are checked in priority order and "Information + # Gathering" appears earlier in CATEGORIES. + raw = "Information Gathering. 
(Could be Greenfield but no.)" + assert parse_classifier_output(raw, DEFAULT_ROUTING) == "Information Gathering" + + +# --------------------------------------------------------------------------- # +# Router config loading # +# --------------------------------------------------------------------------- # + + +def _write_router_config(tmp_path: Path, payload: dict[str, Any]) -> Path: + path = tmp_path / "router.json" + path.write_text(json.dumps(payload)) + return path + + +def _minimal_router_payload() -> dict[str, Any]: + return { + "kind": "intelligent-router-v0", + "classifier_model_id": "minimax-m2.7", + "fallback_model_id": "gpt-5.5", + "tiers": { + "kimi-k2.6": {"model": "litellm_proxy/moonshot/kimi-k2.6"}, + "minimax-m2.7": {"model": "litellm_proxy/minimax/MiniMax-M2.7"}, + "gpt-5.5": {"model": "litellm_proxy/openai/gpt-5.5"}, + }, + "routing": dict(DEFAULT_ROUTING), + } + + +class TestRouterConfigLoading: + def test_is_router_config_payload_positive(self) -> None: + assert is_router_config_payload({"kind": "intelligent-router-v0"}) + + def test_is_router_config_payload_negative(self) -> None: + assert not is_router_config_payload({"model": "gpt-4o"}) + assert not is_router_config_payload({}) + assert not is_router_config_payload({"kind": "something-else"}) + + def test_load_minimal_config(self, tmp_path: Path) -> None: + path = _write_router_config(tmp_path, _minimal_router_payload()) + + spec = maybe_load_router_spec(path) + + assert spec is not None + assert set(spec.tiers) == {"kimi-k2.6", "minimax-m2.7", "gpt-5.5"} + assert spec.classifier_llm.model == "litellm_proxy/minimax/MiniMax-M2.7" + assert spec.fallback_model_id == "gpt-5.5" + assert spec.routing == DEFAULT_ROUTING + + def test_load_assigns_per_tier_usage_id(self, tmp_path: Path) -> None: + path = _write_router_config(tmp_path, _minimal_router_payload()) + + spec = maybe_load_router_spec(path) + + assert spec is not None + assert spec.tiers["kimi-k2.6"].usage_id == "agent:kimi-k2.6" + assert 
spec.tiers["gpt-5.5"].usage_id == "agent:gpt-5.5" + assert spec.classifier_llm.usage_id == "router:classifier" + + def test_plain_llm_config_returns_none(self, tmp_path: Path) -> None: + path = tmp_path / "plain.json" + path.write_text(json.dumps({"model": "gpt-4o"})) + + assert maybe_load_router_spec(path) is None + + def test_malformed_json_returns_none(self, tmp_path: Path) -> None: + path = tmp_path / "bad.json" + path.write_text("{not json") + + # Delegated to the plain loader, which raises its own ValidationError. + assert maybe_load_router_spec(path) is None + + def test_missing_file_raises(self, tmp_path: Path) -> None: + with pytest.raises(ValueError, match="does not exist"): + maybe_load_router_spec(tmp_path / "nope.json") + + def test_missing_classifier_model_id_raises(self, tmp_path: Path) -> None: + payload = _minimal_router_payload() + payload["classifier_model_id"] = "not-in-tiers" + path = _write_router_config(tmp_path, payload) + + with pytest.raises(ValueError, match="classifier_model_id"): + maybe_load_router_spec(path) + + def test_routing_target_not_in_tiers_raises(self, tmp_path: Path) -> None: + payload = _minimal_router_payload() + payload["routing"]["Frontend"] = "phantom-model" + path = _write_router_config(tmp_path, payload) + + with pytest.raises(ValueError, match="routing targets"): + maybe_load_router_spec(path) + + def test_empty_tiers_raises(self, tmp_path: Path) -> None: + payload = _minimal_router_payload() + payload["tiers"] = {} + path = _write_router_config(tmp_path, payload) + + with pytest.raises(ValueError, match="tiers"): + maybe_load_router_spec(path) + + +# --------------------------------------------------------------------------- # +# load_llm_config integration # +# --------------------------------------------------------------------------- # + + +class TestLoadLLMConfigWithRouter: + def test_router_config_returns_classifier_llm(self, tmp_path: Path) -> None: + from benchmarks.utils.llm_config import load_llm_config + + 
path = _write_router_config(tmp_path, _minimal_router_payload()) + llm = load_llm_config(path) + # Should be the classifier (minimax) LLM since the primary LLM slot + # is filled with the classifier for ACP/condenser fallback paths. + assert llm.model == "litellm_proxy/minimax/MiniMax-M2.7" + + def test_plain_config_still_works(self, tmp_path: Path) -> None: + from benchmarks.utils.llm_config import load_llm_config + + path = tmp_path / "plain.json" + path.write_text(json.dumps({"model": "gpt-4o"})) + llm = load_llm_config(path) + assert llm.model == "gpt-4o" + + +# --------------------------------------------------------------------------- # +# Per-benchmark task-text extraction # +# --------------------------------------------------------------------------- # + + +class TestTaskTextExtractors: + def test_swebench_extractor(self) -> None: + text = BENCHMARK_TASK_EXTRACTORS["swebench"]( + {"problem_statement": "Fix the bug"} + ) + assert text == "Fix the bug" + + def test_swebench_missing_returns_empty(self) -> None: + assert BENCHMARK_TASK_EXTRACTORS["swebench"]({}) == "" + + def test_gaia_extractor(self) -> None: + text = BENCHMARK_TASK_EXTRACTORS["gaia"]({"Question": "What is X?"}) + assert text == "What is X?" 
+ + def test_commit0_prefers_spec(self) -> None: + text = BENCHMARK_TASK_EXTRACTORS["commit0"]( + {"spec": "Build a calculator", "repo": "example/calc"} + ) + assert text == "Build a calculator" + + def test_commit0_falls_back_to_repo(self) -> None: + text = BENCHMARK_TASK_EXTRACTORS["commit0"]({"repo": "foo/bar"}) + assert "foo/bar" in text + + def test_swebenchmultimodal_uses_same_extractor_as_swebench(self) -> None: + assert ( + BENCHMARK_TASK_EXTRACTORS["swebenchmultimodal"] + is BENCHMARK_TASK_EXTRACTORS["swebench"] + ) + + def test_swtbench_uses_same_extractor_as_swebench(self) -> None: + assert ( + BENCHMARK_TASK_EXTRACTORS["swtbench"] + is BENCHMARK_TASK_EXTRACTORS["swebench"] + ) + + +# --------------------------------------------------------------------------- # +# classify_and_route end-to-end # +# --------------------------------------------------------------------------- # + + +class TestClassifyAndRoute: + def test_frontend_routes_to_kimi(self) -> None: + spec = _router_spec() + _attach_stub(spec, "Frontend") + + decision = classify_and_route( + benchmark="swebench", + instance_data={ + "problem_statement": "CSS rendering bug on form button hover" + }, + router=spec, + ) + + assert isinstance(decision, RoutingDecision) + assert decision.category == "Frontend" + assert decision.chosen_model_id == "kimi-k2.6" + assert decision.forced_vision_fallback is False + + def test_issue_resolution_routes_to_minimax(self) -> None: + spec = _router_spec() + _attach_stub(spec, "Issue Resolution (other)") + + decision = classify_and_route( + benchmark="swebench", + instance_data={"problem_statement": "Pagination off-by-one"}, + router=spec, + ) + + assert decision.category == "Issue Resolution (other)" + assert decision.chosen_model_id == "minimax-m2.7" + + def test_information_gathering_routes_to_gpt55(self) -> None: + spec = _router_spec() + _attach_stub(spec, "Information Gathering") + + decision = classify_and_route( + benchmark="gaia", + 
instance_data={"Question": "How many Olympics has Sweden hosted?"}, + router=spec, + ) + + assert decision.category == "Information Gathering" + assert decision.chosen_model_id == "gpt-5.5" + + def test_unparseable_response_falls_back(self) -> None: + spec = _router_spec() + _attach_stub(spec, "Backend Refactor — not a real category") + + decision = classify_and_route( + benchmark="swebench", + instance_data={"problem_statement": "Fix something."}, + router=spec, + ) + + assert decision.category == "UNPARSED" + assert decision.chosen_model_id == "gpt-5.5" + assert "Backend Refactor" in decision.raw_classifier_output + + def test_empty_task_text_skips_classifier(self) -> None: + spec = _router_spec() + stub = _attach_stub(spec, "Frontend") + + decision = classify_and_route( + benchmark="swebench", + instance_data={"problem_statement": ""}, + router=spec, + ) + + assert decision.category == "EMPTY_TASK" + assert decision.chosen_model_id == "gpt-5.5" + assert stub.calls == [], "classifier should not be invoked when task is empty" + + def test_unknown_benchmark_uses_fallback(self) -> None: + spec = _router_spec() + + decision = classify_and_route( + benchmark="not-a-known-benchmark", # type: ignore[arg-type] + instance_data={"problem_statement": "Hello"}, + router=spec, + ) + + assert decision.category == "NO_EXTRACTOR" + assert decision.chosen_model_id == "gpt-5.5" + + def test_classifier_failure_falls_back_gracefully(self) -> None: + spec = _router_spec() + + class _ExplodingClassifier: + def completion(self, **_: Any) -> Any: # noqa: ANN401 + raise RuntimeError("classifier proxy unavailable") + + object.__setattr__(spec, "classifier_llm", _ExplodingClassifier()) + + decision = classify_and_route( + benchmark="swebench", + instance_data={"problem_statement": "Something is broken"}, + router=spec, + ) + + # Empty classifier output → UNPARSED → fallback. 
+ assert decision.category == "UNPARSED" + assert decision.chosen_model_id == "gpt-5.5" + + +class TestVisionFallback: + """When classifier sends an image-bearing instance to a text-only tier, + routing must redirect to a vision-capable tier (e.g. swebenchmultimodal + Frontend → kimi is fine; Issue Resolution → minimax is not because + minimax lacks vision).""" + + def _image_instance(self) -> dict[str, Any]: + return { + "problem_statement": "Backend bug; the screenshot shows the error.", + "image_assets": {"problem_statement": ["https://example.com/a.png"]}, + } + + def test_text_only_tier_for_image_instance_redirects(self) -> None: + spec = _router_spec() + _attach_stub(spec, "Issue Resolution (other)") + + decision = classify_and_route( + benchmark="swebenchmultimodal", + instance_data=self._image_instance(), + router=spec, + ) + + assert decision.forced_vision_fallback is True + assert decision.chosen_model_id in {"gpt-5.5", "kimi-k2.6"} + + def test_vision_tier_keeps_assignment(self) -> None: + spec = _router_spec() + _attach_stub(spec, "Frontend") + + decision = classify_and_route( + benchmark="swebenchmultimodal", + instance_data=self._image_instance(), + router=spec, + ) + + # Frontend → kimi, which is vision-capable; no redirect needed. 
+ assert decision.forced_vision_fallback is False + assert decision.chosen_model_id == "kimi-k2.6" + + def test_text_only_instance_unaffected(self) -> None: + spec = _router_spec() + _attach_stub(spec, "Issue Resolution (other)") + + decision = classify_and_route( + benchmark="swebench", + instance_data={"problem_statement": "Fix the pagination off-by-one"}, + router=spec, + ) + + assert decision.forced_vision_fallback is False + assert decision.chosen_model_id == "minimax-m2.7" + + def test_image_assets_as_json_string(self) -> None: + spec = _router_spec() + _attach_stub(spec, "Issue Resolution (other)") + + instance = { + "problem_statement": "Bug.", + "image_assets": json.dumps( + {"problem_statement": ["https://example.com/a.png"]} + ), + } + + decision = classify_and_route( + benchmark="swebenchmultimodal", + instance_data=instance, + router=spec, + ) + + assert decision.forced_vision_fallback is True