From 97a406937df1e6dc63e2effcbedfb0fa523f230a Mon Sep 17 00:00:00 2001
From: Matthias De Lange
Date: Tue, 6 Jan 2026 18:19:36 +0100
Subject: [PATCH 1/2] refactor: rename evaluate.py to run.py to remove ambiguity with workrb.evaluate function

---
 src/workrb/__init__.py                 |  2 +-
 src/workrb/{evaluate.py => run.py}     | 10 ----------
 tests/test_evaluate_multiple_models.py | 14 +++++++-------
 3 files changed, 8 insertions(+), 18 deletions(-)
 rename src/workrb/{evaluate.py => run.py} (97%)

diff --git a/src/workrb/__init__.py b/src/workrb/__init__.py
index a62feec..6f1770b 100644
--- a/src/workrb/__init__.py
+++ b/src/workrb/__init__.py
@@ -3,7 +3,7 @@
 """

 from workrb import data, metrics, models, tasks
-from workrb.evaluate import (
+from workrb.run import (
     evaluate,
     evaluate_multiple_models,
     get_tasks_overview,
diff --git a/src/workrb/evaluate.py b/src/workrb/run.py
similarity index 97%
rename from src/workrb/evaluate.py
rename to src/workrb/run.py
index 6b9507b..72cf31b 100644
--- a/src/workrb/evaluate.py
+++ b/src/workrb/run.py
@@ -248,16 +248,6 @@ def _get_total_evaluations(tasks: Sequence[Task]) -> int:
     return sum(len(task.languages) for task in tasks)


-def _validate_tasks(tasks: Sequence[Task]):
-    """Validate that all tasks are properly configured."""
-    if not tasks:
-        raise ValueError("At least one task must be provided")
-
-    for task in tasks:
-        if not isinstance(task, Task):
-            raise TypeError(f"All tasks must inherit from Task, got {type(task)}")
-
-
 def _init_checkpointing(
     tasks: Sequence[Task],
     config: BenchmarkConfig,
diff --git a/tests/test_evaluate_multiple_models.py b/tests/test_evaluate_multiple_models.py
index 33eecbb..93801f3 100644
--- a/tests/test_evaluate_multiple_models.py
+++ b/tests/test_evaluate_multiple_models.py
@@ -15,7 +15,6 @@
 import torch

 from tests.test_utils import create_toy_task_class
-from workrb.evaluate import evaluate_multiple_models
 from workrb.models.base import ModelInterface
 from workrb.results import (
     BenchmarkMetadata,
@@ -24,6 +23,7 @@
     TaskResultMetadata,
     TaskResults,
 )
+from workrb.run import evaluate_multiple_models
 from workrb.tasks import SkillMatch1kSkillSimilarityRanking
 from workrb.tasks.abstract.base import DatasetSplit, Language
 from workrb.types import ModelInputType
@@ -118,7 +118,7 @@ def test_evaluate_multiple_models_basic():
     task_name = task.name

     # Mock the evaluate function
-    with patch("workrb.evaluate.evaluate") as mock_evaluate:
+    with patch("workrb.run.evaluate") as mock_evaluate:
         # Set up return values for each model
         mock_evaluate.side_effect = [
             create_mock_results("model1", task_name),
@@ -165,7 +165,7 @@ def test_evaluate_multiple_models_with_additional_kwargs():
     task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN])
     task_name = task.name

-    with patch("workrb.evaluate.evaluate") as mock_evaluate:
+    with patch("workrb.run.evaluate") as mock_evaluate:
         mock_evaluate.return_value = create_mock_results("test_model", task_name)

         results = evaluate_multiple_models(
@@ -227,7 +227,7 @@ def test_evaluate_multiple_models_error_handling():
     task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN])
     task_name = task.name

-    with patch("workrb.evaluate.evaluate") as mock_evaluate:
+    with patch("workrb.run.evaluate") as mock_evaluate:
         # First model succeeds, second fails
         mock_evaluate.side_effect = [
             create_mock_results("model1", task_name),
@@ -256,7 +256,7 @@ def test_evaluate_multiple_models_output_folder_overrides_kwargs():
     task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN])
     task_name = task.name

-    with patch("workrb.evaluate.evaluate") as mock_evaluate:
patch("workrb.evaluate.evaluate") as mock_evaluate: + with patch("workrb.run.evaluate") as mock_evaluate: mock_evaluate.side_effect = [ create_mock_results("model1", task_name), create_mock_results("model2", task_name), @@ -287,7 +287,7 @@ def test_evaluate_multiple_models_single_model(): task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN]) task_name = task.name - with patch("workrb.evaluate.evaluate") as mock_evaluate: + with patch("workrb.run.evaluate") as mock_evaluate: mock_evaluate.return_value = create_mock_results("single_model", task_name) results = evaluate_multiple_models( @@ -307,7 +307,7 @@ def test_evaluate_multiple_models_empty_models_list(): ToyTask = create_toy_task_class(SkillMatch1kSkillSimilarityRanking) task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN]) - with patch("workrb.evaluate.evaluate") as mock_evaluate: + with patch("workrb.run.evaluate") as mock_evaluate: with pytest.raises(AssertionError) as excinfo: evaluate_multiple_models( models=[], From 76d8b3d8f46aa776b2e07a2773b3249094d28ea1 Mon Sep 17 00:00:00 2001 From: Matthias De Lange Date: Tue, 6 Jan 2026 18:28:12 +0100 Subject: [PATCH 2/2] refactor: move functions centered in run.py for public api to registry.py and results.py. Prioritizes separation of concerns with public api determined in __init__.py --- src/workrb/__init__.py | 10 +++------- src/workrb/registry.py | 5 +++++ src/workrb/results.py | 12 ++++++++++++ src/workrb/run.py | 18 ------------------ 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/src/workrb/__init__.py b/src/workrb/__init__.py index 6f1770b..1e61d30 100644 --- a/src/workrb/__init__.py +++ b/src/workrb/__init__.py @@ -3,13 +3,9 @@ """ from workrb import data, metrics, models, tasks -from workrb.run import ( - evaluate, - evaluate_multiple_models, - get_tasks_overview, - list_available_tasks, - load_results, -) +from workrb.registry import list_available_tasks +from workrb.results import load_results +from workrb.run import evaluate, evaluate_multiple_models, get_tasks_overview __all__ = [ "data", diff --git a/src/workrb/registry.py b/src/workrb/registry.py index 68ca2a0..dedd8a8 100644 --- a/src/workrb/registry.py +++ b/src/workrb/registry.py @@ -85,6 +85,11 @@ def auto_discover(cls): importlib.import_module(modname) +def list_available_tasks() -> dict[str, str]: + """List all available task classes that can be used in configs.""" + return TaskRegistry.list_available() + + def register_task(name: str | None = None): """ Decorator registering a task class. diff --git a/src/workrb/results.py b/src/workrb/results.py index 937616e..707039f 100644 --- a/src/workrb/results.py +++ b/src/workrb/results.py @@ -1,3 +1,4 @@ +import json import pprint from collections import defaultdict from typing import Any @@ -355,3 +356,14 @@ def _get_flat_dataframe(self) -> pd.DataFrame: ) return pd.DataFrame(data) + + +def load_results(results_path: str = "./results.json") -> BenchmarkResults: + """ + Load results from specified folder. + + Useful for external usage of the results, when only the folder is available. + """ + with open(results_path) as f: + data = json.load(f) + return BenchmarkResults.model_validate(data) diff --git a/src/workrb/run.py b/src/workrb/run.py index 72cf31b..1770512 100644 --- a/src/workrb/run.py +++ b/src/workrb/run.py @@ -5,7 +5,6 @@ checkpointing, resuming, and efficient multi-model evaluation. 
""" -import json import logging import time from collections import Counter @@ -16,7 +15,6 @@ from workrb.logging import setup_logger from workrb.metrics.reporting import format_results from workrb.models.base import ModelInterface -from workrb.registry import TaskRegistry from workrb.results import ( BenchmarkMetadata, BenchmarkResults, @@ -219,22 +217,6 @@ def get_tasks_overview(tasks: Sequence[Task]) -> str: return "\n".join(lines) -def load_results(results_path: str = "./results.json") -> BenchmarkResults: - """ - Load results from specified folder. - - Useful for external usage of the results, when only the folder is available. - """ - with open(results_path) as f: - data = json.load(f) - return BenchmarkResults.model_validate(data) - - -def list_available_tasks() -> dict[str, str]: - """List all available task classes that can be used in configs.""" - return TaskRegistry.list_available() - - def _get_all_languages(tasks: Sequence[Task]) -> list[str]: """Get all unique languages across tasks.""" languages = set()