From 97a406937df1e6dc63e2effcbedfb0fa523f230a Mon Sep 17 00:00:00 2001
From: Matthias De Lange
Date: Tue, 6 Jan 2026 18:19:36 +0100
Subject: [PATCH 1/2] refactor: rename evaluate.py to run.py to remove ambiguity with workrb.evaluate function

---
 src/workrb/__init__.py                 |  2 +-
 src/workrb/{evaluate.py => run.py}     | 10 ----------
 tests/test_evaluate_multiple_models.py | 14 +++++++-------
 3 files changed, 8 insertions(+), 18 deletions(-)
 rename src/workrb/{evaluate.py => run.py} (97%)

diff --git a/src/workrb/__init__.py b/src/workrb/__init__.py
index a62feec..6f1770b 100644
--- a/src/workrb/__init__.py
+++ b/src/workrb/__init__.py
@@ -3,7 +3,7 @@
 """

 from workrb import data, metrics, models, tasks
-from workrb.evaluate import (
+from workrb.run import (
     evaluate,
     evaluate_multiple_models,
     get_tasks_overview,
diff --git a/src/workrb/evaluate.py b/src/workrb/run.py
similarity index 97%
rename from src/workrb/evaluate.py
rename to src/workrb/run.py
index 6b9507b..72cf31b 100644
--- a/src/workrb/evaluate.py
+++ b/src/workrb/run.py
@@ -248,16 +248,6 @@ def _get_total_evaluations(tasks: Sequence[Task]) -> int:
     return sum(len(task.languages) for task in tasks)


-def _validate_tasks(tasks: Sequence[Task]):
-    """Validate that all tasks are properly configured."""
-    if not tasks:
-        raise ValueError("At least one task must be provided")
-
-    for task in tasks:
-        if not isinstance(task, Task):
-            raise TypeError(f"All tasks must inherit from Task, got {type(task)}")
-
-
 def _init_checkpointing(
     tasks: Sequence[Task],
     config: BenchmarkConfig,
diff --git a/tests/test_evaluate_multiple_models.py b/tests/test_evaluate_multiple_models.py
index 33eecbb..93801f3 100644
--- a/tests/test_evaluate_multiple_models.py
+++ b/tests/test_evaluate_multiple_models.py
@@ -15,7 +15,6 @@
 import torch

 from tests.test_utils import create_toy_task_class
-from workrb.evaluate import evaluate_multiple_models
 from workrb.models.base import ModelInterface
 from workrb.results import (
     BenchmarkMetadata,
@@ -24,6 +23,7 @@
     TaskResultMetadata,
     TaskResults,
 )
+from workrb.run import evaluate_multiple_models
 from workrb.tasks import SkillMatch1kSkillSimilarityRanking
 from workrb.tasks.abstract.base import DatasetSplit, Language
 from workrb.types import ModelInputType
@@ -118,7 +118,7 @@ def test_evaluate_multiple_models_basic():
     task_name = task.name

     # Mock the evaluate function
-    with patch("workrb.evaluate.evaluate") as mock_evaluate:
+    with patch("workrb.run.evaluate") as mock_evaluate:
         # Set up return values for each model
         mock_evaluate.side_effect = [
             create_mock_results("model1", task_name),
@@ -165,7 +165,7 @@ def test_evaluate_multiple_models_with_additional_kwargs():
     task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN])
     task_name = task.name

-    with patch("workrb.evaluate.evaluate") as mock_evaluate:
+    with patch("workrb.run.evaluate") as mock_evaluate:
         mock_evaluate.return_value = create_mock_results("test_model", task_name)

         results = evaluate_multiple_models(
@@ -227,7 +227,7 @@ def test_evaluate_multiple_models_error_handling():
     task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN])
     task_name = task.name

-    with patch("workrb.evaluate.evaluate") as mock_evaluate:
+    with patch("workrb.run.evaluate") as mock_evaluate:
         # First model succeeds, second fails
         mock_evaluate.side_effect = [
             create_mock_results("model1", task_name),
@@ -256,7 +256,7 @@ def test_evaluate_multiple_models_output_folder_overrides_kwargs():
     task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN])
     task_name = task.name

-    with patch("workrb.evaluate.evaluate") as mock_evaluate:
patch("workrb.evaluate.evaluate") as mock_evaluate: + with patch("workrb.run.evaluate") as mock_evaluate: mock_evaluate.side_effect = [ create_mock_results("model1", task_name), create_mock_results("model2", task_name), @@ -287,7 +287,7 @@ def test_evaluate_multiple_models_single_model(): task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN]) task_name = task.name - with patch("workrb.evaluate.evaluate") as mock_evaluate: + with patch("workrb.run.evaluate") as mock_evaluate: mock_evaluate.return_value = create_mock_results("single_model", task_name) results = evaluate_multiple_models( @@ -307,7 +307,7 @@ def test_evaluate_multiple_models_empty_models_list(): ToyTask = create_toy_task_class(SkillMatch1kSkillSimilarityRanking) task = ToyTask(split=DatasetSplit.VAL, languages=[Language.EN]) - with patch("workrb.evaluate.evaluate") as mock_evaluate: + with patch("workrb.run.evaluate") as mock_evaluate: with pytest.raises(AssertionError) as excinfo: evaluate_multiple_models( models=[], From 76d8b3d8f46aa776b2e07a2773b3249094d28ea1 Mon Sep 17 00:00:00 2001 From: Matthias De Lange Date: Tue, 6 Jan 2026 18:28:12 +0100 Subject: [PATCH 2/2] refactor: move functions centered in run.py for public api to registry.py and results.py. Prioritizes separation of concerns with public api determined in __init__.py --- src/workrb/__init__.py | 10 +++------- src/workrb/registry.py | 5 +++++ src/workrb/results.py | 12 ++++++++++++ src/workrb/run.py | 18 ------------------ 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/src/workrb/__init__.py b/src/workrb/__init__.py index 6f1770b..1e61d30 100644 --- a/src/workrb/__init__.py +++ b/src/workrb/__init__.py @@ -3,13 +3,9 @@ """ from workrb import data, metrics, models, tasks -from workrb.run import ( - evaluate, - evaluate_multiple_models, - get_tasks_overview, - list_available_tasks, - load_results, -) +from workrb.registry import list_available_tasks +from workrb.results import load_results +from workrb.run import evaluate, evaluate_multiple_models, get_tasks_overview __all__ = [ "data", diff --git a/src/workrb/registry.py b/src/workrb/registry.py index 68ca2a0..dedd8a8 100644 --- a/src/workrb/registry.py +++ b/src/workrb/registry.py @@ -85,6 +85,11 @@ def auto_discover(cls): importlib.import_module(modname) +def list_available_tasks() -> dict[str, str]: + """List all available task classes that can be used in configs.""" + return TaskRegistry.list_available() + + def register_task(name: str | None = None): """ Decorator registering a task class. diff --git a/src/workrb/results.py b/src/workrb/results.py index 937616e..707039f 100644 --- a/src/workrb/results.py +++ b/src/workrb/results.py @@ -1,3 +1,4 @@ +import json import pprint from collections import defaultdict from typing import Any @@ -355,3 +356,14 @@ def _get_flat_dataframe(self) -> pd.DataFrame: ) return pd.DataFrame(data) + + +def load_results(results_path: str = "./results.json") -> BenchmarkResults: + """ + Load results from specified folder. + + Useful for external usage of the results, when only the folder is available. + """ + with open(results_path) as f: + data = json.load(f) + return BenchmarkResults.model_validate(data) diff --git a/src/workrb/run.py b/src/workrb/run.py index 72cf31b..1770512 100644 --- a/src/workrb/run.py +++ b/src/workrb/run.py @@ -5,7 +5,6 @@ checkpointing, resuming, and efficient multi-model evaluation. 
""" -import json import logging import time from collections import Counter @@ -16,7 +15,6 @@ from workrb.logging import setup_logger from workrb.metrics.reporting import format_results from workrb.models.base import ModelInterface -from workrb.registry import TaskRegistry from workrb.results import ( BenchmarkMetadata, BenchmarkResults, @@ -219,22 +217,6 @@ def get_tasks_overview(tasks: Sequence[Task]) -> str: return "\n".join(lines) -def load_results(results_path: str = "./results.json") -> BenchmarkResults: - """ - Load results from specified folder. - - Useful for external usage of the results, when only the folder is available. - """ - with open(results_path) as f: - data = json.load(f) - return BenchmarkResults.model_validate(data) - - -def list_available_tasks() -> dict[str, str]: - """List all available task classes that can be used in configs.""" - return TaskRegistry.list_available() - - def _get_all_languages(tasks: Sequence[Task]) -> list[str]: """Get all unique languages across tasks.""" languages = set()