From 102e1cf69626907ba9de907a0088dbe9e9dbc505 Mon Sep 17 00:00:00 2001
From: Matthias De Lange
Date: Fri, 9 Jan 2026 11:22:17 +0100
Subject: [PATCH 1/3] test: refactor tests to exclude benchmark validation of models

Reproducing results for models on a benchmark is now in a separate GitHub
workflow that can be triggered manually and is excluded by default, to avoid
bloating the test suite as the number of models grows.
---
 .github/workflows/benchmark.yml      | 38 ++++++++++++
 .github/workflows/test.yml           |  2 +-
 CONTRIBUTING.md                      | 61 +++++++++++++++++--
 README.md                            |  6 ++
 pyproject.toml                       | 11 +++-
 tests/test_models/__init__.py        |  1 +
 .../test_contextmatch_model.py       |  1 +
 .../test_curriculum_encoder_model.py |  3 +-
 8 files changed, 115 insertions(+), 8 deletions(-)
 create mode 100644 .github/workflows/benchmark.yml
 create mode 100644 tests/test_models/__init__.py
 rename tests/{ => test_models}/test_contextmatch_model.py (99%)
 rename tests/{ => test_models}/test_curriculum_encoder_model.py (98%)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..bd9035f
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,38 @@
+# Workflow for running model benchmark tests
+# Can be triggered manually from GitHub Actions UI
+
+name: Model Benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      test_filter:
+        description: 'Pytest -k filter (e.g., "contextmatch" or leave empty for all)'
+        required: false
+        default: ''
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    name: Run Model Benchmarks
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install the latest version of uv (with Python 3.10)
+        uses: astral-sh/setup-uv@v6
+        with:
+          python-version: "3.10"
+
+      - name: Install the project
+        run: uv sync --all-extras --dev
+
+      - name: Run benchmark tests
+        run: |
+          if [ -n "${{ github.event.inputs.test_filter }}" ]; then
+            uv run --no-sync pytest -m 'model_performance' -k "${{ github.event.inputs.test_filter }}" -v
+          else
+            uv run --no-sync poe test-benchmark
+          fi
+
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9b5210c..fc754d8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        python-version: ["3.11"]
+        python-version: ["3.10"]
         resolution-strategy: ["highest", "lowest-direct"]
 
     name: Python ${{ matrix.python-version }} (resolution=${{ matrix.resolution-strategy }})
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 94b2360..d0feed6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -81,8 +81,9 @@ Project:
 3. Ensure all linting and tests complete successfully locally before creating a PR:
    ```bash
    uv run poe lint
-   uv run pytest test/my_task_tests.py # Just your tests
-   uv run poe test # (ideally) For all tests
+   uv run pytest tests/my_task_tests.py # Just your tests
+   uv run poe test # Test suite (excludes model benchmarks)
+   uv run poe test-benchmark # Model benchmark tests only
    ```
 4. Having questions? Add them to your Github Issue.
 
@@ -432,6 +433,53 @@ def test_my_model_ranking():
 
 If you want your model discoverable via `ModelRegistry.list_available()`, use the `@register_model()` decorator (shown in Step 1).
 
+### Step 5: Validate Model Performance (if prior paper results available)
+
+If your model has published benchmark results and a compatible (ideally small) dataset is available in WorkRB, add a performance validation test. This ensures your model reproduces expected results.
+ +Create a test in `tests/test_models/` and mark the benchmark class with `@pytest.mark.model_performance`: + +```python +# tests/test_models/test_my_model.py + +import pytest +from workrb.models.my_model import MyCustomModel +from workrb.tasks import TechSkillExtractRanking # or relevant task +from workrb.tasks.abstract.base import DatasetSplit, Language + + +@pytest.mark.model_performance +class TestMyCustomModelBenchmark: + """Validate MyCustomModel against paper-reported metrics.""" + + def test_benchmark_metrics(self): + """ + Verify model achieves results close to paper-reported metrics. + + Paper: "Title" (Venue Year) + Reported on [dataset] test set: + - MRR: 0.XX + - RP@5: XX.X% + """ + model = MyCustomModel() + task = TechSkillExtractRanking(split=DatasetSplit.TEST, languages=[Language.EN]) + + results = task.evaluate(model=model, metrics=["mrr", "rp@5"], language=Language.EN) + + # Paper-reported values (allow tolerance for minor differences) + expected_mrr = 0.55 + expected_rp5 = 0.60 + + assert results["mrr"] == pytest.approx(expected_mrr, abs=0.05) + assert results["rp@5"] == pytest.approx(expected_rp5, abs=0.05) +``` + +**See [tests/test_models/test_contextmatch_model.py](tests/test_models/test_contextmatch_model.py) for a complete example.** + +Tests marked with `@pytest.mark.model_performance` are excluded from `poe test` by default. To run them: +- **Locally**: `uv run poe test-benchmark` +- **In CI**: Contributors can trigger the **Model Benchmarks** workflow manually from GitHub Actions (Actions → Model Benchmarks → Run workflow) + ## Adding New Metrics To add new evaluation metrics: @@ -503,12 +551,17 @@ uv run poe lint ```bash # Run your specific tests only -uv run pytest test/my_tests.py +uv run pytest tests/my_tests.py -# Run all tests (can take some time) +# Run tests with coverage (excludes model benchmarks) uv run poe test + +# Run model benchmark tests only +uv run poe test-benchmark ``` +**Model Performance Tests**: Benchmark tests in `tests/test_models/` that are marked with `@pytest.mark.model_performance` validate model scores against paper-reported results. These are excluded from `poe test` by default. 
+
 ### Documentation Standards
 
 - All public functions/classes must have docstrings
diff --git a/README.md b/README.md
index 7587625..f4c6d61 100644
--- a/README.md
+++ b/README.md
@@ -244,6 +244,12 @@ source .venv/bin/activate
 
 # Install the pre-commit hooks
 pre-commit install --install-hooks
+
+# Run tests (excludes model benchmarking by default)
+uv run poe test
+
+# Run model benchmark tests only (checks reproducibility of original results)
+uv run poe test-benchmark
 ```
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 01eefb8..9b0275a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,7 +85,7 @@ skip_covered = true
 
 [tool.coverage.run]  # https://coverage.readthedocs.io/en/latest/config.html#run
 branch = true
-command_line = "--module pytest"
+command_line = "--module pytest -m 'not model_performance'"
 data_file = "reports/.coverage"
 source = ["src"]
 
@@ -96,6 +96,9 @@ output = "reports/coverage.xml"
 addopts = "--color=yes --doctest-modules --exitfirst --failed-first --verbosity=2 --junitxml=reports/pytest.xml"
 testpaths = ["src", "tests"]
 xfail_strict = true
+markers = [
+    "model_performance: marks tests that run heavy model benchmark evaluation (excluded by default, use poe test-benchmark)",
+]
 
 [tool.ruff]  # https://docs.astral.sh/ruff/settings/
 fix = true
@@ -189,7 +192,7 @@ cmd = """
 """
 
 [tool.poe.tasks.test]
-help = "Test this package"
+help = "Test this package (excludes model_performance benchmarks by default)"
 
 [[tool.poe.tasks.test.sequence]]
 cmd = "coverage run"
@@ -199,3 +202,7 @@ cmd = "coverage report"
 
 [[tool.poe.tasks.test.sequence]]
 cmd = "coverage xml"
+
+[tool.poe.tasks.test-benchmark]
+help = "Run model benchmark tests only (model_performance marked tests)"
+cmd = "pytest -m 'model_performance' -v"
diff --git a/tests/test_models/__init__.py b/tests/test_models/__init__.py
new file mode 100644
index 0000000..cdda8a1
--- /dev/null
+++ b/tests/test_models/__init__.py
@@ -0,0 +1 @@
+"""Model tests including loading, usage, and benchmark validation."""
diff --git a/tests/test_contextmatch_model.py b/tests/test_models/test_contextmatch_model.py
similarity index 99%
rename from tests/test_contextmatch_model.py
rename to tests/test_models/test_contextmatch_model.py
index 34f9696..f7fe99c 100644
--- a/tests/test_contextmatch_model.py
+++ b/tests/test_models/test_contextmatch_model.py
@@ -111,6 +111,7 @@ def test_compute_classification_default_target_type(self):
         assert torch.isfinite(scores).all()
 
 
+@pytest.mark.model_performance
 class TestConTeXTMatchModelTechSkillExtraction:
     """Test ConTeXTMatchModel performance on TECH skill extraction test set."""
 
diff --git a/tests/test_curriculum_encoder_model.py b/tests/test_models/test_curriculum_encoder_model.py
similarity index 98%
rename from tests/test_curriculum_encoder_model.py
rename to tests/test_models/test_curriculum_encoder_model.py
index 04e50c6..f819784 100644
--- a/tests/test_curriculum_encoder_model.py
+++ b/tests/test_models/test_curriculum_encoder_model.py
@@ -1,4 +1,4 @@
-import pytest
+import pytest  # noqa: D100
 
 from workrb.models.curriculum_encoder import CurriculumMatchModel
 from workrb.tasks import TechSkillExtractRanking
@@ -47,6 +47,7 @@ def test_model_classification_label_space(self):
         assert model.classification_label_space is None
 
 
+@pytest.mark.model_performance
 class TestCurriculumMatchModelBenchmark:
     """Test CurriculumMatchModel performance on skill extraction benchmarks."""
 

From 687b28b1c2a0a417d038566dd5cc73c03b87260b Mon Sep 17 00:00:00 2001
From: Matthias De Lange
Date: Fri, 9 Jan 2026 11:36:33 +0100
Subject: [PATCH 2/3] docs: CONTRIBUTING model guideline update --- CONTRIBUTING.md | 80 ++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d0feed6..f3878e9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -398,55 +398,48 @@ __all__ = [ ### Step 3: Test Your Model +Create a test file in `tests/test_models/`. This file contains both unit tests and (optionally) benchmark validation tests in a single file: + ```python -# tests/test_my_model.py +# tests/test_models/test_my_model.py import pytest -import workrb - - -def test_my_model_initialization(): - """Test model initialization""" - model = workrb.models.MyCustomModel("all-MiniLM-L6-v2") - assert model.name is not None +from workrb.models.my_model import MyCustomModel +from workrb.tasks import TechSkillExtractRanking +from workrb.tasks.abstract.base import DatasetSplit, Language +from workrb.types import ModelInputType -def test_my_model_ranking(): - """Test ranking computation""" - model = workrb.models.MyCustomModel("all-MiniLM-L6-v2") - from workrb.types import ModelInputType - - queries = ["Software Engineer", "Data Scientist"] - targets = ["Python", "Machine Learning", "SQL"] - - scores = model.compute_rankings( - queries=queries, - targets=targets, - query_input_type=ModelInputType.JOB_TITLE, - target_input_type=ModelInputType.SKILL_NAME, - ) - - assert scores.shape == (len(queries), len(targets)) -``` +class TestMyCustomModelLoading: + """Test model loading and basic properties.""" -### Step 4: Register Your Model (if using registry) + def test_model_initialization(self): + """Test model initialization""" + model = MyCustomModel() + assert model.name is not None -If you want your model discoverable via `ModelRegistry.list_available()`, use the `@register_model()` decorator (shown in Step 1). + def test_model_ranking(self): + """Test ranking computation""" + model = MyCustomModel() + queries = ["Software Engineer", "Data Scientist"] + targets = ["Python", "Machine Learning", "SQL"] + + scores = model._compute_rankings( + queries=queries, + targets=targets, + query_input_type=ModelInputType.JOB_TITLE, + target_input_type=ModelInputType.SKILL_NAME, + ) -### Step 5: Validate Model Performance (if prior paper results available) + assert scores.shape == (len(queries), len(targets)) +``` -If your model has published benchmark results and a compatible (ideally small) dataset is available in WorkRB, add a performance validation test. This ensures your model reproduces expected results. +### Step 4: Validate Model Performance (if prior paper results available) -Create a test in `tests/test_models/` and mark the benchmark class with `@pytest.mark.model_performance`: +If your model has published benchmark results and a compatible (ideally small) dataset is available in WorkRB, add a benchmark validation test **in the same test file**. 
Mark the benchmark class with `@pytest.mark.model_performance`:
 
 ```python
-# tests/test_models/test_my_model.py
-
-import pytest
-from workrb.models.my_model import MyCustomModel
-from workrb.tasks import TechSkillExtractRanking  # or relevant task
-from workrb.tasks.abstract.base import DatasetSplit, Language
-
+# tests/test_models/test_my_model.py (continued)
 
 @pytest.mark.model_performance
 class TestMyCustomModelBenchmark:
@@ -480,6 +473,19 @@ Tests marked with `@pytest.mark.model_performance` are excluded from `poe test`
 - **Locally**: `uv run poe test-benchmark`
 - **In CI**: Contributors can trigger the **Model Benchmarks** workflow manually from GitHub Actions (Actions → Model Benchmarks → Run workflow)
 
+### Step 5: Register Your Model
+Make sure to use the `@register_model()` decorator (shown in Step 1); this makes your model discoverable via `ModelRegistry.list_available()`.
+
+### Step 6: Document Your Model
+
+Add your model to the **Models** table in [README.md](README.md). You can either:
+
+1. **Manually** add a row to the table with your model's name, description, and whether it supports adaptive targets
+2. **Generate** a table of all registered models using the helper script:
+   ```bash
+   uv run python examples/list_available_tasks_and_models.py
+   ```
+
 ## Adding New Metrics
 
 To add new evaluation metrics:

From 39705068884057ca078bf597a550365b01d1a6cc Mon Sep 17 00:00:00 2001
From: Matthias De Lange
Date: Fri, 9 Jan 2026 11:39:28 +0100
Subject: [PATCH 3/3] chore: remove noqa from model tests

---
 tests/test_models/test_contextmatch_model.py       | 4 +++-
 tests/test_models/test_curriculum_encoder_model.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/test_models/test_contextmatch_model.py b/tests/test_models/test_contextmatch_model.py
index f7fe99c..5179dae 100644
--- a/tests/test_models/test_contextmatch_model.py
+++ b/tests/test_models/test_contextmatch_model.py
@@ -1,4 +1,6 @@
-import pytest  # noqa: D100
+"""Tests for ConTeXTMatchModel: loading, usage, and benchmark validation."""
+
+import pytest
 import torch
 
 from workrb.models.bi_encoder import ConTeXTMatchModel
diff --git a/tests/test_models/test_curriculum_encoder_model.py b/tests/test_models/test_curriculum_encoder_model.py
index f819784..fb736ad 100644
--- a/tests/test_models/test_curriculum_encoder_model.py
+++ b/tests/test_models/test_curriculum_encoder_model.py
@@ -1,4 +1,6 @@
-import pytest  # noqa: D100
+"""Tests for CurriculumMatchModel: loading, usage, and benchmark validation."""
+
+import pytest
 
 from workrb.models.curriculum_encoder import CurriculumMatchModel
 from workrb.tasks import TechSkillExtractRanking
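
For contributors who prefer to drive the same test selection from a script rather than through `poe`, the snippet below is a minimal sketch using pytest's Python API. It only relies on the `model_performance` marker and the `tests/test_models/` layout introduced in this patch series; the file name `run_benchmarks.py` is illustrative and not part of the patches.

```python
# run_benchmarks.py: hypothetical helper, not included in the patches above.
# Mirrors `uv run poe test-benchmark`: run only the tests decorated with
# @pytest.mark.model_performance (the heavy model benchmark validations).
import sys

import pytest

if __name__ == "__main__":
    # "-m model_performance" selects the benchmark-marked tests;
    # use "-m 'not model_performance'" for the regular suite instead.
    sys.exit(pytest.main(["-m", "model_performance", "-v", "tests/test_models"]))
```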