diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..bd9035f
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,38 @@
+# Workflow for running model benchmark tests
+# Can be triggered manually from the GitHub Actions UI
+
+name: Model Benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      test_filter:
+        description: 'Pytest -k filter (e.g., "contextmatch" or leave empty for all)'
+        required: false
+        default: ''
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    name: Run Model Benchmarks
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install the latest version of uv (with Python 3.10)
+        uses: astral-sh/setup-uv@v6
+        with:
+          python-version: "3.10"
+
+      - name: Install the project
+        run: uv sync --all-extras --dev
+
+      - name: Run benchmark tests
+        run: |
+          if [ -n "${{ github.event.inputs.test_filter }}" ]; then
+            uv run --no-sync pytest -m 'model_performance' -k "${{ github.event.inputs.test_filter }}" -v
+          else
+            uv run --no-sync poe test-benchmark
+          fi
+
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9b5210c..fc754d8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.11"]
+        python-version: ["3.10"]
         resolution-strategy: ["highest", "lowest-direct"]
 
     name: Python ${{ matrix.python-version }} (resolution=${{ matrix.resolution-strategy }})
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 94b2360..f3878e9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -81,8 +81,9 @@ Project:
 3. Ensure all linting and tests complete successfully locally before creating a PR:
    ```bash
    uv run poe lint
-   uv run pytest test/my_task_tests.py # Just your tests
-   uv run poe test # (ideally) For all tests
+   uv run pytest tests/my_task_tests.py # Just your tests
+   uv run poe test # Test suite (excludes model benchmarks)
+   uv run poe test-benchmark # Model benchmark tests only
    ```
 4. Having questions? Add them to your Github Issue.
 
@@ -397,40 +398,93 @@ __all__ = [
 
 ### Step 3: Test Your Model
 
+Create a test file in `tests/test_models/`. This file contains both unit tests and (optionally) benchmark validation tests in a single file:
+
 ```python
-# tests/test_my_model.py
+# tests/test_models/test_my_model.py
 import pytest
-import workrb
+from workrb.models.my_model import MyCustomModel
+from workrb.tasks import TechSkillExtractRanking
+from workrb.tasks.abstract.base import DatasetSplit, Language
+from workrb.types import ModelInputType
 
 
-def test_my_model_initialization():
-    """Test model initialization"""
-    model = workrb.models.MyCustomModel("all-MiniLM-L6-v2")
-    assert model.name is not None
+class TestMyCustomModelLoading:
+    """Test model loading and basic properties."""
 
+    def test_model_initialization(self):
+        """Test model initialization"""
+        model = MyCustomModel()
+        assert model.name is not None
 
-def test_my_model_ranking():
-    """Test ranking computation"""
-    model = workrb.models.MyCustomModel("all-MiniLM-L6-v2")
-    from workrb.types import ModelInputType
-
-    queries = ["Software Engineer", "Data Scientist"]
-    targets = ["Python", "Machine Learning", "SQL"]
-
-    scores = model.compute_rankings(
-        queries=queries,
-        targets=targets,
-        query_input_type=ModelInputType.JOB_TITLE,
-        target_input_type=ModelInputType.SKILL_NAME,
-    )
-
-    assert scores.shape == (len(queries), len(targets))
+    def test_model_ranking(self):
+        """Test ranking computation"""
+        model = MyCustomModel()
+        queries = ["Software Engineer", "Data Scientist"]
+        targets = ["Python", "Machine Learning", "SQL"]
+
+        scores = model._compute_rankings(
+            queries=queries,
+            targets=targets,
+            query_input_type=ModelInputType.JOB_TITLE,
+            target_input_type=ModelInputType.SKILL_NAME,
+        )
+
+        assert scores.shape == (len(queries), len(targets))
 ```
 
-### Step 4: Register Your Model (if using registry)
+### Step 4: Validate Model Performance (if prior paper results are available)
+
+If your model has published benchmark results and a compatible (ideally small) dataset is available in WorkRB, add a benchmark validation test **in the same test file**. Mark the benchmark class with `@pytest.mark.model_performance`:
+
+```python
+# tests/test_models/test_my_model.py (continued)
+
+@pytest.mark.model_performance
+class TestMyCustomModelBenchmark:
+    """Validate MyCustomModel against paper-reported metrics."""
 
-If you want your model discoverable via `ModelRegistry.list_available()`, use the `@register_model()` decorator (shown in Step 1).
+    def test_benchmark_metrics(self):
+        """
+        Verify the model achieves results close to paper-reported metrics.
+
+        Paper: "Title" (Venue Year)
+        Reported on [dataset] test set:
+        - MRR: 0.XX
+        - RP@5: XX.X%
+        """
+        model = MyCustomModel()
+        task = TechSkillExtractRanking(split=DatasetSplit.TEST, languages=[Language.EN])
+
+        results = task.evaluate(model=model, metrics=["mrr", "rp@5"], language=Language.EN)
+
+        # Paper-reported values (allow tolerance for minor differences)
+        expected_mrr = 0.55
+        expected_rp5 = 0.60
+
+        assert results["mrr"] == pytest.approx(expected_mrr, abs=0.05)
+        assert results["rp@5"] == pytest.approx(expected_rp5, abs=0.05)
+```
+
+**See [tests/test_models/test_contextmatch_model.py](tests/test_models/test_contextmatch_model.py) for a complete example.**
+
+Tests marked with `@pytest.mark.model_performance` are excluded from `poe test` by default. To run them:
+
+- **Locally**: `uv run poe test-benchmark`
+- **In CI**: contributors can trigger the **Model Benchmarks** workflow manually from GitHub Actions (Actions → Model Benchmarks → Run workflow)
+
+### Step 5: Register Your Model
+
+Use the `@register_model()` decorator (shown in Step 1); this makes your model discoverable via `ModelRegistry.list_available()`.
+
+### Step 6: Document Your Model
+
+Add your model to the **Models** table in [README.md](README.md). You can either:
+
+1. **Manually** add a row to the table with your model's name, description, and whether it supports adaptive targets
+2. **Generate** a table of all registered models using the helper script:
+   ```bash
+   uv run python examples/list_available_tasks_and_models.py
+   ```
 
 ## Adding New Metrics
 
@@ -503,12 +557,17 @@ uv run poe lint
 
 ```bash
 # Run your specific tests only
-uv run pytest test/my_tests.py
+uv run pytest tests/my_tests.py
 
-# Run all tests (can take some time)
+# Run tests with coverage (excludes model benchmarks)
 uv run poe test
+
+# Run model benchmark tests only
+uv run poe test-benchmark
 ```
 
+**Model Performance Tests**: Benchmark tests in `tests/test_models/` that are marked with `@pytest.mark.model_performance` validate model scores against paper-reported results. These are excluded from `poe test` by default.
+
 ### Documentation Standards
 
 - All public functions/classes must have docstrings
diff --git a/README.md b/README.md
index 7587625..f4c6d61 100644
--- a/README.md
+++ b/README.md
@@ -244,6 +244,12 @@ source .venv/bin/activate
 
 # Install the pre-commit hooks
 pre-commit install --install-hooks
+
+# Run tests (excludes model benchmarks by default)
+uv run poe test
+
+# Run model benchmark tests only (checks reproducibility of original results)
+uv run poe test-benchmark
 ```
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 01eefb8..9b0275a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,7 +85,7 @@ skip_covered = true
 
 [tool.coverage.run]  # https://coverage.readthedocs.io/en/latest/config.html#run
 branch = true
-command_line = "--module pytest"
+command_line = "--module pytest -m 'not model_performance'"
 data_file = "reports/.coverage"
 source = ["src"]
 
@@ -96,6 +96,9 @@ output = "reports/coverage.xml"
 addopts = "--color=yes --doctest-modules --exitfirst --failed-first --verbosity=2 --junitxml=reports/pytest.xml"
 testpaths = ["src", "tests"]
 xfail_strict = true
+markers = [
+    "model_performance: marks tests that run heavy model benchmark evaluation (excluded by default; run with poe test-benchmark)",
+]
 
 [tool.ruff]  # https://docs.astral.sh/ruff/settings/
 fix = true
@@ -189,7 +192,7 @@ cmd = """
 """
 
 [tool.poe.tasks.test]
-help = "Test this package"
+help = "Test this package (excludes model_performance benchmarks by default)"
 
 [[tool.poe.tasks.test.sequence]]
 cmd = "coverage run"
@@ -199,3 +202,7 @@ cmd = "coverage report"
 
 [[tool.poe.tasks.test.sequence]]
 cmd = "coverage xml"
+
+[tool.poe.tasks.test-benchmark]
+help = "Run model benchmark tests only (tests marked with model_performance)"
+cmd = "pytest -m 'model_performance' -v"
diff --git a/tests/test_models/__init__.py b/tests/test_models/__init__.py
new file mode 100644
index 0000000..cdda8a1
--- /dev/null
+++ b/tests/test_models/__init__.py
@@ -0,0 +1 @@
+"""Model tests including loading, usage, and benchmark validation."""
diff --git a/tests/test_contextmatch_model.py b/tests/test_models/test_contextmatch_model.py
similarity index 98%
rename from tests/test_contextmatch_model.py
rename to tests/test_models/test_contextmatch_model.py
index 34f9696..5179dae 100644
--- a/tests/test_contextmatch_model.py
+++ b/tests/test_models/test_contextmatch_model.py
@@ -1,4 +1,6 @@
-import pytest  # noqa: D100
+"""Tests for ConTeXTMatchModel: loading, usage, and benchmark validation."""
+
+import pytest
 import torch
 
 from workrb.models.bi_encoder import ConTeXTMatchModel
@@ -111,6 +113,7 @@ def test_compute_classification_default_target_type(self):
         assert torch.isfinite(scores).all()
 
 
+@pytest.mark.model_performance
 class TestConTeXTMatchModelTechSkillExtraction:
     """Test ConTeXTMatchModel performance on TECH skill extraction test set."""
 
diff --git a/tests/test_curriculum_encoder_model.py b/tests/test_models/test_curriculum_encoder_model.py
similarity index 97%
rename from tests/test_curriculum_encoder_model.py
rename to tests/test_models/test_curriculum_encoder_model.py
index 04e50c6..fb736ad 100644
--- a/tests/test_curriculum_encoder_model.py
+++ b/tests/test_models/test_curriculum_encoder_model.py
@@ -1,3 +1,5 @@
+"""Tests for CurriculumMatchModel: loading, usage, and benchmark validation."""
+
 import pytest
 
 from workrb.models.curriculum_encoder import CurriculumMatchModel
@@ -47,6 +49,7 @@ def test_model_classification_label_space(self):
         assert model.classification_label_space is None
 
 
+@pytest.mark.model_performance
 class TestCurriculumMatchModelBenchmark:
     """Test CurriculumMatchModel performance on skill extraction benchmarks."""
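The diff above exposes the benchmark suite through two entry points: the `poe test-benchmark` task and the manually triggered **Model Benchmarks** workflow. Below is a minimal usage sketch of both; the workflow file name `benchmark.yml` and the `test_filter` input come from the diff itself, while the `contextmatch` filter value is only an illustrative example and the `gh` commands assume the GitHub CLI is installed and authenticated:

```bash
# Run the full benchmark suite locally (tests marked model_performance)
uv run poe test-benchmark

# Narrow the run to one model with a pytest -k filter
uv run pytest -m 'model_performance' -k "contextmatch" -v

# Trigger the manual "Model Benchmarks" workflow from the command line
# (equivalent to Actions -> Model Benchmarks -> Run workflow in the UI)
gh workflow run benchmark.yml -f test_filter=contextmatch
```

Because the workflow reads `test_filter` from its `workflow_dispatch` inputs, leaving the field empty falls back to the full `uv run --no-sync poe test-benchmark` run, matching the local default.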