diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..bd9035f
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,38 @@
+# Workflow for running model benchmark tests
+# Can be triggered manually from the GitHub Actions UI
+
+name: Model Benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      test_filter:
+        description: 'Pytest -k filter (e.g., "contextmatch" or leave empty for all)'
+        required: false
+        default: ''
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    name: Run Model Benchmarks
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install the latest version of uv (with Python 3.10)
+        uses: astral-sh/setup-uv@v6
+        with:
+          python-version: "3.10"
+
+      - name: Install the project
+        run: uv sync --all-extras --dev
+
+      - name: Run benchmark tests
+        run: |
+          if [ -n "${{ github.event.inputs.test_filter }}" ]; then
+            uv run --no-sync pytest -m 'model_performance' -k "${{ github.event.inputs.test_filter }}" -v
+          else
+            uv run --no-sync poe test-benchmark
+          fi
+
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9b5210c..fc754d8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.11"]
+        python-version: ["3.10"]
         resolution-strategy: ["highest", "lowest-direct"]
 
     name: Python ${{ matrix.python-version }} (resolution=${{ matrix.resolution-strategy }})
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 94b2360..f3878e9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -81,8 +81,9 @@ Project:
 3. Ensure all linting and tests complete successfully locally before creating a PR:
    ```bash
    uv run poe lint
-   uv run pytest test/my_task_tests.py # Just your tests
-   uv run poe test # (ideally) For all tests
+   uv run pytest tests/my_task_tests.py # Just your tests
+   uv run poe test # Test suite (excludes model benchmarks)
+   uv run poe test-benchmark # Model benchmark tests only
    ```
 4. Having questions? Add them to your Github Issue.
 
@@ -397,40 +398,93 @@ __all__ = [
 
 ### Step 3: Test Your Model
 
+Create a test file in `tests/test_models/`. This file contains both unit tests and (optionally) benchmark validation tests in a single file:
+
 ```python
-# tests/test_my_model.py
+# tests/test_models/test_my_model.py
 import pytest
-import workrb
+from workrb.models.my_model import MyCustomModel
+from workrb.tasks import TechSkillExtractRanking
+from workrb.tasks.abstract.base import DatasetSplit, Language
+from workrb.types import ModelInputType
 
 
-def test_my_model_initialization():
-    """Test model initialization"""
-    model = workrb.models.MyCustomModel("all-MiniLM-L6-v2")
-    assert model.name is not None
+class TestMyCustomModelLoading:
+    """Test model loading and basic properties."""
 
+    def test_model_initialization(self):
+        """Test model initialization"""
+        model = MyCustomModel()
+        assert model.name is not None
 
-def test_my_model_ranking():
-    """Test ranking computation"""
-    model = workrb.models.MyCustomModel("all-MiniLM-L6-v2")
-    from workrb.types import ModelInputType
-
-    queries = ["Software Engineer", "Data Scientist"]
-    targets = ["Python", "Machine Learning", "SQL"]
-
-    scores = model.compute_rankings(
-        queries=queries,
-        targets=targets,
-        query_input_type=ModelInputType.JOB_TITLE,
-        target_input_type=ModelInputType.SKILL_NAME,
-    )
-
-    assert scores.shape == (len(queries), len(targets))
+    def test_model_ranking(self):
+        """Test ranking computation"""
+        model = MyCustomModel()
+        queries = ["Software Engineer", "Data Scientist"]
+        targets = ["Python", "Machine Learning", "SQL"]
+
+        scores = model._compute_rankings(
+            queries=queries,
+            targets=targets,
+            query_input_type=ModelInputType.JOB_TITLE,
+            target_input_type=ModelInputType.SKILL_NAME,
+        )
+
+        assert scores.shape == (len(queries), len(targets))
 ```
 
-### Step 4: Register Your Model (if using registry)
+### Step 4: Validate Model Performance (if prior paper results are available)
+
+If your model has published benchmark results and a compatible (ideally small) dataset is available in WorkRB, add a benchmark validation test **in the same test file**. Mark the benchmark class with `@pytest.mark.model_performance`:
+
+```python
+# tests/test_models/test_my_model.py (continued)
+
+@pytest.mark.model_performance
+class TestMyCustomModelBenchmark:
+    """Validate MyCustomModel against paper-reported metrics."""
 
-If you want your model discoverable via `ModelRegistry.list_available()`, use the `@register_model()` decorator (shown in Step 1).
+    def test_benchmark_metrics(self):
+        """
+        Verify the model achieves results close to paper-reported metrics.
+
+        Paper: "Title" (Venue Year)
+        Reported on [dataset] test set:
+        - MRR: 0.XX
+        - RP@5: XX.X%
+        """
+        model = MyCustomModel()
+        task = TechSkillExtractRanking(split=DatasetSplit.TEST, languages=[Language.EN])
+
+        results = task.evaluate(model=model, metrics=["mrr", "rp@5"], language=Language.EN)
+
+        # Paper-reported values (allow tolerance for minor differences)
+        expected_mrr = 0.55
+        expected_rp5 = 0.60
+
+        assert results["mrr"] == pytest.approx(expected_mrr, abs=0.05)
+        assert results["rp@5"] == pytest.approx(expected_rp5, abs=0.05)
+```
+
+**See [tests/test_models/test_contextmatch_model.py](tests/test_models/test_contextmatch_model.py) for a complete example.**
+
+Tests marked with `@pytest.mark.model_performance` are excluded from `poe test` by default. To run them:
+
+- **Locally**: `uv run poe test-benchmark`
+- **In CI**: contributors can trigger the **Model Benchmarks** workflow manually from GitHub Actions (Actions → Model Benchmarks → Run workflow)
+
+### Step 5: Register Your Model
+
+Use the `@register_model()` decorator (shown in Step 1); this makes your model discoverable via `ModelRegistry.list_available()`.
+
+### Step 6: Document Your Model
+
+Add your model to the **Models** table in [README.md](README.md). You can either:
+
+1. **Manually** add a row to the table with your model's name, description, and whether it supports adaptive targets
+2. **Generate** a table of all registered models using the helper script:
+   ```bash
+   uv run python examples/list_available_tasks_and_models.py
+   ```
 
 ## Adding New Metrics
 
@@ -503,12 +557,17 @@ uv run poe lint
 
 ```bash
 # Run your specific tests only
-uv run pytest test/my_tests.py
+uv run pytest tests/my_tests.py
 
-# Run all tests (can take some time)
+# Run tests with coverage (excludes model benchmarks)
 uv run poe test
+
+# Run model benchmark tests only
+uv run poe test-benchmark
 ```
 
+**Model Performance Tests**: Benchmark tests in `tests/test_models/` that are marked with `@pytest.mark.model_performance` validate model scores against paper-reported results. These are excluded from `poe test` by default.
+
 ### Documentation Standards
 
 - All public functions/classes must have docstrings
diff --git a/README.md b/README.md
index 7587625..f4c6d61 100644
--- a/README.md
+++ b/README.md
@@ -244,6 +244,12 @@ source .venv/bin/activate
 
 # Install the pre-commit hooks
 pre-commit install --install-hooks
+
+# Run tests (excludes model benchmarks by default)
+uv run poe test
+
+# Run model benchmark tests only (checks reproducibility of original results)
+uv run poe test-benchmark
 ```
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 01eefb8..9b0275a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,7 +85,7 @@ skip_covered = true
 
 [tool.coverage.run]  # https://coverage.readthedocs.io/en/latest/config.html#run
 branch = true
-command_line = "--module pytest"
+command_line = "--module pytest -m 'not model_performance'"
 data_file = "reports/.coverage"
 source = ["src"]
 
@@ -96,6 +96,9 @@ output = "reports/coverage.xml"
 addopts = "--color=yes --doctest-modules --exitfirst --failed-first --verbosity=2 --junitxml=reports/pytest.xml"
 testpaths = ["src", "tests"]
 xfail_strict = true
+markers = [
+    "model_performance: marks tests that run heavy model benchmark evaluation (excluded by default; run with poe test-benchmark)",
+]
 
 [tool.ruff]  # https://docs.astral.sh/ruff/settings/
 fix = true
@@ -189,7 +192,7 @@ cmd = """
 """
 
 [tool.poe.tasks.test]
-help = "Test this package"
+help = "Test this package (excludes model_performance benchmarks by default)"
 
 [[tool.poe.tasks.test.sequence]]
 cmd = "coverage run"
@@ -199,3 +202,7 @@ cmd = "coverage report"
 
 [[tool.poe.tasks.test.sequence]]
 cmd = "coverage xml"
+
+[tool.poe.tasks.test-benchmark]
+help = "Run model benchmark tests only (tests marked with model_performance)"
+cmd = "pytest -m 'model_performance' -v"
diff --git a/tests/test_models/__init__.py b/tests/test_models/__init__.py
new file mode 100644
index 0000000..cdda8a1
--- /dev/null
+++ b/tests/test_models/__init__.py
@@ -0,0 +1 @@
+"""Model tests including loading, usage, and benchmark validation."""
diff --git a/tests/test_contextmatch_model.py b/tests/test_models/test_contextmatch_model.py
similarity index 98%
rename from tests/test_contextmatch_model.py
rename to tests/test_models/test_contextmatch_model.py
index 34f9696..5179dae 100644
--- a/tests/test_contextmatch_model.py
+++ b/tests/test_models/test_contextmatch_model.py
@@ -1,4 +1,6 @@
-import pytest  # noqa: D100
+"""Tests for ConTeXTMatchModel: loading, usage, and benchmark validation."""
+
+import pytest
 import torch
 
 from workrb.models.bi_encoder import ConTeXTMatchModel
@@ -111,6 +113,7 @@ def test_compute_classification_default_target_type(self):
         assert torch.isfinite(scores).all()
 
 
+@pytest.mark.model_performance
 class TestConTeXTMatchModelTechSkillExtraction:
     """Test ConTeXTMatchModel performance on TECH skill extraction test set."""
 
diff --git a/tests/test_curriculum_encoder_model.py b/tests/test_models/test_curriculum_encoder_model.py
similarity index 97%
rename from tests/test_curriculum_encoder_model.py
rename to tests/test_models/test_curriculum_encoder_model.py
index 04e50c6..fb736ad 100644
--- a/tests/test_curriculum_encoder_model.py
+++ b/tests/test_models/test_curriculum_encoder_model.py
@@ -1,3 +1,5 @@
+"""Tests for CurriculumMatchModel: loading, usage, and benchmark validation."""
+
 import pytest
 
 from workrb.models.curriculum_encoder import CurriculumMatchModel
@@ -47,6 +49,7 @@ def test_model_classification_label_space(self):
         assert model.classification_label_space is None
 
 
+@pytest.mark.model_performance
 class TestCurriculumMatchModelBenchmark:
     """Test CurriculumMatchModel performance on skill extraction benchmarks."""
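The diff above exposes the benchmark suite through two entry points: the `poe test-benchmark` task and the manually triggered **Model Benchmarks** workflow. Below is a minimal usage sketch of both; the workflow file name `benchmark.yml` and the `test_filter` input come from the diff itself, while the `contextmatch` filter value is only an illustrative example and the `gh` commands assume the GitHub CLI is installed and authenticated:

```bash
# Run the full benchmark suite locally (tests marked model_performance)
uv run poe test-benchmark

# Narrow the run to one model with a pytest -k filter
uv run pytest -m 'model_performance' -k "contextmatch" -v

# Trigger the manual "Model Benchmarks" workflow from the command line
# (equivalent to Actions -> Model Benchmarks -> Run workflow in the UI)
gh workflow run benchmark.yml -f test_filter=contextmatch
```

Because the workflow reads `test_filter` from its `workflow_dispatch` inputs, leaving the field empty falls back to the full `uv run --no-sync poe test-benchmark` run, matching the local default.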