diff --git a/.github/workflows/ab_tests.yml b/.github/workflows/ab_tests.yml index 9865e0e08e..85261aa450 100644 --- a/.github/workflows/ab_tests.yml +++ b/.github/workflows/ab_tests.yml @@ -20,34 +20,44 @@ jobs: name: Discover A/B environments runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/setup-python@v4 + + - name: Install Python + uses: actions/setup-python@v4 with: python-version: '3.10' - - id: set-matrix + + - name: Install dependencies + run: pip install PyYaml + + - name: Generate dynamic matrix + id: set-matrix run: echo "::set-output name=matrix::$(python ci/scripts/discover_ab_environments.py)" + outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} + matrix: ${{ steps.set-matrix.outputs.matrix }} # Everything below this point runs iff there are files matching - # AB_environments/AB_*.conda.yaml - # AB_environments/AB_*.dask.yaml + # AB_environments/AB_*.{conda,dask}.yaml + # and AB_environments/config.yaml set repeat > 0 software: - name: Setup + name: Setup - ${{ matrix.runtime-version }} py${{ matrix.python-version }} runs-on: ubuntu-latest needs: discover_ab_envs - if: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + if: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} strategy: fail-fast: false matrix: python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -60,6 +70,13 @@ jobs: python-version: ${{ matrix.python-version }} environment-file: ci/environment.yml + - name: Create null hypothesis as a copy of baseline + if: matrix.runtime-version == 'AB_null_hypothesis' + run: | + cd AB_environments + cp AB_baseline.conda.yaml AB_null_hypothesis.conda.yaml + cp AB_baseline.dask.yaml AB_null_hypothesis.dask.yaml + - name: Build Coiled Software Environment env: DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} @@ -101,8 +118,8 @@ jobs: software_name.txt test_upstream.txt - runtime: - name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} + tests: + name: A/B Tests - ${{ matrix.category }} ${{ matrix.runtime-version }} ${{ matrix.os }} py${{ matrix.python-version }} needs: [discover_ab_envs, software] runs-on: ${{ matrix.os }} timeout-minutes: 120 @@ -111,10 +128,13 @@ jobs: matrix: os: [ubuntu-latest] python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + category: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).category }} + runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} + repeat: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).repeat }} steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -128,7 +148,6 @@ jobs: environment-file: ci/environment.yml - name: Download software environment assets - if: matrix.runtime-version == 'latest' || startsWith(matrix.runtime-version, 'AB_') uses: actions/download-artifact@v3 with: name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} @@ -145,140 +164,32 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - 
DB_NAME: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/runtime - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - benchmarks: - name: Benchmarks - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: [discover_ab_envs, software] - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - uses: actions/download-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run benchmarking tests - id: benchmarking_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/benchmarks - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - stability: - name: Stability - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: [discover_ab_envs, software] - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - if: matrix.runtime-version == 'latest' || startsWith(matrix.runtime-version, 'AB_') - uses: actions/download-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run stability tests - id: stability_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ 
secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + DB_NAME: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }}.db BENCHMARK: true CLUSTER_DUMP: true - run: bash ci/scripts/run_tests.sh tests/stability + run: bash ci/scripts/run_tests.sh tests/${{ matrix.category }} - name: Upload benchmark results uses: actions/upload-artifact@v3 if: always() with: - name: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + name: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }} + path: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }}.db cleanup: - needs: [discover_ab_envs, software, runtime, benchmarks, stability] - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + needs: [discover_ab_envs, software, tests] + if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} name: Cleanup runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v4 @@ -302,9 +213,9 @@ jobs: coiled env delete $SOFTWARE_NAME process-results: - needs: [discover_ab_envs, runtime, benchmarks, stability] + needs: [discover_ab_envs, tests] name: Combine separate benchmark results - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} runs-on: ubuntu-latest concurrency: # Fairly strict concurrency rule to avoid stepping on benchmark db. 
@@ -312,14 +223,17 @@ jobs: group: process-benchmarks cancel-in-progress: false steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - name: Install Python + uses: actions/setup-python@v4 - name: Install dependencies run: pip install alembic - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: path: benchmarks @@ -337,15 +251,17 @@ jobs: static-site: needs: [discover_ab_envs, process-results] # Always generate the site, as this can be skipped even if an indirect dependency fails (like a test run) - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} name: Build static dashboards runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: name: benchmark.db diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 307a615f1f..209b3f9e60 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,7 +23,7 @@ defaults: jobs: software: - name: Setup + name: Setup - py${{ matrix.python-version }} runs-on: ubuntu-latest strategy: fail-fast: false @@ -31,7 +31,8 @@ jobs: python-version: ["3.8", "3.9", "3.10"] steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -109,20 +110,58 @@ jobs: test_upstream.txt ab_baseline.txt - runtime: - name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} + tests: + name: Tests - ${{ matrix.category }} ${{ matrix.runtime-version }} ${{ matrix.os }} py${{ matrix.python-version }} needs: software runs-on: ${{ matrix.os }} timeout-minutes: 120 strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: [ubuntu-latest] python-version: ["3.9"] - runtime-version: ["latest", "0.0.4", "0.1.0"] + category: [runtime, benchmarks, stability] + runtime-version: [latest, "0.0.4", "0.1.0"] + include: + # Run stability tests on Python 3.8 + - category: stability + python-version: "3.8" + runtime-version: latest + os: ubuntu-latest + - category: stability + python-version: "3.8" + runtime-version: "0.0.4" + os: ubuntu-latest + - category: stability + python-version: "3.8" + runtime-version: "0.1.0" + os: ubuntu-latest + # Run stability tests on Python 3.10 + - category: stability + python-version: "3.10" + runtime-version: latest + os: ubuntu-latest + - category: stability + python-version: "3.10" + runtime-version: "0.0.4" + os: ubuntu-latest + - category: stability + python-version: "3.10" + runtime-version: "0.1.0" + os: ubuntu-latest + # Run stability tests on Python Windows and MacOS (latest py39 only) + - category: stability + python-version: "3.9" + runtime-version: latest + os: windows-latest + - category: stability + python-version: "3.9" + runtime-version: latest + os: macos-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -153,137 +192,20 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - 
run: bash ci/scripts/run_tests.sh tests/runtime - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - benchmarks: - name: Benchmarks - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: software - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.9"] - runtime-version: ["latest", "0.0.4", "0.1.0"] - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - if: matrix.runtime-version == 'latest' - uses: actions/download-artifact@v3 - with: - name: software-environment-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run benchmarking tests - id: benchmarking_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/benchmarks - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - stability: - name: Stability - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: software - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.8", "3.9", "3.10"] - runtime-version: ["latest", "0.0.4", "0.1.0"] - include: - - python-version: "3.9" - runtime-version: "latest" - os: "windows-latest" - - python-version: "3.9" - runtime-version: "latest" - os: "macos-latest" - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - if: matrix.runtime-version == 'latest' - uses: actions/download-artifact@v3 - with: - name: software-environment-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run stability tests - id: stability_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ 
secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + DB_NAME: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db BENCHMARK: true CLUSTER_DUMP: true - run: bash ci/scripts/run_tests.sh tests/stability + run: bash ci/scripts/run_tests.sh tests/${{ matrix.category }} - name: Upload benchmark results uses: actions/upload-artifact@v3 if: always() with: - name: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + name: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} + path: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db cleanup: - needs: [software, runtime, benchmarks, stability] + needs: [software, tests] if: always() name: Cleanup runs-on: ubuntu-latest @@ -292,7 +214,8 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10"] steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v4 @@ -316,7 +239,7 @@ jobs: coiled env delete $SOFTWARE_NAME process-results: - needs: [runtime, benchmarks, stability] + needs: tests name: Combine separate benchmark results if: always() && github.repository == 'coiled/coiled-runtime' runs-on: ubuntu-latest @@ -326,14 +249,17 @@ jobs: group: process-benchmarks cancel-in-progress: false steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - name: Install Python + uses: actions/setup-python@v4 - name: Install dependencies run: pip install alembic - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: path: benchmarks @@ -374,7 +300,8 @@ jobs: name: Detect regressions runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -406,15 +333,12 @@ jobs: report: name: report - needs: [runtime, benchmarks, stability, regressions] + needs: [tests, regressions] if: | always() && github.event_name != 'pull_request' && github.repository == 'coiled/coiled-runtime' - && (needs.runtime.result == 'failure' || - needs.benchmarks.result == 'failure' || - needs.stability.result == 'failure' || - needs.regressions.result == 'failure') + && (needs.tests.result == 'failure' || needs.regressions.result == 'failure') runs-on: ubuntu-latest defaults: @@ -437,7 +361,6 @@ jobs: labels: ["ci-failure"], }) - static-site: needs: process-results # Always generate the site, as this can be skipped even if an indirect dependency fails (like a test run) @@ -445,11 +368,13 @@ jobs: name: Build static dashboards runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/download-artifact@v3 + - name: Download tests database + uses: actions/download-artifact@v3 with: name: benchmark.db diff --git a/AB_environments/AB_baseline.conda.yaml.rename_me b/AB_environments/AB_baseline.conda.yaml similarity index 87% rename from AB_environments/AB_baseline.conda.yaml.rename_me rename to AB_environments/AB_baseline.conda.yaml index 8485352382..2beac715c6 100644 --- a/AB_environments/AB_baseline.conda.yaml.rename_me 
+++ b/AB_environments/AB_baseline.conda.yaml @@ -1,5 +1,5 @@ # Special environment file for A/B testing, used as the baseline environment. -# Change contents as needed and remove the .rename_me suffix. +# Change contents, but do not rename. channels: - conda-forge dependencies: @@ -14,6 +14,6 @@ dependencies: # - You can point to your own git fork instead # For example, if you want to test a PR before it's merged into main, you should # change this to the dask/dask and/or dask/distributed git tip - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 # - git+https://github.com/dask/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 diff --git a/AB_environments/AB_baseline.dask.yaml.rename_me b/AB_environments/AB_baseline.dask.yaml similarity index 68% rename from AB_environments/AB_baseline.dask.yaml.rename_me rename to AB_environments/AB_baseline.dask.yaml index 8c296301be..cd1d2e38d3 100644 --- a/AB_environments/AB_baseline.dask.yaml.rename_me +++ b/AB_environments/AB_baseline.dask.yaml @@ -1,3 +1,3 @@ # Special environment file for A/B testing, used as the baseline environment. -# Change contents as needed and remove the .rename_me suffix. +# Change contents, but do not rename. # Leave empty if you don't want to override anything. diff --git a/AB_environments/AB_sample.conda.yaml.rename_me b/AB_environments/AB_sample.conda.yaml similarity index 68% rename from AB_environments/AB_sample.conda.yaml.rename_me rename to AB_environments/AB_sample.conda.yaml index 87b6409f3f..46dbb07913 100644 --- a/AB_environments/AB_sample.conda.yaml.rename_me +++ b/AB_environments/AB_sample.conda.yaml @@ -10,5 +10,6 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - git+https://github.com/dask/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 + - dask==2022.9.0 + # - distributed==2022.9.0 + - git+https://github.com/dask/distributed@1fd07f03cacee6fde81d13282568a727bce789b9 diff --git a/AB_environments/AB_sample.dask.yaml.rename_me b/AB_environments/AB_sample.dask.yaml similarity index 100% rename from AB_environments/AB_sample.dask.yaml.rename_me rename to AB_environments/AB_sample.dask.yaml diff --git a/AB_environments/README.md b/AB_environments/README.md index ddc056c2a2..a647c7be85 100644 --- a/AB_environments/README.md +++ b/AB_environments/README.md @@ -34,8 +34,8 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 ``` In this example it's using `coiled-runtime` as a base, but it doesn't have to. If you do use `coiled-runtime` though, you must install any conflicting packages with pip; in the @@ -47,8 +47,8 @@ arbitrary forks, e.g. ```yaml - pip: - - dask==2022.8.1 - - git+https://github.com/yourname/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 + - dask==2022.9.0 + - git+https://github.com/yourname/distributed@1fd07f03cacee6fde81d13282568a727bce789b9 ``` The second file in each pair is a dask config file. If you don't want to change the config, you must create an empty file. @@ -66,8 +66,32 @@ If you create *any* files in `AB_environments/`, you *must* create the baseline - `AB_baseline.conda.yaml` - `AB_baseline.dask.yaml` -#### Complete example -We want to test the impact of disabling work stealing. We create 4 files: +### 4. Tweak configuration file +Open `AB_environments/config.yaml` and set the `repeat` setting to a number higher than 0. +This enables the A/B tests. 
+Setting a low number of repeated runs is faster and cheaper, but will result in higher +variance. + +`repeat` must remain set to 0 in the main branch, thus completely disabling +A/B tests, in order to avoid unnecessary runs. + +In the same file, you can also set the `test_null_hypothesis` flag to true to +automatically create a verbatim copy of AB_baseline and then compare the two in the A/B +tests. Set it to false to save some money if you are already confident that the 'repeat' +setting is high enough. + +Finally, the file offers a `categories` list. These are the subdirectories of `tests/` +that you wish to run. + +### 5. (optional) Tweak tests +Nothing prevents you from changing the tests themselves. + +For example, you may be interested in a single test, but you don't want to run its +whole category; all you need to do is open the test files and delete what you don't care +about. + +### Complete example +You want to test the impact of disabling work stealing. You'll create at least 4 files: - `AB_environments/AB_baseline.conda.yaml`: ```yaml @@ -77,8 +101,8 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 ``` - `AB_environments/AB_baseline.dask.yaml`: (empty file) - `AB_environments/AB_no_steal.conda.yaml`: (same as baseline) - `AB_environments/AB_no_steal.dask.yaml`: ```yaml distributed: scheduler: work-stealing: False ``` -### 4. Run CI -- `git push`. Note: we are *not* creating a PR. +- `AB_environments/config.yaml`: +```yaml +repeat: 5 +test_null_hypothesis: true +categories: + - runtime + - benchmarks + - stability +``` + +### 6. Run CI +- `git push`. Note: you should *not* open a Pull Request. - Open https://github.com/coiled/coiled-runtime/actions/workflows/ab_tests.yml and wait for the run to complete. - Open the run from the link above. In the Summary tab, scroll down and download the @@ -98,9 +132,11 @@ distributed: Note: artifacts will appear only after the run is complete. - Decompress `static-dashboard.zip` and open `index.html` in your browser. -### 5. Clean up + +### 7. Clean up Remember to delete the branch once you're done. + ### Troubleshooting #### Problem: diff --git a/AB_environments/config.yaml b/AB_environments/config.yaml new file mode 100644 index 0000000000..9c1e3f011c --- /dev/null +++ b/AB_environments/config.yaml @@ -0,0 +1,16 @@ +# Number of times to run each test suite. +# Lower values are faster and cheaper but will result in higher variance. +# This must remain set to 0 in the main branch, thus completely disabling +# A/B tests, in order to avoid unnecessary runs. +repeat: 0 + +# Set to true to automatically create a verbatim copy of AB_baseline and then compare +# the two in the A/B tests. Set to false to save some money if you are already confident +# that the 'repeat' setting is high enough. +test_null_hypothesis: true + +# Test categories to run. These are subdirectories of tests/. 
+categories: + - runtime + - benchmarks + - stability diff --git a/ci/scripts/discover_ab_environments.py b/ci/scripts/discover_ab_environments.py index 60db39bf9a..f13610338d 100644 --- a/ci/scripts/discover_ab_environments.py +++ b/ci/scripts/discover_ab_environments.py @@ -1,22 +1,47 @@ +from __future__ import annotations + import glob import json import os.path +import yaml + -def main(): - envs = [] +def build_json() -> dict[str, list[int]]: + with open("AB_environments/config.yaml") as fh: + cfg = yaml.safe_load(fh) + if not isinstance(cfg.get("repeat"), int) or cfg["repeat"] < 0: + raise ValueError("AB_environments/config.yaml: missing key {repeat: N}") + if not cfg["repeat"]: + return {"repeat": [], "runtime": [], "category": []} + + runtimes = [] for conda_fname in sorted(glob.glob("AB_environments/AB_*.conda.yaml")): env_name = os.path.basename(conda_fname)[: -len(".conda.yaml")] dask_fname = f"AB_environments/{env_name}.dask.yaml" # Raise FileNotFoundError if missing open(dask_fname).close() - envs.append(env_name) + runtimes.append(env_name) + + if not runtimes: + return {"repeat": [], "runtime": [], "category": []} - if envs and "AB_baseline" not in envs: + if "AB_baseline" not in runtimes: # If any A/B environments are defined, AB_baseline is required raise FileNotFoundError("AB_environments/AB_baseline.conda.yaml") - print(json.dumps(envs)) + if cfg["test_null_hypothesis"]: + runtimes += ["AB_null_hypothesis"] + + return { + "repeat": list(range(1, cfg["repeat"] + 1)), + "runtime": runtimes, + "category": cfg["categories"], + } + + +def main() -> None: + print(json.dumps(build_json())) if __name__ == "__main__": diff --git a/dashboard.py b/dashboard.py index ff819dee91..5fcd3a7424 100644 --- a/dashboard.py +++ b/dashboard.py @@ -4,15 +4,19 @@ import glob import importlib import inspect +import operator import pathlib -from typing import Literal, NamedTuple +from collections.abc import Callable +from typing import Any, Literal, NamedTuple import altair +import numpy import pandas import panel import sqlalchemy from bokeh.resources import INLINE +altair.data_transformers.enable("default", max_rows=None) panel.extension("vega") @@ -53,121 +57,222 @@ def load_test_source() -> None: print(f"Discovered {len(source)} tests") -def align_to_baseline(df: pandas.DataFrame, baseline: str) -> pandas.DataFrame | None: - """Add columns +def calc_ab_confidence_intervals( + df: pandas.DataFrame, field_name: str, A: str, B: str +) -> pandas.DataFrame: + """Calculate p(B / A - 1) > x and p(B / A - 1) < -x for discrete x, where A and B + are runtimes, for all tests in df. + + Algorithm + --------- + https://towardsdatascience.com/a-practical-guide-to-a-b-tests-in-python-66666f5c3b02 + + Returns + ------- + DataFrame: + + fullname + Test name with category, e.g. bencharks/test_foo.py::test_123[1] + fullname_no_category + Test name without category, e.g. test_foo.py::test_123[1] + x + Confidence interval [-0.5, 0.5]. Note that element 0 will be repeated. 
+ xlabel + "<-{p*100}% | x < 0 + ">{p*100}% | x > 0 + p + p(B/A-1) < x | x < 0 + p(B/A-1) > x | x > 0 + color + 0 if p=1 and x < 0 + 0.5 if p=0 + 1 if p=1 and x > 0 + plus all shades in between + """ - - duration_baseline - - average_memory_baseline - - peak_memory_baseline - - duration_delta (A/B - 1) - - average_memory_delta (A/B - 1) - - peak_memory_delta (A/B - 1) + def bootstrap_mean(df_i: pandas.DataFrame) -> pandas.DataFrame: + boot = df_i[field_name].sample(frac=10_000, replace=True).to_frame() + boot["i"] = pandas.RangeIndex(boot.shape[0]) // df_i.shape[0] + out = boot.groupby("i").mean().reset_index()[[field_name]] + assert out.shape == (10_000, 1) + out.index.name = "bootstrap_run" + return out + + # DataFrame with 20,000 rows per test exactly, with columns + # [fullname, fullname_no_category, runtime, bootstrap_run, {field_name}] + bootstrapped = ( + df.groupby(["fullname", "fullname_no_category", "runtime"]) + .apply(bootstrap_mean) + .reset_index() + ) - Baseline values are from the matching rows given the same test name and the baseline - runtime. Note that this means that df is expected to have exactly 1 test in the - baseline runtime for each test in every other runtime. - """ - df_baseline = df[df["runtime"] == baseline] - - if df_baseline.empty: - # Typically a misspelling. However, this can legitimately happen in CI if all - # three jobs of the baseline runtime failed early. - print( - f"Baseline runtime {baseline!r} not found; valid choices are:", - ", ".join(df["runtime"].unique()), + # DataFrame with 10,000 rows per test exactly, with columns + # [fullname, fullname_no_category, bootstrap_run, {A}, {B}, diff] + pivot = bootstrapped.pivot( + ["fullname", "fullname_no_category", "bootstrap_run"], + "runtime", + field_name, + ).reset_index() + pivot["diff"] = pivot[B] / pivot[A] - 1 + + def confidence( + df_i: pandas.DataFrame, + x: numpy.ndarray, + op: Literal["<", ">"], + cmp: Callable[[Any, Any], bool], + color_factor: float, + ) -> pandas.DataFrame: + xlabel = [f"{op}{xi * 100:.0f}%" for xi in x] + p = (cmp(df_i["diff"].values.reshape([-1, 1]), x)).sum(axis=0) / df_i.shape[0] + color = color_factor * p / 2 + 0.5 + return pandas.DataFrame({"x": x, "xlabel": xlabel, "p": p, "color": color}) + + pivot_groups = pivot.groupby(["fullname", "fullname_no_category"])[["diff"]] + x_neg = numpy.linspace(-0.8, 0, 17) + x_pos = numpy.linspace(0, 0.8, 17) + conf_neg, conf_pos = [ + # DataFrame with 1 row per element of x_neg/x_pos and columns + # [fullname, fullname_no_category, x, xlabel, p, color] + ( + pivot_groups.apply(confidence, p, op, cmp, color_factor) + .reset_index() + .drop("level_2", axis=1) ) - return None - - baseline_names = df_baseline["fullname"].unique() - all_names = df["fullname"].unique() - - assert len(baseline_names) == df_baseline.shape[0] - if len(baseline_names) < len(all_names): - # This will happen in CI if one or two out of three jobs of the baseline failed. - # Note that df contains the latest run only. It means that tests on all runtimes - # (including historical ones) should be from the coiled-runtime git tip, so - # adding or removing tests should not cause a mismatch. 
- print( - f"Baseline runtime {baseline!r} is missing some tests:", - ", ".join(set(all_names) - set(baseline_names)), + for (p, op, cmp, color_factor) in ( + (x_neg, "<", operator.lt, -1), + (x_pos, ">", operator.gt, 1), ) - return None - - columns = [spec.field_name for spec in SPECS] - df_baseline = ( - df_baseline.set_index("fullname") - .loc[df["fullname"], columns] - .rename(columns={k: k + "_baseline" for k in columns}) - ) - df_baseline.index = df.index - df = pandas.concat([df, df_baseline], axis=1) - for column in columns: - df[column + "_delta"] = (df[column] / df[column + "_baseline"] - 1) * 100 - return df + ] + return pandas.concat([conf_neg, conf_pos], axis=0) def make_barchart( df: pandas.DataFrame, spec: ChartSpec, title: str, - baseline: str | None, -) -> altair.Chart | None: +) -> tuple[altair.Chart | None, int]: """Make a single Altair barchart for a given test or runtime""" df = df.dropna(subset=[spec.field_name, "start"]) if not len(df): # Some tests do not have average_memory or peak_memory measures, only runtime - return None + return None, 0 - fields = [ - spec.field_name, - "fullname", - "fullname_no_category", - "dask_version", - "distributed_version", - "runtime", + df = df[ + [ + spec.field_name, + "fullname", + "fullname_no_category", + "dask_version", + "distributed_version", + "runtime", + ] ] - height = max(df.shape[0] * 20 + 50, 90) tooltip = [ altair.Tooltip("fullname:N", title="Test"), + altair.Tooltip("runtime:N", title="Runtime"), altair.Tooltip("dask_version:N", title="Dask"), altair.Tooltip("distributed_version:N", title="Distributed"), - altair.Tooltip(f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}"), + altair.Tooltip(f"count({spec.field_name}):N", title="Number of runs"), + altair.Tooltip(f"stdev({spec.field_name}):Q", title=f"std dev {spec.unit}"), + altair.Tooltip(f"min({spec.field_name}):Q", title=f"min {spec.unit}"), + altair.Tooltip(f"median({spec.field_name}):Q", title=f"median {spec.unit}"), + altair.Tooltip(f"mean({spec.field_name}):Q", title=f"mean {spec.unit}"), + altair.Tooltip(f"max({spec.field_name}):Q", title=f"max {spec.unit}"), ] by_test = len(df["fullname"].unique()) == 1 if by_test: df = df.sort_values("runtime", key=runtime_sort_key_pd) y = altair.Y("runtime", title="Runtime", sort=None) + n_bars = df["runtime"].unique().size else: y = altair.Y("fullname_no_category", title="Test name") + n_bars = df["fullname_no_category"].unique().size - if baseline: - fields += [ - f"{spec.field_name}_delta", - f"{spec.field_name}_baseline", - ] - x = altair.X( - f"{spec.field_name}_delta", - title=f"{spec.field_desc} (delta % from {baseline})", - ) - tooltip += [ - altair.Tooltip( - f"{spec.field_name}_baseline:Q", title=f"{baseline} {spec.unit}" + height = max(n_bars * 20 + 50, 90) + + bars = ( + altair.Chart(width=800, height=height) + .mark_bar() + .encode( + x=altair.X( + f"median({spec.field_name}):Q", title=f"{spec.field_desc} {spec.unit}" ), - altair.Tooltip(f"{spec.field_name}_delta:Q", title="Delta %"), + y=y, + tooltip=tooltip, + ) + ) + ticks = ( + altair.Chart() + .mark_tick(color="black") + .encode(x=f"mean({spec.field_name})", y=y) + ) + error_bars = ( + altair.Chart().mark_errorbar(extent="stdev").encode(x=spec.field_name, y=y) + ) + chart = ( + altair.layer(bars, ticks, error_bars, data=df) + .properties(title=title) + .configure(autosize="fit") + ) + + return chart, height + + +def make_ab_confidence_map( + df: pandas.DataFrame, + spec: ChartSpec, + title: str, + baseline: str, +) -> tuple[altair.Chart | None, 
int]: + """Make a single Altair heatmap of p(B/A - 1) confidence intervals, where B is the + examined runtime and A is the baseline, for all tests for a given measure. + """ + df = df.dropna(subset=[spec.field_name, "start"]) + if not len(df): + # Some tests do not have average_memory or peak_memory measures, only runtime + return None, 0 + + df = df[ + [ + spec.field_name, + "fullname", + "fullname_no_category", + "runtime", ] - else: - x = altair.X(spec.field_name, title=f"{spec.field_desc} {spec.unit}") + ] + runtimes = df["runtime"].unique() + A = baseline + B = next(r for r in runtimes if r != baseline) + conf = calc_ab_confidence_intervals(df, spec.field_name, A, B) - return ( - altair.Chart(df[fields], width=800, height=height) - .mark_bar() - .encode(x=x, y=y, tooltip=tooltip) + n_bars = df["fullname_no_category"].unique().size + height = max(n_bars * 20 + 50, 90) + + chart = ( + altair.Chart(conf, width=800, height=height) + .mark_rect() + .encode( + x=altair.X("xlabel:O", title="confidence threshold (B/A - 1)", sort=None), + y=altair.Y("fullname_no_category:O", title="Test"), + color=altair.Color( + "color:Q", + scale=altair.Scale(scheme="redblue", domain=[0, 1], reverse=True), + legend=None, + ), + tooltip=[ + altair.Tooltip("fullname:O", title="Test Name"), + altair.Tooltip("xlabel:O", title="Confidence threshold"), + altair.Tooltip("p:Q", format=".2p", title="p(B/A-1) exceeds threshold"), + ], + ) .properties(title=title) .configure(autosize="fit") ) + return chart, height + def make_timeseries( df: pandas.DataFrame, spec: ChartSpec, title: str @@ -229,7 +334,7 @@ def make_timeseries( def make_test_report( df: pandas.DataFrame, - kind: Literal["barchart" | "timeseries"], + kind: Literal["barchart" | "timeseries" | "A/B"], title: str, sourcename: str | None = None, baseline: str | None = None, @@ -240,17 +345,19 @@ def make_test_report( if kind == "timeseries": assert not baseline chart = make_timeseries(df, spec, title) + height = 384 + elif kind == "barchart": + assert not baseline + chart, height = make_barchart(df, spec, title) + elif kind == "A/B": + assert baseline + chart, height = make_ab_confidence_map(df, spec, title, baseline=baseline) else: - chart = make_barchart(df, spec, title, baseline) + raise ValueError(kind) # pragma: nocover if not chart: continue tabs.append((spec.field_desc, chart)) - if kind == "timeseries": - height = 384 - else: - height = max(df.shape[0] * 20 + 50, 90) - if sourcename in source: code = panel.pane.Markdown( f"```python\n{source[sourcename]}\n```", @@ -281,10 +388,8 @@ def make_timeseries_html_report( categories = sorted(df[df.runtime == runtime].category.unique()) tabs = [] for category in categories: - df_by_test = ( - df[(df.runtime == runtime) & (df.category == category)] - .sort_values("sourcename") - .groupby("sourcename") + df_by_test = df[(df.runtime == runtime) & (df.category == category)].groupby( + "sourcename" ) panes = [ make_test_report( @@ -302,29 +407,22 @@ def make_timeseries_html_report( doc.save(out_fname, title=runtime, resources=INLINE) -def make_ab_html_report( +def make_barchart_html_report( df: pandas.DataFrame, output_dir: pathlib.Path, by_test: bool, - baseline: str | None, ) -> None: - """Generate HTML report for the latest CI run, comparing all runtimes (e.g. - coiled-upstream-py3.9) against a baseline runtime + """Generate HTML report containing bar charts showing statistical information + (mean, median, etc). Create one tab for each test category (e.g. 
benchmarks, runtime, stability), one graph for each runtime and one bar for each test OR one graph for each test and one bar for each runtime, and one graph tab for each measure (wall clock, average memory, peak memory). - - If a baseline runtime is defined, all measures are expressed relative to the - baseline; otherwise they're expressed in absolute terms. """ out_fname = str( output_dir.joinpath( - "AB_by_" - + ("test" if by_test else "runtime") - + (f"_vs_{baseline}" if baseline else "") - + ".html" + "barcharts_by_" + ("test" if by_test else "runtime") + ".html" ) ) print(f"Generating {out_fname}") @@ -333,36 +431,25 @@ def make_ab_html_report( tabs = [] for category in categories: if by_test: - df_by_test = ( - df[df.category == category] - .sort_values(["sourcename", "fullname"]) - .groupby(["sourcename", "fullname"]) - ) + df_by_test = df[df.category == category].groupby(["sourcename", "fullname"]) panes = [ make_test_report( df_by_test.get_group((sourcename, fullname)), kind="barchart", title=fullname, sourcename=sourcename, - baseline=baseline, ) for sourcename, fullname in df_by_test.groups ] else: - df_by_runtime = ( - df[df.category == category] - .sort_values("runtime", key=runtime_sort_key_pd) - .groupby("runtime") - ) + df_by_runtime = df[df.category == category].groupby("runtime") panes = [ make_test_report( df_by_runtime.get_group(runtime), kind="barchart", title=runtime, - baseline=baseline, ) for runtime in sorted(df_by_runtime.groups, key=runtime_sort_key) - if runtime != baseline ] flex = panel.FlexBox(*panes, align_items="start", justify_content="start") tabs.append((category.title(), flex)) @@ -370,11 +457,69 @@ def make_ab_html_report( doc.save( out_fname, - title="A/B by " - + ("test" if by_test else "runtime") - + (f" vs. {baseline}" if baseline else ""), + title="Bar charts by " + ("test" if by_test else "runtime"), + resources=INLINE, + ) + + +def make_ab_html_report( + df: pandas.DataFrame, + output_dir: pathlib.Path, + baseline: str, +) -> bool: + """Generate HTML report containing heat maps for confidence intervals relative to + a baseline runtime, e.g. p(B/A-1) > 10% + + Create one tab for each test category (e.g. benchmarks, runtime, stability), one + graph for each runtime, and one graph tab for each measure (wall clock, average + memory, peak memory). + + Returns + ------- + True if the report was generated; False otherwise + """ + out_fname = str(output_dir.joinpath(f"AB_vs_{baseline}.html")) + print(f"Generating {out_fname}") + + categories = sorted(df.category.unique()) + tabs = [] + for category in categories: + df_by_runtime = df[df.category == category].groupby("runtime") + if baseline not in df_by_runtime.groups: + # Typically a misspelling. However, this can legitimately happen in CI if + # all three jobs of the baseline runtime failed early. + print( + f"Baseline runtime {baseline!r} not found; valid choices are:", + ", ".join(df["runtime"].unique()), + ) + return False + + panes = [ + make_test_report( + pandas.concat( + [ + df_by_runtime.get_group(runtime), + df_by_runtime.get_group(baseline), + ], + axis=0, + ), + kind="A/B", + title=runtime, + baseline=baseline, + ) + for runtime in sorted(df_by_runtime.groups, key=runtime_sort_key) + if runtime != baseline + ] + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save( + out_fname, + title="A/B confidence intervals vs. 
" + baseline, resources=INLINE, ) + return True def make_index_html_report( @@ -385,12 +530,12 @@ def make_index_html_report( index_txt += "### Historical timeseries\n" for runtime in runtimes: index_txt += f"- [{runtime}](./{runtime}.html)\n" - index_txt += "\n\n### A/B tests\n" - index_txt += "- [by test](./AB_by_test.html)\n" - index_txt += "- [by runtime](./AB_by_runtime.html)\n" + index_txt += "\n\n### Statistical analysis\n" + index_txt += "- [Bar charts, by test](./barcharts_by_test.html)\n" + index_txt += "- [Bar charts, by runtime](./barcharts_by_runtime.html)\n" for baseline in baselines: index_txt += ( - f"- [by runtime vs. {baseline}](./AB_by_runtime_vs_{baseline}.html)\n" + f"- [A/B confidence intervals vs. {baseline}](./AB_vs_{baseline}.html)\n" ) index = panel.pane.Markdown(index_txt, width=800) @@ -503,24 +648,17 @@ def main() -> None: for runtime in runtimes: make_timeseries_html_report(df, output_dir, runtime) - # Select only the latest run for each runtime. This may pick up historical runs (up - # to 6h old) if they have not been rerun in the current pull/PR. - # TODO This is fragile. Keep the latest and historical databases separate, or record - # the coiled-runtime git hash and use it to filter? - max_end = df.sort_values("end").groupby(["runtime", "category"]).tail(1) - max_end = max_end[max_end["end"] > max_end["end"].max() - pandas.Timedelta("6h")] - session_ids = max_end["session_id"].unique() - latest_run = df[df["session_id"].isin(session_ids)] - - make_ab_html_report(latest_run, output_dir, by_test=True, baseline=None) - make_ab_html_report(latest_run, output_dir, by_test=False, baseline=None) + # Do not use data that is more than a week old in statistical analysis + df_recent = df[df["end"] > df["end"].max() - pandas.Timedelta("7d")] + + make_barchart_html_report(df_recent, output_dir, by_test=True) + make_barchart_html_report(df_recent, output_dir, by_test=False) + baselines = [] for baseline in args.baseline: - df_baseline = align_to_baseline(latest_run, baseline) - if df_baseline is None: - continue - baselines.append(baseline) - make_ab_html_report(df_baseline, output_dir, by_test=False, baseline=baseline) + has_baseline = make_ab_html_report(df_recent, output_dir, baseline) + if has_baseline: + baselines.append(baseline) make_index_html_report(output_dir, runtimes, baselines)