From 4e3fae0f99084414dadceea1f4f0e3cb1564f939 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Wed, 6 May 2026 09:55:45 +0200
Subject: [PATCH 01/16] Add system-metrics diagnostics and dashboard tab (initial suggestion)

---
 pyproject.toml                                |   3 +-
 resources/report/dashboard/script.js          |  66 +++++++
 .../report/dashboard/template.html.jinja2     |  24 +++
 src/diagnostics/__init__.py                   | 162 ++++++++++++++++++
 tests/unit/test_parse_inference_logs.py       |  81 +++++++++
 workflow/rules/report.smk                     |  63 +++++++
 workflow/scripts/parse_inference_logs.py      |  47 +++++
 .../scripts/report_experiment_dashboard.py    |  25 +++
 8 files changed, 470 insertions(+), 1 deletion(-)
 create mode 100644 src/diagnostics/__init__.py
 create mode 100644 tests/unit/test_parse_inference_logs.py
 create mode 100644 workflow/scripts/parse_inference_logs.py

diff --git a/pyproject.toml b/pyproject.toml
index f754384d..66214d57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,7 +57,8 @@ markers = [
 packages = [
   "src/evalml",
   "src/verification",
-  "src/data_input"
+  "src/data_input",
+  "src/diagnostics"
 ]
 
 [tool.uv.sources]
diff --git a/resources/report/dashboard/script.js b/resources/report/dashboard/script.js
index 9eed0ba3..2561d7bf 100644
--- a/resources/report/dashboard/script.js
+++ b/resources/report/dashboard/script.js
@@ -185,3 +185,69 @@ function updateChart() {
 
 // Initial chart
 updateChart()
+
+
+// ---- System metrics tab ----
+
+const sysDataEl = document.getElementById("sysmetrics-data");
+const sysData = sysDataEl ? JSON.parse(sysDataEl.textContent) : [];
+
+if (sysData.length > 0) {
+  choicesInstances["sys-source-select"] = new Choices("#sys-source-select", {
+    searchEnabled: false,
+    removeItemButton: true,
+    shouldSort: false,
+    itemSelectText: "",
+    placeholder: false,
+  });
+  document.getElementById("sys-source-select").addEventListener("change", updateSysChart);
+
+  const sysSpec = {
+    "data": { "values": sysData },
+    "facet": {
+      "column": { "field": "metric", "type": "nominal", "title": null },
+    },
+    "resolve": { "scale": { "y": "independent" } },
+    "spec": {
+      "width": 220,
+      "height": 220,
+      "layer": [
+        {
+          "mark": { "type": "bar", "opacity": 0.7 },
+          "encoding": {
+            "x": { "field": "source", "type": "nominal", "axis": { "labelAngle": -30, "title": null } },
+            "y": { "field": "value", "aggregate": "mean", "type": "quantitative", "title": "mean" },
+            "color": { "field": "source", "type": "nominal", "legend": { "orient": "top", "title": "Source" } },
+          },
+        },
+        {
+          "mark": { "type": "point", "filled": true, "size": 40, "opacity": 0.9 },
+          "encoding": {
+            "x": { "field": "source", "type": "nominal" },
+            "y": { "field": "value", "type": "quantitative" },
+            "color": { "field": "source", "type": "nominal" },
+            "tooltip": [
+              { "field": "source", "type": "nominal", "title": "Source" },
+              { "field": "init_time", "type": "nominal", "title": "Init time" },
+              { "field": "metric", "type": "nominal", "title": "Metric" },
+              { "field": "value", "type": "quantitative", "title": "Value", "format": ".3f" },
+              { "field": "n_gpu", "type": "quantitative", "title": "GPUs" },
+              { "field": "job_id", "type": "nominal", "title": "Job ID" },
+            ],
+          },
+        },
+      ],
+    },
+  };
+
+  function updateSysChart() {
+    const selectedSources = getSelectedValues("sys-source-select");
+    const newSpec = JSON.parse(JSON.stringify(sysSpec));
+    if (selectedSources.length > 0) {
+      newSpec.transform = [{ filter: { field: "source", oneOf: selectedSources } }];
+    }
+    vegaEmbed("#sys-vis", newSpec, { actions: false });
+  }
+
+  updateSysChart();
+}
diff --git a/resources/report/dashboard/template.html.jinja2 b/resources/report/dashboard/template.html.jinja2
index 78404974..d0ba035c 100644
--- a/resources/report/dashboard/template.html.jinja2
+++ b/resources/report/dashboard/template.html.jinja2
@@ -95,6 +95,9 @@
     <div class="header">
         <div class="tab-buttons">
             <button class="tab-link active" data-tab="tab_scores">Standard verification</button>
+            {% if sysmetrics_sources %}
+            <button class="tab-link" data-tab="tab_sysmetrics">System metrics</button>
+            {% endif %}
             <button class="tab-link" data-tab="tab_config">Config</button>
         </div>
     </div>
@@ -155,6 +158,23 @@
     </div>
 
 
+    <!-- System metrics tab -->
+    {% if sysmetrics_sources %}
+    <div id="tab_sysmetrics" class="tab-content">
+        <div class="controls">
+            <div class="control-group">
+                <label>Source(s)</label>
+                <select id="sys-source-select" multiple>
+                    {% for source in sysmetrics_sources %}
+                    <option value="{{source}}" selected>{{source}}</option>
+                    {% endfor %}
+                </select>
+            </div>
+        </div>
+        <div id="sys-vis"></div>
+    </div>
+    {% endif %}
+
     <!-- Config file tab -->
     <div id="tab_config" class="tab-content">
         <pre style="background:#f7f7f7; border:1px solid #ddd; padding:1em; overflow-x:auto; max-height:60vh;">
@@ -163,6 +183,10 @@
     </div>
 
 
+    <script id="sysmetrics-data" type="application/json">
+        {{ sysmetrics_data | safe | indent(8, false) }}
+    </script>
+
     <script id="verif-data" type="application/json">
         {{ verif_data | safe | indent(8, false)}}
     </script>
diff --git a/src/diagnostics/__init__.py b/src/diagnostics/__init__.py
new file mode 100644
index 00000000..f9720b3c
--- /dev/null
+++ b/src/diagnostics/__init__.py
@@ -0,0 +1,162 @@
+"""System-performance diagnostics for inference jobs."""
+
+import logging
+import re
+from datetime import datetime
+from pathlib import Path
+
+LOG = logging.getLogger(__name__)
+
+_TIMESTAMP = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")
+_JOB_ID = re.compile(r"srun: job (\d+) queued")
+_CHECKPOINT_SIZE = re.compile(r"Checkpoint size: ([\d.]+) GiB")
+_N_STEPS = re.compile(r"Forecasting (\d+) steps")
+_STEP_TIME = re.compile(r"Forecast\. Model call \d+:.+?: (\d+) seconds\.")
+
+# Human-readable names for wide-format columns used in the dashboard
+SYSMETRICS_COLS = {
+    "wall_time_s": "Wall Time (s)",
+    "gpu_hours": "GPU Hours",
+    "mean_step_time_s": "Mean Step Time (s)",
+    "max_step_time_s": "Max Step Time (s)",
+    "checkpoint_size_gib": "Checkpoint Size (GiB)",
+    "n_steps": "No. Steps",
+}
+
+
+def parse_single_log(log_path: str) -> dict:
+    """Return raw metric values extracted from one inference log file."""
+    job_id = None
+    first_ts = last_ts = None
+    checkpoint_gib = None
+    n_steps = None
+    step_times: list[int] = []
+
+    with open(log_path) as fh:
+        for line in fh:
+            if job_id is None:
+                m = _JOB_ID.search(line)
+                if m:
+                    job_id = m.group(1)
+
+            m = _TIMESTAMP.match(line)
+            if m:
+                ts = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
+                if first_ts is None:
+                    first_ts = ts
+                last_ts = ts
+
+            if checkpoint_gib is None:
+                m = _CHECKPOINT_SIZE.search(line)
+                if m:
+                    checkpoint_gib = float(m.group(1))
+
+            if n_steps is None:
+                m = _N_STEPS.search(line)
+                if m:
+                    n_steps = int(m.group(1))
+
+            m = _STEP_TIME.search(line)
+            if m:
+                step_times.append(int(m.group(1)))
+
+    wall_time_s = (
+        (last_ts - first_ts).total_seconds()
+        if first_ts is not None and last_ts is not None
+        else None
+    )
+    return {
+        "job_id": job_id,
+        "wall_time_s": wall_time_s,
+        "n_steps": n_steps if n_steps is not None else len(step_times),
+        "mean_step_time_s": (
+            round(sum(step_times) / len(step_times), 2) if step_times else None
+        ),
+        "max_step_time_s": max(step_times) if step_times else None,
+        "checkpoint_size_gib": checkpoint_gib,
+    }
+
+
+def parse_logs(
+    log_files: list[str],
+    label_map: dict[str, str],
+    gpu_map: dict[str, int],
+    log_dir: str,
+) -> list[dict]:
+    """Parse inference log files and return one record per (run, init_time).
+
+    Parameters
+    ----------
+    log_files : paths to .log files to parse.
+    label_map : {run_id: human-readable label} — supplied by Snakemake rule params.
+    gpu_map   : {run_id: n_gpu} — GPU count used for each run.
+    log_dir   : root of the inference_execute logs directory; used to derive run_id.
+    """
+    log_dir_path = Path(log_dir)
+    records: list[dict] = []
+
+    for log_file in log_files:
+        log_path = Path(log_file)
+        if not log_path.exists():
+            LOG.warning("Log file not found, skipping: %s", log_file)
+            continue
+
+        # Derive run_id and init_time from the path.
+        # Relative path structure: "{run_id}-{init_time}.log"
+        # init_time is always 12 digits (YYYYMMDDHHMM).
+        try:
+            stem = str(log_path.relative_to(log_dir_path).with_suffix(""))
+            init_time_str = stem[-12:]
+            run_id = stem[:-13]  # strip trailing "-YYYYMMDDHHMM"
+        except Exception:
+            LOG.warning("Cannot derive run_id from path, skipping: %s", log_file)
+            continue
+
+        label = label_map.get(run_id, run_id)
+        n_gpu = int(gpu_map.get(run_id, 1))
+
+        try:
+            raw = parse_single_log(str(log_path))
+        except Exception as exc:
+            LOG.warning("Failed to parse %s: %s", log_file, exc)
+            continue
+
+        wall_s = raw.get("wall_time_s")
+        gpu_hours = round(wall_s / 3600 * n_gpu, 4) if wall_s is not None else None
+
+        try:
+            init_iso = datetime.strptime(init_time_str, "%Y%m%d%H%M").isoformat()
+        except ValueError:
+            init_iso = init_time_str
+
+        records.append(
+            {
+                "source": label,
+                "run_id": run_id,
+                "init_time": init_iso,
+                "n_gpu": n_gpu,
+                "gpu_hours": gpu_hours,
+                **raw,
+            }
+        )
+
+    LOG.info("Parsed %d log files → %d records", len(log_files), len(records))
+    return records
+
+
+def melt_for_dashboard(records: list[dict]) -> tuple[str, list[str]]:
+    """Convert wide-format system metrics records to long format for Vega-Lite.
+
+    Returns (json_string, sorted_source_list).
+    """
+    import json
+
+    long_records = []
+    for r in records:
+        base = {k: r.get(k) for k in ("source", "init_time", "n_gpu", "job_id")}
+        for col, label in SYSMETRICS_COLS.items():
+            if r.get(col) is not None:
+                long_records.append({**base, "metric": label, "value": r[col]})
+
+    sources = sorted({r["source"] for r in records})
+    return json.dumps(long_records), sources
diff --git a/tests/unit/test_parse_inference_logs.py b/tests/unit/test_parse_inference_logs.py
new file mode 100644
index 00000000..ef76014e
--- /dev/null
+++ b/tests/unit/test_parse_inference_logs.py
@@ -0,0 +1,81 @@
+import pytest
+
+from diagnostics import parse_logs, parse_single_log
+
+LOG_CONTENT = """\
+srun: job 4242140 queued and waiting for resources
+srun: job 4242140 has been allocated resources
+2026-05-01 11:10:19 INFO Loading multi-dataset metadata
+2026-05-01 11:10:52 INFO Checkpoint size: 1.4 GiB
+2026-05-01 11:10:52 INFO Lead time: 5 days, 0:00:00 Forecasting 3 steps through 3 autoregressive steps
+2026-05-01 11:11:07 INFO Forecast. Model call 1: horizon 6:00:00, freq. 6:00:00 (2025-03-01 06:00:00): 6 seconds.
+2026-05-01 11:11:12 INFO Forecast. Model call 2: horizon 12:00:00, freq. 6:00:00 (2025-03-01 12:00:00): 2 seconds.
+2026-05-01 11:11:17 INFO Forecast. Model call 3: horizon 18:00:00, freq. 6:00:00 (2025-03-01 18:00:00): 2 seconds.
+2026-05-01 11:11:20 INFO Done.
+"""
+
+
+def test_parse_single_log(tmp_path):
+    log_file = tmp_path / "test.log"
+    log_file.write_text(LOG_CONTENT)
+
+    result = parse_single_log(str(log_file))
+
+    assert result["job_id"] == "4242140"
+    assert result["checkpoint_size_gib"] == 1.4
+    assert result["n_steps"] == 3
+    assert result["max_step_time_s"] == 6
+    assert result["mean_step_time_s"] == pytest.approx(round((6 + 2 + 2) / 3, 2))
+    # wall time: 11:11:20 - 11:10:19 = 61 seconds
+    assert result["wall_time_s"] == pytest.approx(61.0)
+
+
+def test_parse_logs_extracts_run_id_and_init_time(tmp_path):
+    log_dir = tmp_path / "inference_execute" / "forecaster-c304-1e7e"
+    log_dir.mkdir(parents=True)
+    log_file = log_dir / "253b-202503010000.log"
+    log_file.write_text(LOG_CONTENT)
+
+    label_map = {"forecaster-c304-1e7e/253b": "My Model"}
+    gpu_map = {"forecaster-c304-1e7e/253b": 2}
+
+    records = parse_logs(
+        log_files=[str(log_file)],
+        label_map=label_map,
+        gpu_map=gpu_map,
+        log_dir=str(tmp_path / "inference_execute"),
+    )
+
+    assert len(records) == 1
+    r = records[0]
+    assert r["source"] == "My Model"
+    assert r["run_id"] == "forecaster-c304-1e7e/253b"
+    assert r["init_time"] == "2025-03-01T00:00:00"
+    assert r["n_gpu"] == 2
+    assert r["gpu_hours"] == pytest.approx(61.0 / 3600 * 2, rel=1e-3)
+
+
+def test_parse_logs_missing_file_is_skipped(tmp_path):
+    records = parse_logs(
+        log_files=[str(tmp_path / "does_not_exist.log")],
+        label_map={},
+        gpu_map={},
+        log_dir=str(tmp_path),
+    )
+    assert records == []
+
+
+def test_parse_logs_fallback_label_is_run_id(tmp_path):
+    log_dir = tmp_path / "inference_execute" / "env-abc"
+    log_dir.mkdir(parents=True)
+    log_file = log_dir / "1234-202503020000.log"
+    log_file.write_text(LOG_CONTENT)
+
+    records = parse_logs(
+        log_files=[str(log_file)],
+        label_map={},  # no label provided
+        gpu_map={},
+        log_dir=str(tmp_path / "inference_execute"),
+    )
+
+    assert records[0]["source"] == "env-abc/1234"
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index b0acc44b..66843b5e 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -15,6 +15,67 @@ def make_header_text():
     return f"Verification against {truth} with initializations from {dates.get('start')} to {dates.get('end')} by {dates.get('frequency')}"
 
 
+def _candidate_gpu(run_cfg: dict) -> int:
+    """Return the GPU count for a run, defaulting to 1."""
+    ir = run_cfg.get("inference_resources")
+    if ir is None:
+        return 1
+    if isinstance(ir, dict):
+        return int(ir.get("gpu", 1) or 1)
+    return int(getattr(ir, "gpu", 1) or 1)
+
+
+rule collect_system_metrics:
+    localrule: True
+    input:
+        okfiles=[
+            OUT_ROOT / f"logs/inference_execute/{run_id}-{t.strftime('%Y%m%d%H%M')}.ok"
+            for run_id, run_cfg in RUN_CONFIGS.items()
+            if run_cfg.get("_is_candidate", False)
+            for t in REFTIMES
+        ],
+    output:
+        OUT_ROOT / "results/{experiment}/system_metrics.json",
+    params:
+        log_files=[
+            str(
+                OUT_ROOT
+                / f"logs/inference_execute/{run_id}-{t.strftime('%Y%m%d%H%M')}.log"
+            )
+            for run_id, run_cfg in RUN_CONFIGS.items()
+            if run_cfg.get("_is_candidate", False)
+            for t in REFTIMES
+        ],
+        label_map={
+            run_id: run_cfg.get("label", run_id)
+            for run_id, run_cfg in RUN_CONFIGS.items()
+            if run_cfg.get("_is_candidate", False)
+        },
+        gpu_map={
+            run_id: _candidate_gpu(run_cfg)
+            for run_id, run_cfg in RUN_CONFIGS.items()
+            if run_cfg.get("_is_candidate", False)
+        },
+        log_dir=str(OUT_ROOT / "logs/inference_execute"),
+    run:
+        import json
+        import sys
+        from pathlib import Path
+
+        sys.path.insert(0, str(Path(workflow.snakefile).parent / "scripts"))
+        from parse_inference_logs import parse_logs
+
+        records = parse_logs(
+            log_files=params.log_files,
+            label_map=params.label_map,
+            gpu_map=params.gpu_map,
+            log_dir=params.log_dir,
+        )
+        out_path = Path(str(output[0]))
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        out_path.write_text(json.dumps(records, indent=2))
+
+
 rule report_experiment_dashboard:
     localrule: True
     input:
@@ -23,6 +84,7 @@ rule report_experiment_dashboard:
         template="resources/report/dashboard/template.html.jinja2",
         js_script="resources/report/dashboard/script.js",
         configfile={workflow.configfiles[0]},
+        sysmetrics=OUT_ROOT / "results/{experiment}/system_metrics.json",
     output:
         report(
             directory(OUT_ROOT / "results/{experiment}/dashboard"),
@@ -41,5 +103,6 @@ rule report_experiment_dashboard:
             --script {input.js_script} \
             --header_text "{params.header_text}" \
             --configfile "{input.configfile}" \
+            --sysmetrics_file "{input.sysmetrics}" \
             --output {output} > {log} 2>&1
         """
diff --git a/workflow/scripts/parse_inference_logs.py b/workflow/scripts/parse_inference_logs.py
new file mode 100644
index 00000000..389df9a2
--- /dev/null
+++ b/workflow/scripts/parse_inference_logs.py
@@ -0,0 +1,47 @@
+"""CLI wrapper: parse anemoi-inference logs and write system metrics JSON."""
+
+import argparse
+import json
+import logging
+from pathlib import Path
+
+from diagnostics import parse_logs
+
+LOG = logging.getLogger(__name__)
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+
+
+def main(args: argparse.Namespace) -> None:
+    records = parse_logs(
+        log_files=args.logs,
+        label_map=json.loads(args.label_map),
+        gpu_map=json.loads(args.gpu_map),
+        log_dir=args.log_dir,
+    )
+    out = Path(args.output)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with open(out, "w") as fh:
+        json.dump(records, fh, indent=2)
+    LOG.info("Saved system metrics to %s", args.output)
+
+
+if __name__ == "__main__":
+    p = argparse.ArgumentParser(description="Parse inference logs for system metrics.")
+    p.add_argument("--logs", nargs="+", required=True, help="Inference log file paths.")
+    p.add_argument(
+        "--label_map", required=True, help="JSON dict mapping run_id → source label."
+    )
+    p.add_argument(
+        "--gpu_map",
+        default="{}",
+        help="JSON dict mapping run_id → GPU count (default: 1).",
+    )
+    p.add_argument(
+        "--log_dir",
+        required=True,
+        help="Root of inference_execute logs; used to extract run_id from file path.",
+    )
+    p.add_argument("--output", required=True, help="Output JSON file path.")
+    main(p.parse_args())
diff --git a/workflow/scripts/report_experiment_dashboard.py b/workflow/scripts/report_experiment_dashboard.py
index 46e132d7..e1872e5a 100644
--- a/workflow/scripts/report_experiment_dashboard.py
+++ b/workflow/scripts/report_experiment_dashboard.py
@@ -1,4 +1,5 @@
 import argparse
+import json
 import logging
 import sys as _sys
 from pathlib import Path
@@ -6,6 +7,8 @@
 import jinja2
 import xarray as xr
 
+from diagnostics import melt_for_dashboard
+
 _sys.path.append(str(Path(__file__).parent))
 from verification_plot_metrics import _ensure_unique_lead_time
 from verification_plot_metrics import _select_best_sources
@@ -16,6 +19,17 @@
 )
 
 
+def _load_sysmetrics(sysmetrics_file: Path) -> tuple[str, list[str]]:
+    """Load system metrics JSON and melt to long format for Vega-Lite."""
+    if not sysmetrics_file or not sysmetrics_file.is_file():
+        return "[]", []
+    with open(sysmetrics_file) as fh:
+        records = json.load(fh)
+    sysmetrics_json, sources = melt_for_dashboard(records)
+    LOG.info("Loaded system metrics for %d source(s)", len(sources))
+    return sysmetrics_json, sources
+
+
 def program_summary_log(args):
     """Log a welcome message with the script and template information."""
     LOG.info("=" * 80)
@@ -75,6 +89,9 @@ def main(args):
     json_size = len(df_json.encode("utf-8"))
     LOG.info("Size of embedded JSON data: %d bytes", json_size)
 
+    # load system metrics
+    sysmetrics_json, sysmetrics_sources = _load_sysmetrics(args.sysmetrics_file)
+
     # read script
     with open(args.script, "r") as f:
         js_src = f.read()
@@ -97,6 +114,8 @@ def main(args):
         configfile_content=open(args.configfile, "r").read()
         if args.configfile.is_file()
         else "",
+        sysmetrics_data=sysmetrics_json,
+        sysmetrics_sources=sysmetrics_sources,
     )
     LOG.info("Size of generated HTML: %d bytes", len(html.encode("utf-8")))
 
@@ -139,6 +158,12 @@ def main(args):
         type=Path,
         help="Path to config file for the evalml run.",
     )
+    parser.add_argument(
+        "--sysmetrics_file",
+        type=Path,
+        default=None,
+        help="Path to system metrics JSON produced by parse_inference_logs.py.",
+    )
     parser.add_argument(
         "--output",
         type=Path,

From 6755cdab125ac0594afe9886d0ebd2d73868948e Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Wed, 6 May 2026 10:01:04 +0200
Subject: [PATCH 02/16] Import parse_logs from diagnostics package in collect_system_metrics

---
 workflow/rules/report.smk | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index 66843b5e..452e5d7f 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -59,11 +59,9 @@ rule collect_system_metrics:
         log_dir=str(OUT_ROOT / "logs/inference_execute"),
     run:
         import json
-        import sys
         from pathlib import Path
 
-        sys.path.insert(0, str(Path(workflow.snakefile).parent / "scripts"))
-        from parse_inference_logs import parse_logs
+        from diagnostics import parse_logs
 
         records = parse_logs(
             log_files=params.log_files,

From d8249d575d4ce976953f97017f038f2e4ee730aa Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Wed, 6 May 2026 10:32:34 +0200
Subject: [PATCH 03/16] exclude rulegraph and dag from being tracked

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 50b2ff7d..346ade17 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,3 +55,5 @@ uv.lock
 
 # evalml
 .evalml_snakemake_cmd.txt
+rulegraph.svg
+dag.svg

From 1b871a7af9b55237146b21a31c67addf8d8d557a Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Wed, 6 May 2026 13:53:30 +0200
Subject: [PATCH 04/16] expose individual runs

---
 resources/report/dashboard/script.js          | 78 ++++++++++++-------
 .../report/dashboard/template.html.jinja2     |  8 ++
 src/diagnostics/__init__.py                   | 19 +++--
 tests/unit/test_parse_inference_logs.py       | 61 ++++++++++++++-
 workflow/rules/report.smk                     | 11 +--
 .../scripts/report_experiment_dashboard.py    | 19 +++--
 6 files changed, 147 insertions(+), 49 deletions(-)

diff --git a/resources/report/dashboard/script.js b/resources/report/dashboard/script.js
index 2561d7bf..dddc3fd3 100644
--- a/resources/report/dashboard/script.js
+++ b/resources/report/dashboard/script.js
@@ -193,6 +193,15 @@ const sysDataEl = document.getElementById("sysmetrics-data");
 const sysData = sysDataEl ? JSON.parse(sysDataEl.textContent) : [];
 
 if (sysData.length > 0) {
+  choicesInstances["sys-model-type-select"] = new Choices("#sys-model-type-select", {
+    searchEnabled: false,
+    removeItemButton: true,
+    shouldSort: false,
+    itemSelectText: "",
+    placeholder: false,
+  });
+  document.getElementById("sys-model-type-select").addEventListener("change", updateSysChart);
+
   choicesInstances["sys-source-select"] = new Choices("#sys-source-select", {
     searchEnabled: false,
     removeItemButton: true,
@@ -209,42 +218,57 @@ if (sysData.length > 0) {
     },
     "resolve": { "scale": { "y": "independent" } },
     "spec": {
-      "width": 220,
-      "height": 220,
-      "layer": [
-        {
-          "mark": { "type": "bar", "opacity": 0.7 },
-          "encoding": {
-            "x": { "field": "source", "type": "nominal", "axis": { "labelAngle": -30, "title": null } },
-            "y": { "field": "value", "aggregate": "mean", "type": "quantitative", "title": "mean" },
-            "color": { "field": "source", "type": "nominal", "legend": { "orient": "top", "title": "Source" } },
-          },
+      "width": 280,
+      "height": 240,
+      "mark": { "type": "point", "filled": true, "size": 70, "opacity": 0.85 },
+      "encoding": {
+        "x": {
+          "field": "source",
+          "type": "nominal",
+          "axis": { "labelAngle": -30, "title": null }
+        },
+        "y": {
+          "field": "value",
+          "type": "quantitative",
+          "title": null,
+          "scale": { "zero": true }
+        },
+        "color": {
+          "field": "model_type",
+          "type": "nominal",
+          "legend": { "orient": "top", "title": "Model type" }
         },
-        {
-          "mark": { "type": "point", "filled": true, "size": 40, "opacity": 0.9 },
-          "encoding": {
-            "x": { "field": "source", "type": "nominal" },
-            "y": { "field": "value", "type": "quantitative" },
-            "color": { "field": "source", "type": "nominal" },
-            "tooltip": [
-              { "field": "source", "type": "nominal", "title": "Source" },
-              { "field": "init_time", "type": "nominal", "title": "Init time" },
-              { "field": "metric", "type": "nominal", "title": "Metric" },
-              { "field": "value", "type": "quantitative", "title": "Value", "format": ".3f" },
-              { "field": "n_gpu", "type": "quantitative", "title": "GPUs" },
-              { "field": "job_id", "type": "nominal", "title": "Job ID" },
-            ],
-          },
+        "shape": {
+          "field": "model_type",
+          "type": "nominal",
+          "legend": { "orient": "top", "title": "Model type" }
         },
-      ],
+        "tooltip": [
+          { "field": "source", "type": "nominal", "title": "Source" },
+          { "field": "model_type", "type": "nominal", "title": "Model type" },
+          { "field": "init_time", "type": "nominal", "title": "Init time" },
+          { "field": "metric", "type": "nominal", "title": "Metric" },
+          { "field": "value", "type": "quantitative", "title": "Value", "format": ".3f" },
+          { "field": "n_gpu", "type": "quantitative", "title": "GPUs" },
+          { "field": "job_id", "type": "nominal", "title": "Job ID" },
+        ],
+      },
     },
   };
 
   function updateSysChart() {
+    const selectedModelTypes = getSelectedValues("sys-model-type-select");
     const selectedSources = getSelectedValues("sys-source-select");
     const newSpec = JSON.parse(JSON.stringify(sysSpec));
+    const filters = [];
+    if (selectedModelTypes.length > 0) {
+      filters.push({ field: "model_type", oneOf: selectedModelTypes });
+    }
     if (selectedSources.length > 0) {
-      newSpec.transform = [{ filter: { field: "source", oneOf: selectedSources } }];
+      filters.push({ field: "source", oneOf: selectedSources });
+    }
+    if (filters.length > 0) {
+      newSpec.transform = [{ filter: { and: filters } }];
     }
     vegaEmbed("#sys-vis", newSpec, { actions: false });
   }
diff --git a/resources/report/dashboard/template.html.jinja2 b/resources/report/dashboard/template.html.jinja2
index d0ba035c..6710d011 100644
--- a/resources/report/dashboard/template.html.jinja2
+++ b/resources/report/dashboard/template.html.jinja2
@@ -162,6 +162,14 @@
     {% if sysmetrics_sources %}
     <div id="tab_sysmetrics" class="tab-content">
         <div class="controls">
+            <div class="control-group">
+                <label>Model type(s)</label>
+                <select id="sys-model-type-select" multiple>
+                    {% for model_type in sysmetrics_model_types %}
+                    <option value="{{model_type}}" selected>{{model_type}}</option>
+                    {% endfor %}
+                </select>
+            </div>
             <div class="control-group">
                 <label>Source(s)</label>
                 <select id="sys-source-select" multiple>
diff --git a/src/diagnostics/__init__.py b/src/diagnostics/__init__.py
index f9720b3c..c01f0d70 100644
--- a/src/diagnostics/__init__.py
+++ b/src/diagnostics/__init__.py
@@ -13,14 +13,13 @@
 _N_STEPS = re.compile(r"Forecasting (\d+) steps")
 _STEP_TIME = re.compile(r"Forecast\. Model call \d+:.+?: (\d+) seconds\.")
 
-# Human-readable names for wide-format columns used in the dashboard
+# Columns exposed as distribution metrics in the dashboard
 SYSMETRICS_COLS = {
     "wall_time_s": "Wall Time (s)",
     "gpu_hours": "GPU Hours",
     "mean_step_time_s": "Mean Step Time (s)",
     "max_step_time_s": "Max Step Time (s)",
-    "checkpoint_size_gib": "Checkpoint Size (GiB)",
-    "n_steps": "No. Steps",
+    "n_gpu": "GPUs",
 }
 
 
@@ -114,6 +113,7 @@ def parse_logs(
 
         label = label_map.get(run_id, run_id)
         n_gpu = int(gpu_map.get(run_id, 1))
+        model_type = run_id.split("-")[0]
 
         try:
             raw = parse_single_log(str(log_path))
@@ -133,6 +133,7 @@ def parse_logs(
             {
                 "source": label,
                 "run_id": run_id,
+                "model_type": model_type,
                 "init_time": init_iso,
                 "n_gpu": n_gpu,
                 "gpu_hours": gpu_hours,
@@ -144,19 +145,23 @@ def parse_logs(
     return records
 
 
-def melt_for_dashboard(records: list[dict]) -> tuple[str, list[str]]:
+def melt_for_dashboard(records: list[dict]) -> tuple[str, list[str], list[str]]:
     """Convert wide-format system metrics records to long format for Vega-Lite.
 
-    Returns (json_string, sorted_source_list).
+    Returns (json_string, sorted_source_list, sorted_model_type_list).
     """
     import json
 
     long_records = []
     for r in records:
-        base = {k: r.get(k) for k in ("source", "init_time", "n_gpu", "job_id")}
+        base = {
+            k: r.get(k)
+            for k in ("source", "model_type", "init_time", "n_gpu", "job_id")
+        }
         for col, label in SYSMETRICS_COLS.items():
             if r.get(col) is not None:
                 long_records.append({**base, "metric": label, "value": r[col]})
 
     sources = sorted({r["source"] for r in records})
-    return json.dumps(long_records), sources
+    model_types = sorted({r.get("model_type", "unknown") for r in records})
+    return json.dumps(long_records), sources, model_types
diff --git a/tests/unit/test_parse_inference_logs.py b/tests/unit/test_parse_inference_logs.py
index ef76014e..6b7607a9 100644
--- a/tests/unit/test_parse_inference_logs.py
+++ b/tests/unit/test_parse_inference_logs.py
@@ -1,6 +1,8 @@
+import json
+
 import pytest
 
-from diagnostics import parse_logs, parse_single_log
+from diagnostics import melt_for_dashboard, parse_logs, parse_single_log
 
 LOG_CONTENT = """\
 srun: job 4242140 queued and waiting for resources
@@ -50,6 +52,7 @@ def test_parse_logs_extracts_run_id_and_init_time(tmp_path):
     r = records[0]
     assert r["source"] == "My Model"
     assert r["run_id"] == "forecaster-c304-1e7e/253b"
+    assert r["model_type"] == "forecaster"
     assert r["init_time"] == "2025-03-01T00:00:00"
     assert r["n_gpu"] == 2
     assert r["gpu_hours"] == pytest.approx(61.0 / 3600 * 2, rel=1e-3)
@@ -65,6 +68,62 @@ def test_parse_logs_missing_file_is_skipped(tmp_path):
     assert records == []
 
 
+def test_parse_logs_model_type_from_run_id_prefix(tmp_path):
+    for prefix, expected_type in [
+        ("forecaster-c304", "forecaster"),
+        ("interpolator-tmp-d5aa", "interpolator"),
+    ]:
+        log_dir = tmp_path / "inference_execute" / prefix
+        log_dir.mkdir(parents=True, exist_ok=True)
+        log_file = log_dir / "abcd-202503010000.log"
+        log_file.write_text(LOG_CONTENT)
+
+        records = parse_logs(
+            log_files=[str(log_file)],
+            label_map={},
+            gpu_map={},
+            log_dir=str(tmp_path / "inference_execute"),
+        )
+        assert records[0]["model_type"] == expected_type
+
+
+def test_melt_for_dashboard_exposes_model_type_and_distribution_metrics(tmp_path):
+    records = [
+        {
+            "source": "ModelA",
+            "model_type": "forecaster",
+            "init_time": "2025-03-01T00:00:00",
+            "n_gpu": 1,
+            "job_id": "111",
+            "wall_time_s": 60.0,
+            "gpu_hours": 1 / 60,
+            "mean_step_time_s": 2.0,
+            "checkpoint_size_gib": 1.4,
+        },
+        {
+            "source": "ModelA",
+            "model_type": "interpolator",
+            "init_time": "2025-03-02T00:00:00",
+            "n_gpu": 1,
+            "job_id": "222",
+            "wall_time_s": 120.0,
+            "gpu_hours": 2 / 60,
+            "mean_step_time_s": 3.0,
+            "checkpoint_size_gib": 0.8,
+        },
+    ]
+    data_json, sources, model_types = melt_for_dashboard(records)
+    rows = json.loads(data_json)
+
+    assert sources == ["ModelA"]
+    assert model_types == ["forecaster", "interpolator"]
+    # only wall_time_s and gpu_hours should be melted
+    metrics_present = {r["metric"] for r in rows}
+    assert metrics_present == {"Wall Time (s)", "GPU Hours"}
+    # model_type must be present in every row
+    assert all("model_type" in r for r in rows)
+
+
 def test_parse_logs_fallback_label_is_run_id(tmp_path):
     log_dir = tmp_path / "inference_execute" / "env-abc"
     log_dir.mkdir(parents=True)
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index 452e5d7f..6bf2c5aa 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -30,8 +30,7 @@ rule collect_system_metrics:
     input:
         okfiles=[
             OUT_ROOT / f"logs/inference_execute/{run_id}-{t.strftime('%Y%m%d%H%M')}.ok"
-            for run_id, run_cfg in RUN_CONFIGS.items()
-            if run_cfg.get("_is_candidate", False)
+            for run_id in RUN_CONFIGS
             for t in REFTIMES
         ],
     output:
@@ -42,19 +41,15 @@ rule collect_system_metrics:
                 OUT_ROOT
                 / f"logs/inference_execute/{run_id}-{t.strftime('%Y%m%d%H%M')}.log"
             )
-            for run_id, run_cfg in RUN_CONFIGS.items()
-            if run_cfg.get("_is_candidate", False)
+            for run_id in RUN_CONFIGS
             for t in REFTIMES
         ],
         label_map={
             run_id: run_cfg.get("label", run_id)
             for run_id, run_cfg in RUN_CONFIGS.items()
-            if run_cfg.get("_is_candidate", False)
         },
         gpu_map={
-            run_id: _candidate_gpu(run_cfg)
-            for run_id, run_cfg in RUN_CONFIGS.items()
-            if run_cfg.get("_is_candidate", False)
+            run_id: _candidate_gpu(run_cfg) for run_id, run_cfg in RUN_CONFIGS.items()
         },
         log_dir=str(OUT_ROOT / "logs/inference_execute"),
     run:
diff --git a/workflow/scripts/report_experiment_dashboard.py b/workflow/scripts/report_experiment_dashboard.py
index e1872e5a..3e37eda4 100644
--- a/workflow/scripts/report_experiment_dashboard.py
+++ b/workflow/scripts/report_experiment_dashboard.py
@@ -19,15 +19,19 @@
 )
 
 
-def _load_sysmetrics(sysmetrics_file: Path) -> tuple[str, list[str]]:
+def _load_sysmetrics(sysmetrics_file: Path) -> tuple[str, list[str], list[str]]:
     """Load system metrics JSON and melt to long format for Vega-Lite."""
     if not sysmetrics_file or not sysmetrics_file.is_file():
-        return "[]", []
+        return "[]", [], []
     with open(sysmetrics_file) as fh:
         records = json.load(fh)
-    sysmetrics_json, sources = melt_for_dashboard(records)
-    LOG.info("Loaded system metrics for %d source(s)", len(sources))
-    return sysmetrics_json, sources
+    sysmetrics_json, sources, model_types = melt_for_dashboard(records)
+    LOG.info(
+        "Loaded system metrics for %d source(s), %d model type(s)",
+        len(sources),
+        len(model_types),
+    )
+    return sysmetrics_json, sources, model_types
 
 
 def program_summary_log(args):
@@ -90,7 +94,9 @@ def main(args):
     LOG.info("Size of embedded JSON data: %d bytes", json_size)
 
     # load system metrics
-    sysmetrics_json, sysmetrics_sources = _load_sysmetrics(args.sysmetrics_file)
+    sysmetrics_json, sysmetrics_sources, sysmetrics_model_types = _load_sysmetrics(
+        args.sysmetrics_file
+    )
 
     # read script
     with open(args.script, "r") as f:
@@ -116,6 +122,7 @@ def main(args):
         else "",
         sysmetrics_data=sysmetrics_json,
         sysmetrics_sources=sysmetrics_sources,
+        sysmetrics_model_types=sysmetrics_model_types,
     )
     LOG.info("Size of generated HTML: %d bytes", len(html.encode("utf-8")))
 

From 58f9a6c94b000fa3bc2cb7a0b59f0a50d9fce8bf Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Tue, 2 Jun 2026 09:17:07 +0200
Subject: [PATCH 05/16] add slurm-based collection of system metrics for
 inference

---
 workflow/rules/inference.smk | 37 +++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/inference.smk b/workflow/rules/inference.smk
index f59d6102..edd4a5ae 100644
--- a/workflow/rules/inference.smk
+++ b/workflow/rules/inference.smk
@@ -307,6 +307,28 @@ rule inference_execute:
 
         cd {params.workdir}
 
+        METRICS_DIR={params.workdir}
+
+        # Write wrapper script that collects GPU metrics alongside inference.
+        # It runs as the srun payload so it executes on the compute node where
+        # the GPU lives. nvidia-smi is a system binary, available outside the
+        # squashfs venv. Task 0 owns metric collection to avoid conflicts in
+        # multi-task jobs. METRICS_DIR and INFERENCE_CMD_ARGS are propagated
+        # via Slurm's default --export=ALL env forwarding.
+        cat > "$METRICS_DIR/run_with_metrics.sh" << 'RUN_METRICS_EOF'
+        #!/bin/bash
+        set -euo pipefail
+        if [ "${{SLURM_PROCID:-0}}" = "0" ]; then
+            echo "$SLURM_JOB_ID" > "$METRICS_DIR/slurm_job_id"
+            nvidia-smi dmon -s pucvmet -d 5 -o DT \
+                > "$METRICS_DIR/gpu_metrics.log" 2>/dev/null &
+            DMON_PID=$!
+            trap 'kill $DMON_PID 2>/dev/null; wait $DMON_PID 2>/dev/null' EXIT INT TERM
+        fi
+        anemoi-inference run config.yaml $INFERENCE_CMD_ARGS
+        RUN_METRICS_EOF
+        chmod +x "$METRICS_DIR/run_with_metrics.sh"
+
         squashfs-mount {params.image_path}:/user-environment -- bash -c '
         source /user-environment/bin/activate
 
@@ -321,6 +343,9 @@ rule inference_execute:
             CMD_ARGS+=(runner.parallel.cluster=slurm)
         fi
 
+        export METRICS_DIR={params.workdir}
+        export INFERENCE_CMD_ARGS="${{CMD_ARGS[*]}}"
+
         srun \
             --unbuffered \
             --partition={resources.slurm_partition} \
@@ -329,7 +354,17 @@ rule inference_execute:
             --time={resources.runtime} \
             --gres={resources.gres} \
             --ntasks={resources.ntasks} \
-            anemoi-inference run config.yaml "${{CMD_ARGS[@]}}"
+            bash "$METRICS_DIR/run_with_metrics.sh"
         '
+
+        # Post-job: collect Slurm CPU/memory accounting once sacct catches up
+        if [ -f "$METRICS_DIR/slurm_job_id" ]; then
+            SLURM_JOB=$(cat "$METRICS_DIR/slurm_job_id")
+            sleep 5
+            sacct -j "$SLURM_JOB" \
+                --format=JobID,JobName,Elapsed,CPUTime,MaxRSS,MaxVMSize,AveRSS,MaxDiskRead,MaxDiskWrite \
+                > "$METRICS_DIR/slurm_metrics.log" 2>/dev/null || true
+        fi
+
         ) > {log} 2>&1
         """

From 3d064383b7292a38c671e26ba2c164c1f2660a07 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Tue, 2 Jun 2026 09:36:35 +0200
Subject: [PATCH 06/16] log accumulation for dashboard

---
 src/diagnostics/__init__.py             | 275 ++++++++++++++++--------
 tests/unit/test_parse_inference_logs.py | 199 +++++++++++------
 workflow/rules/inference.smk            |   1 +
 workflow/rules/report.smk               |  17 +-
 4 files changed, 331 insertions(+), 161 deletions(-)

diff --git a/src/diagnostics/__init__.py b/src/diagnostics/__init__.py
index c01f0d70..58a19152 100644
--- a/src/diagnostics/__init__.py
+++ b/src/diagnostics/__init__.py
@@ -1,114 +1,213 @@
 """System-performance diagnostics for inference jobs."""
 
 import logging
-import re
+import math
 from datetime import datetime
 from pathlib import Path
 
 LOG = logging.getLogger(__name__)
 
-_TIMESTAMP = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")
-_JOB_ID = re.compile(r"srun: job (\d+) queued")
-_CHECKPOINT_SIZE = re.compile(r"Checkpoint size: ([\d.]+) GiB")
-_N_STEPS = re.compile(r"Forecasting (\d+) steps")
-_STEP_TIME = re.compile(r"Forecast\. Model call \d+:.+?: (\d+) seconds\.")
-
 # Columns exposed as distribution metrics in the dashboard
 SYSMETRICS_COLS = {
     "wall_time_s": "Wall Time (s)",
     "gpu_hours": "GPU Hours",
-    "mean_step_time_s": "Mean Step Time (s)",
-    "max_step_time_s": "Max Step Time (s)",
-    "n_gpu": "GPUs",
+    "max_rss_mb": "Peak CPU Memory (MB)",
+    "gpu_util_mean": "Mean GPU Util (%)",
+    "gpu_util_max": "Peak GPU Util (%)",
+    "gpu_mem_used_mean": "Mean GPU Memory (MiB)",
+    "gpu_mem_used_max": "Peak GPU Memory (MiB)",
+    "gpu_power_mean": "Mean GPU Power (W)",
 }
 
 
-def parse_single_log(log_path: str) -> dict:
-    """Return raw metric values extracted from one inference log file."""
-    job_id = None
-    first_ts = last_ts = None
-    checkpoint_gib = None
-    n_steps = None
-    step_times: list[int] = []
-
-    with open(log_path) as fh:
-        for line in fh:
-            if job_id is None:
-                m = _JOB_ID.search(line)
-                if m:
-                    job_id = m.group(1)
-
-            m = _TIMESTAMP.match(line)
-            if m:
-                ts = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
-                if first_ts is None:
-                    first_ts = ts
-                last_ts = ts
-
-            if checkpoint_gib is None:
-                m = _CHECKPOINT_SIZE.search(line)
-                if m:
-                    checkpoint_gib = float(m.group(1))
-
-            if n_steps is None:
-                m = _N_STEPS.search(line)
-                if m:
-                    n_steps = int(m.group(1))
-
-            m = _STEP_TIME.search(line)
-            if m:
-                step_times.append(int(m.group(1)))
-
-    wall_time_s = (
-        (last_ts - first_ts).total_seconds()
-        if first_ts is not None and last_ts is not None
-        else None
-    )
-    return {
-        "job_id": job_id,
-        "wall_time_s": wall_time_s,
-        "n_steps": n_steps if n_steps is not None else len(step_times),
-        "mean_step_time_s": (
-            round(sum(step_times) / len(step_times), 2) if step_times else None
-        ),
-        "max_step_time_s": max(step_times) if step_times else None,
-        "checkpoint_size_gib": checkpoint_gib,
-    }
+def _parse_elapsed(s: str) -> float | None:
+    """Parse sacct elapsed 'D-HH:MM:SS' or 'HH:MM:SS' → seconds."""
+    s = s.strip()
+    try:
+        if "-" in s:
+            days, rest = s.split("-", 1)
+            h, m, sec = rest.split(":")
+            return int(days) * 86400 + int(h) * 3600 + int(m) * 60 + int(sec)
+        h, m, sec = s.split(":")
+        return int(h) * 3600 + int(m) * 60 + int(sec)
+    except Exception:
+        return None
+
+
+def _parse_memory(s: str) -> float | None:
+    """Parse sacct memory string ('2048K', '1.5M', '0.5G') → MB."""
+    s = s.strip()
+    if not s or s == "0":
+        return None
+    try:
+        if s.endswith("K"):
+            return float(s[:-1]) / 1024
+        if s.endswith("M"):
+            return float(s[:-1])
+        if s.endswith("G"):
+            return float(s[:-1]) * 1024
+        return float(s) / (1024 * 1024)
+    except ValueError:
+        return None
+
+
+def parse_sacct_log(log_path: str) -> dict:
+    """Parse slurm_metrics.log (sacct --parsable2 output) → metric dict.
+
+    Extracts wall_time_s from the parent job record (no dot in JobID) and
+    max_rss_mb as the maximum MaxRSS across all step records.
+    """
+    path = Path(log_path)
+    if not path.exists() or path.stat().st_size == 0:
+        return {}
+
+    lines = [ln for ln in path.read_text().splitlines() if ln.strip()]
+    if len(lines) < 2:
+        return {}
+
+    headers = [h.strip() for h in lines[0].split("|")]
+    rows = []
+    for line in lines[1:]:
+        fields = [f.strip() for f in line.split("|")]
+        if len(fields) >= len(headers):
+            rows.append(dict(zip(headers, fields)))
+
+    if not rows:
+        return {}
+
+    result: dict = {}
+
+    # Elapsed from the parent job record (no dot in JobID, no 'batch'/'extern' suffix)
+    for row in rows:
+        job_id = row.get("JobID", "")
+        if "." not in job_id and not job_id.endswith(("batch", "extern")):
+            val = _parse_elapsed(row.get("Elapsed", ""))
+            if val is not None:
+                result["wall_time_s"] = val
+            break
+
+    # MaxRSS: take the maximum across all step rows (parent job entry is 0)
+    rss_vals = [
+        v
+        for row in rows
+        if (v := _parse_memory(row.get("MaxRSS", ""))) is not None and v > 0
+    ]
+    if rss_vals:
+        result["max_rss_mb"] = round(max(rss_vals), 1)
+
+    return result
+
+
+def _parse_dmon_headers(header_line: str) -> dict[str, int]:
+    """Return {col_name: data_row_index} from an nvidia-smi dmon column header.
+
+    Tokens map 1:1 onto data fields, except a leading 'gpu' label before 'Date'.
+    """
+    tokens = header_line.lstrip("#").split()
+    skip = int(tokens[:2] == ["gpu", "Date"])  # label-only token, no data field
+    return {name: i - skip for i, name in enumerate(tokens) if i >= skip}
+
+
+def parse_gpu_metrics_log(log_path: str) -> dict:
+    """Parse gpu_metrics.log (nvidia-smi dmon -o DT) → GPU utilisation/memory/power dict."""
+    path = Path(log_path)
+    if not path.exists() or path.stat().st_size == 0:
+        return {}
+
+    col_idx: dict[str, int] = {}
+    data_rows: list[list[float]] = []
+
+    for raw_line in path.read_text().splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        if line.startswith("#"):
+            tokens = line.lstrip("#").split()
+            # Name header carries metric names; unit header ('Idx', '%', 'W') does not
+            if not {"sm", "fb", "pwr"}.isdisjoint(tokens):
+                col_idx = _parse_dmon_headers(line)
+            continue
+        fields = line.split()
+        if len(fields) < 5 or not col_idx:
+            continue
+        nums: list[float] = []
+        for f in fields:
+            try:
+                nums.append(float(f))
+            except ValueError:
+                nums.append(float("nan"))
+        data_rows.append(nums)
+
+    if not data_rows or not col_idx:
+        return {}
+
+    def _vals(col: str) -> list[float]:
+        idx = col_idx.get(col)
+        if idx is None:
+            return []
+        return [
+            r[idx]
+            for r in data_rows
+            if idx < len(r) and math.isfinite(r[idx])
+        ]
+
+    result: dict = {}
+
+    sm = _vals("sm")
+    if sm:
+        result["gpu_util_mean"] = round(sum(sm) / len(sm), 1)
+        result["gpu_util_max"] = round(max(sm), 1)
+
+    fb = _vals("fb")
+    if fb:
+        result["gpu_mem_used_mean"] = round(sum(fb) / len(fb), 1)
+        result["gpu_mem_used_max"] = round(max(fb), 1)
+
+    pwr = _vals("pwr")
+    if pwr:
+        result["gpu_power_mean"] = round(sum(pwr) / len(pwr), 1)
+
+    return result
+
+
+def parse_run_metrics(workdir: str) -> dict:
+    """Read sacct and GPU metric files from a run's workdir and merge them."""
+    wd = Path(workdir)
+    result: dict = {}
+
+    result.update(parse_sacct_log(str(wd / "slurm_metrics.log")))
+    result.update(parse_gpu_metrics_log(str(wd / "gpu_metrics.log")))
+
+    job_id_file = wd / "slurm_job_id"
+    if job_id_file.exists():
+        result["job_id"] = job_id_file.read_text().strip()
+
+    return result
 
 
 def parse_logs(
-    log_files: list[str],
+    run_info: list[dict],
     label_map: dict[str, str],
     gpu_map: dict[str, int],
-    log_dir: str,
 ) -> list[dict]:
-    """Parse inference log files and return one record per (run, init_time).
+    """Parse Slurm metric files and return one record per (run, init_time).
 
     Parameters
     ----------
-    log_files : paths to .log files to parse.
-    label_map : {run_id: human-readable label} — supplied by Snakemake rule params.
-    gpu_map   : {run_id: n_gpu} — GPU count used for each run.
-    log_dir   : root of the inference_execute logs directory; used to derive run_id.
+    run_info  : list of dicts with keys 'workdir', 'run_id', 'init_time'.
+    label_map : {run_id: human-readable label}.
+    gpu_map   : {run_id: n_gpu}.
     """
-    log_dir_path = Path(log_dir)
     records: list[dict] = []
 
-    for log_file in log_files:
-        log_path = Path(log_file)
-        if not log_path.exists():
-            LOG.warning("Log file not found, skipping: %s", log_file)
-            continue
+    for spec in run_info:
+        workdir = spec["workdir"]
+        run_id = spec["run_id"]
+        init_time_str = spec["init_time"]
 
-        # Derive run_id and init_time from the path.
-        # Relative path structure: "{run_id}-{init_time}.log"
-        # init_time is always 12 digits (YYYYMMDDHHM).
-        try:
-            stem = str(log_path.relative_to(log_dir_path).with_suffix(""))
-            init_time_str = stem[-12:]
-            run_id = stem[:-13]  # strip trailing "-YYYYMMDDHHM"
-        except Exception:
-            LOG.warning("Cannot derive run_id from path, skipping: %s", log_file)
+        wd = Path(workdir)
+        if not wd.exists():
+            LOG.warning("Workdir not found, skipping: %s", workdir)
             continue
 
         label = label_map.get(run_id, run_id)
@@ -116,9 +215,13 @@ def parse_logs(
         model_type = run_id.split("-")[0]
 
         try:
-            raw = parse_single_log(str(log_path))
+            raw = parse_run_metrics(workdir)
         except Exception as exc:
-            LOG.warning("Failed to parse %s: %s", log_file, exc)
+            LOG.warning("Failed to parse metrics for %s: %s", workdir, exc)
+            continue
+
+        if not raw:
+            LOG.warning("No metrics found for %s, skipping", workdir)
             continue
 
         wall_s = raw.get("wall_time_s")
@@ -141,7 +244,7 @@ def parse_logs(
             }
         )
 
-    LOG.info("Parsed %d log files → %d records", len(log_files), len(records))
+    LOG.info("Parsed %d run specs → %d records", len(run_info), len(records))
     return records
 
 
diff --git a/tests/unit/test_parse_inference_logs.py b/tests/unit/test_parse_inference_logs.py
index 6b7607a9..9c96baa7 100644
--- a/tests/unit/test_parse_inference_logs.py
+++ b/tests/unit/test_parse_inference_logs.py
@@ -2,68 +2,142 @@
 
 import pytest
 
-from diagnostics import melt_for_dashboard, parse_logs, parse_single_log
-
-LOG_CONTENT = """\
-srun: job 4242140 queued and waiting for resources
-srun: job 4242140 has been allocated resources
-2026-05-01 11:10:19 INFO Loading multi-dataset metadata
-2026-05-01 11:10:52 INFO Checkpoint size: 1.4 GiB
-2026-05-01 11:10:52 INFO Lead time: 5 days, 0:00:00 Forecasting 3 steps through 3 autoregressive steps
-2026-05-01 11:11:07 INFO Forecast. Model call 1: horizon 6:00:00, freq. 6:00:00 (2025-03-01 06:00:00): 6 seconds.
-2026-05-01 11:11:12 INFO Forecast. Model call 2: horizon 12:00:00, freq. 6:00:00 (2025-03-01 12:00:00): 2 seconds.
-2026-05-01 11:11:17 INFO Forecast. Model call 3: horizon 18:00:00, freq. 6:00:00 (2025-03-01 18:00:00): 2 seconds.
-2026-05-01 11:11:20 INFO Done.
+from diagnostics import melt_for_dashboard, parse_logs
+from diagnostics import parse_gpu_metrics_log, parse_sacct_log
+
+SACCT_CONTENT = """\
+JobID|JobName|Elapsed|CPUTime|MaxRSS|MaxVMSize|AveRSS|MaxDiskRead|MaxDiskWrite
+12345678|run_with_metrics.sh|00:05:23|02:08:48|0|0|0|4320K|2100K
+12345678.0|run_with_metrics.sh|00:05:23|02:08:48|2048K|5120K|1024K|4320K|2100K
+12345678.batch|batch|00:00:01|00:00:24|512K|1024K|256K|0|0
 """
 
+DMON_CONTENT = """\
+# gpu   Date        Time        Idx   sm   mem   enc   dec   pwr  mclk  pclk pviol tviol    fb  bar1 sbecc dbecc   pci gtemp mtemp
+# Devicems                           %     %     %     %    W   MHz   MHz      %     % MiB   MiB                       C     C
+20250602 12:34:56    0   85    70     0     0   250  1215  1530     0     0  8192 10240     0     0     0    65    50
+20250602 12:35:01    0   90    75     0     0   260  1215  1530     0     0  8300 10240     0     0     0    66    51
+"""
+
+
+def test_parse_sacct_wall_time(tmp_path):
+    log_file = tmp_path / "slurm_metrics.log"
+    log_file.write_text(SACCT_CONTENT)
+
+    result = parse_sacct_log(str(log_file))
+
+    assert result["wall_time_s"] == pytest.approx(5 * 60 + 23)
+
+
+def test_parse_sacct_max_rss(tmp_path):
+    log_file = tmp_path / "slurm_metrics.log"
+    log_file.write_text(SACCT_CONTENT)
+
+    result = parse_sacct_log(str(log_file))
+
+    # 2048K = 2.0 MB; 512K = 0.5 MB; parent row is 0 → max is 2.0
+    assert result["max_rss_mb"] == pytest.approx(2.0, rel=1e-3)
+
+
+def test_parse_sacct_missing_file(tmp_path):
+    result = parse_sacct_log(str(tmp_path / "does_not_exist.log"))
+    assert result == {}
+
+
+def test_parse_sacct_empty_file(tmp_path):
+    f = tmp_path / "slurm_metrics.log"
+    f.write_text("")
+    assert parse_sacct_log(str(f)) == {}
+
+
+def test_parse_gpu_metrics_utilisation(tmp_path):
+    log_file = tmp_path / "gpu_metrics.log"
+    log_file.write_text(DMON_CONTENT)
+
+    result = parse_gpu_metrics_log(str(log_file))
+
+    assert result["gpu_util_mean"] == pytest.approx((85 + 90) / 2, rel=1e-3)
+    assert result["gpu_util_max"] == pytest.approx(90.0)
+
+
+def test_parse_gpu_metrics_memory(tmp_path):
+    log_file = tmp_path / "gpu_metrics.log"
+    log_file.write_text(DMON_CONTENT)
+
+    result = parse_gpu_metrics_log(str(log_file))
+
+    assert result["gpu_mem_used_mean"] == pytest.approx((8192 + 8300) / 2, rel=1e-3)
+    assert result["gpu_mem_used_max"] == pytest.approx(8300.0)
+
+
+def test_parse_gpu_metrics_power(tmp_path):
+    log_file = tmp_path / "gpu_metrics.log"
+    log_file.write_text(DMON_CONTENT)
 
-def test_parse_single_log(tmp_path):
-    log_file = tmp_path / "test.log"
-    log_file.write_text(LOG_CONTENT)
+    result = parse_gpu_metrics_log(str(log_file))
 
-    result = parse_single_log(str(log_file))
+    assert result["gpu_power_mean"] == pytest.approx((250 + 260) / 2, rel=1e-3)
 
-    assert result["job_id"] == "4242140"
-    assert result["checkpoint_size_gib"] == 1.4
-    assert result["n_steps"] == 3
-    assert result["max_step_time_s"] == 6
-    assert result["mean_step_time_s"] == pytest.approx(round((6 + 2 + 2) / 3, 2))
-    # wall time: 11:11:20 - 11:10:19 = 61 seconds
-    assert result["wall_time_s"] == pytest.approx(61.0)
 
+def test_parse_gpu_metrics_missing_file(tmp_path):
+    assert parse_gpu_metrics_log(str(tmp_path / "does_not_exist.log")) == {}
 
-def test_parse_logs_extracts_run_id_and_init_time(tmp_path):
-    log_dir = tmp_path / "inference_execute" / "forecaster-c304-1e7e"
-    log_dir.mkdir(parents=True)
-    log_file = log_dir / "253b-202503010000.log"
-    log_file.write_text(LOG_CONTENT)
 
-    label_map = {"forecaster-c304-1e7e/253b": "My Model"}
-    gpu_map = {"forecaster-c304-1e7e/253b": 2}
+def test_parse_logs_reads_slurm_and_gpu_files(tmp_path):
+    workdir = tmp_path / "data" / "runs" / "forecaster-abc" / "202503010000"
+    workdir.mkdir(parents=True)
+    (workdir / "slurm_metrics.log").write_text(SACCT_CONTENT)
+    (workdir / "gpu_metrics.log").write_text(DMON_CONTENT)
+    (workdir / "slurm_job_id").write_text("12345678\n")
 
     records = parse_logs(
-        log_files=[str(log_file)],
-        label_map=label_map,
-        gpu_map=gpu_map,
-        log_dir=str(tmp_path / "inference_execute"),
+        run_info=[
+            {
+                "workdir": str(workdir),
+                "run_id": "forecaster-abc",
+                "init_time": "202503010000",
+            }
+        ],
+        label_map={"forecaster-abc": "My Model"},
+        gpu_map={"forecaster-abc": 2},
     )
 
     assert len(records) == 1
     r = records[0]
     assert r["source"] == "My Model"
-    assert r["run_id"] == "forecaster-c304-1e7e/253b"
+    assert r["run_id"] == "forecaster-abc"
     assert r["model_type"] == "forecaster"
     assert r["init_time"] == "2025-03-01T00:00:00"
     assert r["n_gpu"] == 2
-    assert r["gpu_hours"] == pytest.approx(61.0 / 3600 * 2, rel=1e-3)
+    assert r["job_id"] == "12345678"
+    assert r["wall_time_s"] == pytest.approx(323.0)
+    assert r["gpu_hours"] == pytest.approx(323.0 / 3600 * 2, rel=1e-3)
+    assert "gpu_util_mean" in r
+    assert "max_rss_mb" in r
 
 
-def test_parse_logs_missing_file_is_skipped(tmp_path):
+def test_parse_logs_missing_workdir_is_skipped(tmp_path):
     records = parse_logs(
-        log_files=[str(tmp_path / "does_not_exist.log")],
+        run_info=[
+            {
+                "workdir": str(tmp_path / "does_not_exist"),
+                "run_id": "forecaster-abc",
+                "init_time": "202503010000",
+            }
+        ],
+        label_map={},
+        gpu_map={},
+    )
+    assert records == []
+
+
+def test_parse_logs_no_metrics_files_is_skipped(tmp_path):
+    workdir = tmp_path / "empty_run"
+    workdir.mkdir()
+    records = parse_logs(
+        run_info=[{"workdir": str(workdir), "run_id": "x", "init_time": "202503010000"}],
         label_map={},
         gpu_map={},
-        log_dir=str(tmp_path),
     )
     assert records == []
 
@@ -73,21 +147,32 @@ def test_parse_logs_model_type_from_run_id_prefix(tmp_path):
         ("forecaster-c304", "forecaster"),
         ("interpolator-tmp-d5aa", "interpolator"),
     ]:
-        log_dir = tmp_path / "inference_execute" / prefix
-        log_dir.mkdir(parents=True, exist_ok=True)
-        log_file = log_dir / "abcd-202503010000.log"
-        log_file.write_text(LOG_CONTENT)
+        workdir = tmp_path / prefix / "202503010000"
+        workdir.mkdir(parents=True)
+        (workdir / "slurm_metrics.log").write_text(SACCT_CONTENT)
 
         records = parse_logs(
-            log_files=[str(log_file)],
+            run_info=[{"workdir": str(workdir), "run_id": prefix, "init_time": "202503010000"}],
             label_map={},
             gpu_map={},
-            log_dir=str(tmp_path / "inference_execute"),
         )
         assert records[0]["model_type"] == expected_type
 
 
-def test_melt_for_dashboard_exposes_model_type_and_distribution_metrics(tmp_path):
+def test_parse_logs_fallback_label_is_run_id(tmp_path):
+    workdir = tmp_path / "env-abc" / "202503020000"
+    workdir.mkdir(parents=True)
+    (workdir / "slurm_metrics.log").write_text(SACCT_CONTENT)
+
+    records = parse_logs(
+        run_info=[{"workdir": str(workdir), "run_id": "env-abc", "init_time": "202503020000"}],
+        label_map={},
+        gpu_map={},
+    )
+    assert records[0]["source"] == "env-abc"
+
+
+def test_melt_for_dashboard_exposes_model_type_and_distribution_metrics():
     records = [
         {
             "source": "ModelA",
@@ -97,8 +182,6 @@ def test_melt_for_dashboard_exposes_model_type_and_distribution_metrics(tmp_path
             "job_id": "111",
             "wall_time_s": 60.0,
             "gpu_hours": 1 / 60,
-            "mean_step_time_s": 2.0,
-            "checkpoint_size_gib": 1.4,
         },
         {
             "source": "ModelA",
@@ -108,8 +191,6 @@ def test_melt_for_dashboard_exposes_model_type_and_distribution_metrics(tmp_path
             "job_id": "222",
             "wall_time_s": 120.0,
             "gpu_hours": 2 / 60,
-            "mean_step_time_s": 3.0,
-            "checkpoint_size_gib": 0.8,
         },
     ]
     data_json, sources, model_types = melt_for_dashboard(records)
@@ -117,24 +198,8 @@ def test_melt_for_dashboard_exposes_model_type_and_distribution_metrics(tmp_path
 
     assert sources == ["ModelA"]
     assert model_types == ["forecaster", "interpolator"]
-    # only wall_time_s and gpu_hours should be melted
+    # only wall_time_s and gpu_hours are present in the test records
     metrics_present = {r["metric"] for r in rows}
     assert metrics_present == {"Wall Time (s)", "GPU Hours"}
     # model_type must be present in every row
     assert all("model_type" in r for r in rows)
-
-
-def test_parse_logs_fallback_label_is_run_id(tmp_path):
-    log_dir = tmp_path / "inference_execute" / "env-abc"
-    log_dir.mkdir(parents=True)
-    log_file = log_dir / "1234-202503020000.log"
-    log_file.write_text(LOG_CONTENT)
-
-    records = parse_logs(
-        log_files=[str(log_file)],
-        label_map={},  # no label provided
-        gpu_map={},
-        log_dir=str(tmp_path / "inference_execute"),
-    )
-
-    assert records[0]["source"] == "env-abc/1234"
diff --git a/workflow/rules/inference.smk b/workflow/rules/inference.smk
index edd4a5ae..9a858718 100644
--- a/workflow/rules/inference.smk
+++ b/workflow/rules/inference.smk
@@ -362,6 +362,7 @@ rule inference_execute:
             SLURM_JOB=$(cat "$METRICS_DIR/slurm_job_id")
             sleep 5
             sacct -j "$SLURM_JOB" \
+                --parsable2 \
                 --format=JobID,JobName,Elapsed,CPUTime,MaxRSS,MaxVMSize,AveRSS,MaxDiskRead,MaxDiskWrite \
                 > "$METRICS_DIR/slurm_metrics.log" 2>/dev/null || true
         fi
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index 6bf2c5aa..b36a71aa 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -36,11 +36,14 @@ rule collect_system_metrics:
     output:
         OUT_ROOT / "results/{experiment}/system_metrics.json",
     params:
-        log_files=[
-            str(
-                OUT_ROOT
-                / f"logs/inference_execute/{run_id}-{t.strftime('%Y%m%d%H%M')}.log"
-            )
+        run_info=[
+            {
+                "workdir": str(
+                    (OUT_ROOT / f"data/runs/{run_id}/{t.strftime('%Y%m%d%H%M')}").resolve()
+                ),
+                "run_id": run_id,
+                "init_time": t.strftime("%Y%m%d%H%M"),
+            }
             for run_id in RUN_CONFIGS
             for t in REFTIMES
         ],
@@ -51,7 +54,6 @@ rule collect_system_metrics:
         gpu_map={
             run_id: _candidate_gpu(run_cfg) for run_id, run_cfg in RUN_CONFIGS.items()
         },
-        log_dir=str(OUT_ROOT / "logs/inference_execute"),
     run:
         import json
         from pathlib import Path
@@ -59,10 +61,9 @@ rule collect_system_metrics:
         from diagnostics import parse_logs
 
         records = parse_logs(
-            log_files=params.log_files,
+            run_info=params.run_info,
             label_map=params.label_map,
             gpu_map=params.gpu_map,
-            log_dir=params.log_dir,
         )
         out_path = Path(str(output[0]))
         out_path.parent.mkdir(parents=True, exist_ok=True)

From cde34d7fc9c2f93f67bcce04a3cebb3cb26ce562 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Tue, 2 Jun 2026 15:25:17 +0200
Subject: [PATCH 07/16] update dashboard

---
 resources/report/dashboard/script.js          | 64 +++++++++++++++----
 .../report/dashboard/template.html.jinja2     |  4 ++
 2 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/resources/report/dashboard/script.js b/resources/report/dashboard/script.js
index dddc3fd3..4e5f2cc1 100644
--- a/resources/report/dashboard/script.js
+++ b/resources/report/dashboard/script.js
@@ -211,34 +211,68 @@ if (sysData.length > 0) {
   });
   document.getElementById("sys-source-select").addEventListener("change", updateSysChart);
 
+  // Populate metric filter from the data, then initialise Choices on it
+  const sysMetricEl = document.getElementById("sys-metric-select");
+  [...new Set(sysData.map(d => d.metric))].sort().forEach(m => {
+    const opt = document.createElement("option");
+    opt.value = m;
+    opt.textContent = m;
+    opt.selected = true;
+    sysMetricEl.appendChild(opt);
+  });
+  choicesInstances["sys-metric-select"] = new Choices("#sys-metric-select", {
+    searchEnabled: false,
+    removeItemButton: true,
+    shouldSort: false,
+    itemSelectText: "",
+    placeholder: false,
+  });
+  document.getElementById("sys-metric-select").addEventListener("change", updateSysChart);
+
   const sysSpec = {
     "data": { "values": sysData },
-    "facet": {
-      "column": { "field": "metric", "type": "nominal", "title": null },
-    },
-    "resolve": { "scale": { "y": "independent" } },
+    "facet": { "field": "metric", "type": "nominal", "title": null },
+    "columns": 4,
+    "resolve": { "scale": { "x": "shared", "y": "independent" } },
     "spec": {
-      "width": 280,
-      "height": 240,
-      "mark": { "type": "point", "filled": true, "size": 70, "opacity": 0.85 },
+      "params": [
+        {
+          "name": "xZoom",
+          "select": {
+            "type": "interval",
+            "encodings": ["x"],
+            "zoom": "wheel![!event.shiftKey]"
+          },
+          "bind": "scales"
+        }
+      ],
+      "width": 300,
+      "height": 200,
+      "mark": { "type": "line", "point": { "filled": true, "size": 50 } },
       "encoding": {
         "x": {
-          "field": "source",
-          "type": "nominal",
-          "axis": { "labelAngle": -30, "title": null }
+          "field": "init_time",
+          "type": "temporal",
+          "title": null,
+          "axis": { "labelAngle": -30, "format": "%b %d" }
         },
         "y": {
           "field": "value",
           "type": "quantitative",
           "title": null,
-          "scale": { "zero": true }
+          "scale": { "zero": false }
         },
         "color": {
+          "field": "source",
+          "type": "nominal",
+          "legend": { "orient": "top", "title": "Source" }
+        },
+        "shape": {
           "field": "model_type",
           "type": "nominal",
           "legend": { "orient": "top", "title": "Model type" }
         },
-        "shape": {
+        "strokeDash": {
           "field": "model_type",
           "type": "nominal",
           "legend": { "orient": "top", "title": "Model type" }
@@ -246,7 +280,7 @@ if (sysData.length > 0) {
         "tooltip": [
           { "field": "source", "type": "nominal", "title": "Source" },
           { "field": "model_type", "type": "nominal", "title": "Model type" },
-          { "field": "init_time", "type": "nominal", "title": "Init time" },
+          { "field": "init_time", "type": "temporal", "title": "Init time", "format": "%Y-%m-%d %H:%M" },
           { "field": "metric", "type": "nominal", "title": "Metric" },
           { "field": "value", "type": "quantitative", "title": "Value", "format": ".3f" },
           { "field": "n_gpu", "type": "quantitative", "title": "GPUs" },
@@ -259,6 +293,7 @@ if (sysData.length > 0) {
   function updateSysChart() {
     const selectedModelTypes = getSelectedValues("sys-model-type-select");
     const selectedSources = getSelectedValues("sys-source-select");
+    const selectedMetrics = getSelectedValues("sys-metric-select");
     const newSpec = JSON.parse(JSON.stringify(sysSpec));
     const filters = [];
     if (selectedModelTypes.length > 0) {
@@ -267,6 +302,9 @@ if (sysData.length > 0) {
     if (selectedSources.length > 0) {
       filters.push({ field: "source", oneOf: selectedSources });
     }
+    if (selectedMetrics.length > 0) {
+      filters.push({ field: "metric", oneOf: selectedMetrics });
+    }
     if (filters.length > 0) {
       newSpec.transform = [{ filter: { and: filters } }];
     }
diff --git a/resources/report/dashboard/template.html.jinja2 b/resources/report/dashboard/template.html.jinja2
index 6710d011..371ad198 100644
--- a/resources/report/dashboard/template.html.jinja2
+++ b/resources/report/dashboard/template.html.jinja2
@@ -178,6 +178,10 @@
                     {% endfor %}
                 </select>
             </div>
+            <div class="control-group">
+                <label>Metric(s)</label>
+                <select id="sys-metric-select" multiple></select>
+            </div>
         </div>
         <div id="sys-vis"></div>
     </div>

From 2d9f3582a7972a089c61704dbf91688606df2a8e Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Fri, 5 Jun 2026 17:02:27 +0200
Subject: [PATCH 08/16] fix error in inference from merging in main

---
 workflow/rules/inference.smk | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/workflow/rules/inference.smk b/workflow/rules/inference.smk
index 0790dd4a..8ab420bd 100644
--- a/workflow/rules/inference.smk
+++ b/workflow/rules/inference.smk
@@ -369,16 +369,5 @@ rule inference_execute:
             fi
 
         ) > {log} 2>&1
-                srun \
-                    --unbuffered \
-                    --partition={resources.slurm_partition} \
-                    --cpus-per-task={resources.cpus_per_task} \
-                    --mem-per-cpu={resources.mem_mb_per_cpu} \
-                    --time={resources.runtime} \
-                    --gres={resources.gres} \
-                    --ntasks={resources.ntasks} \
-                    anemoi-inference run config.yaml "${{CMD_ARGS[@]}}"
-                '
-        ) >{log} 2>&1
         """
 # fmt: on

From 16c4df7cf34eb1de1cbbf1686a9e8736255ca2c9 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Fri, 5 Jun 2026 17:06:30 +0200
Subject: [PATCH 09/16] linting

---
 src/diagnostics/__init__.py             |  6 +-----
 tests/unit/test_parse_inference_logs.py | 12 +++++++++---
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/diagnostics/__init__.py b/src/diagnostics/__init__.py
index 58a19152..aca5cad8 100644
--- a/src/diagnostics/__init__.py
+++ b/src/diagnostics/__init__.py
@@ -145,11 +145,7 @@ def _vals(col: str) -> list[float]:
         idx = col_idx.get(col)
         if idx is None:
             return []
-        return [
-            r[idx]
-            for r in data_rows
-            if idx < len(r) and math.isfinite(r[idx])
-        ]
+        return [r[idx] for r in data_rows if idx < len(r) and math.isfinite(r[idx])]
 
     result: dict = {}
 
diff --git a/tests/unit/test_parse_inference_logs.py b/tests/unit/test_parse_inference_logs.py
index 9c96baa7..cdf58d19 100644
--- a/tests/unit/test_parse_inference_logs.py
+++ b/tests/unit/test_parse_inference_logs.py
@@ -135,7 +135,9 @@ def test_parse_logs_no_metrics_files_is_skipped(tmp_path):
     workdir = tmp_path / "empty_run"
     workdir.mkdir()
     records = parse_logs(
-        run_info=[{"workdir": str(workdir), "run_id": "x", "init_time": "202503010000"}],
+        run_info=[
+            {"workdir": str(workdir), "run_id": "x", "init_time": "202503010000"}
+        ],
         label_map={},
         gpu_map={},
     )
@@ -152,7 +154,9 @@ def test_parse_logs_model_type_from_run_id_prefix(tmp_path):
         (workdir / "slurm_metrics.log").write_text(SACCT_CONTENT)
 
         records = parse_logs(
-            run_info=[{"workdir": str(workdir), "run_id": prefix, "init_time": "202503010000"}],
+            run_info=[
+                {"workdir": str(workdir), "run_id": prefix, "init_time": "202503010000"}
+            ],
             label_map={},
             gpu_map={},
         )
@@ -165,7 +169,9 @@ def test_parse_logs_fallback_label_is_run_id(tmp_path):
     (workdir / "slurm_metrics.log").write_text(SACCT_CONTENT)
 
     records = parse_logs(
-        run_info=[{"workdir": str(workdir), "run_id": "env-abc", "init_time": "202503020000"}],
+        run_info=[
+            {"workdir": str(workdir), "run_id": "env-abc", "init_time": "202503020000"}
+        ],
         label_map={},
         gpu_map={},
     )

From b9ec45e782e263825b67226224bcab7dfc0a161c Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Sun, 7 Jun 2026 12:24:51 +0200
Subject: [PATCH 10/16] convert wall time to minutes

---
 src/diagnostics/__init__.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/diagnostics/__init__.py b/src/diagnostics/__init__.py
index aca5cad8..703c161a 100644
--- a/src/diagnostics/__init__.py
+++ b/src/diagnostics/__init__.py
@@ -9,7 +9,7 @@
 
 # Columns exposed as distribution metrics in the dashboard
 SYSMETRICS_COLS = {
-    "wall_time_s": "Wall Time (s)",
+    "wall_time_s": "Wall Time (min)",
     "gpu_hours": "GPU Hours",
     "max_rss_mb": "Peak CPU Memory (MB)",
     "gpu_util_mean": "Mean GPU Util (%)",
@@ -259,8 +259,9 @@ def melt_for_dashboard(records: list[dict]) -> tuple[str, list[str], list[str]]:
         }
         for col, label in SYSMETRICS_COLS.items():
             if r.get(col) is not None:
-                long_records.append({**base, "metric": label, "value": r[col]})
+                value = r[col] / 60 if col == "wall_time_s" else r[col]
+                long_records.append({**base, "metric": label, "value": round(value, 2)})
 
-    sources = sorted({r["source"] for r in records})
-    model_types = sorted({r.get("model_type", "unknown") for r in records})
+    sources = sorted({r["source"] for r in records if r["source"] is not None})
+    model_types = sorted({r.get("model_type") or "unknown" for r in records})
     return json.dumps(long_records), sources, model_types

From 642443027130e847a648e570c6cff40bbde32642 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Sun, 7 Jun 2026 12:25:19 +0200
Subject: [PATCH 11/16] fix getSelected error

---
 resources/report/dashboard/script.js | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/resources/report/dashboard/script.js b/resources/report/dashboard/script.js
index beb8f85e..086fb50a 100644
--- a/resources/report/dashboard/script.js
+++ b/resources/report/dashboard/script.js
@@ -119,7 +119,7 @@ if (sysData.length > 0) {
           "field": "value",
           "type": "quantitative",
           "title": null,
-          "scale": { "zero": false }
+          "scale": { "zero": true }
         },
         "color": {
           "field": "source",
@@ -150,9 +150,9 @@ if (sysData.length > 0) {
   };
 
   function updateSysChart() {
-    const selectedModelTypes = getSelectedValues("sys-model-type-select");
-    const selectedSources = getSelectedValues("sys-source-select");
-    const selectedMetrics = getSelectedValues("sys-metric-select");
+    const selectedModelTypes = getSelected("sys-model-type-select");
+    const selectedSources = getSelected("sys-source-select");
+    const selectedMetrics = getSelected("sys-metric-select");
     const newSpec = JSON.parse(JSON.stringify(sysSpec));
     const filters = [];
     if (selectedModelTypes.length > 0) {

From 1709ed7096afdd4ab129074d9a25f85a456388c7 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Sun, 7 Jun 2026 12:26:37 +0200
Subject: [PATCH 12/16] remove duplicates

---
 resources/report/dashboard/template.html.jinja2 | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/resources/report/dashboard/template.html.jinja2 b/resources/report/dashboard/template.html.jinja2
index fc39cb02..93d8ec28 100644
--- a/resources/report/dashboard/template.html.jinja2
+++ b/resources/report/dashboard/template.html.jinja2
@@ -253,18 +253,10 @@
     {{ sysmetrics_data | safe | indent(8, false) }}
 </script>
 
-<script id="verif-data" type="application/json">
-    {{ verif_data | safe | indent(8, false)}}
-</script>
-
 <script id="header-text" type="text">
     {{ header_text }}
 </script>
 
-<script>
-    {{ js_src | indent(8, true) }}
-</script>
-
 <script>
 window.onerror = function(msg, src, line, col, err) {
     var d = document.getElementById("js-error-box");

From 7c9af61ac785edd7369f5067cc9761496eb0c402 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Sun, 7 Jun 2026 12:27:41 +0200
Subject: [PATCH 13/16] fix indentation error

---
 workflow/rules/inference.smk | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/inference.smk b/workflow/rules/inference.smk
index 8ab420bd..b9f44a16 100644
--- a/workflow/rules/inference.smk
+++ b/workflow/rules/inference.smk
@@ -327,7 +327,9 @@ rule inference_execute:
                 trap 'kill $DMON_PID 2>/dev/null; wait $DMON_PID 2>/dev/null' EXIT INT TERM
             fi
             anemoi-inference run config.yaml $INFERENCE_CMD_ARGS
-            RUN_METRICS_EOF
+# fmt: off
+RUN_METRICS_EOF
+# fmt: on
             chmod +x "$METRICS_DIR/run_with_metrics.sh"
 
             squashfs-mount {params.image_path}:/user-environment -- bash -c '

From b9e1df39294c7918d577887f7372bee258038f90 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Sun, 7 Jun 2026 12:28:44 +0200
Subject: [PATCH 14/16] reintroduce fix for n_samples

---
 workflow/scripts/report_experiment_dashboard.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/scripts/report_experiment_dashboard.py b/workflow/scripts/report_experiment_dashboard.py
index bdd46595..7dcdbfb7 100644
--- a/workflow/scripts/report_experiment_dashboard.py
+++ b/workflow/scripts/report_experiment_dashboard.py
@@ -58,7 +58,7 @@ def main(args):
     LOG.info("Loaded verification netcdf: \n%s", ds)
 
     # extract only  non-spatial variables to pd.DataFrame
-    nonspatial_vars = [d for d in ds.data_vars if "spatial" not in d]
+    nonspatial_vars = [d for d in ds.data_vars if "spatial" not in d and "." in d]
     df = ds[nonspatial_vars].to_array("stack").to_dataframe(name="value").reset_index()
     df[["param", "metric"]] = df["stack"].str.split(".", n=1, expand=True)
     df["metric"] = df.metric.apply(decode_metric)

From 489506656819ebd97b3b8469dad8821bd9974c18 Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Sun, 7 Jun 2026 12:28:56 +0200
Subject: [PATCH 15/16] use interpolator label for corresponding forecaster run

---
 workflow/rules/report.smk | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index d2ebed84..aa059b49 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -54,7 +54,19 @@ rule collect_system_metrics:
             for t in REFTIMES
         ],
         label_map={
-            run_id: run_cfg.get("label", run_id)
+            run_id: (
+                run_cfg.get("label")
+                or next(
+                    (
+                        cfg.get("label")
+                        for cfg in RUN_CONFIGS.values()
+                        if cfg.get("forecaster")
+                        and cfg["forecaster"].get("run_id") == run_id
+                    ),
+                    None,
+                )
+                or run_id
+            )
             for run_id, run_cfg in RUN_CONFIGS.items()
         },
         gpu_map={

From 723a0131a22ae5df81595da3e089dd8d0d657e9b Mon Sep 17 00:00:00 2001
From: Jonas Bhend <jonas.bhend@meteoswiss.ch>
Date: Mon, 15 Jun 2026 08:49:58 +0200
Subject: [PATCH 16/16] fix failing test

---
 tests/unit/test_parse_inference_logs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_parse_inference_logs.py b/tests/unit/test_parse_inference_logs.py
index cdf58d19..4b4d36f4 100644
--- a/tests/unit/test_parse_inference_logs.py
+++ b/tests/unit/test_parse_inference_logs.py
@@ -206,6 +206,6 @@ def test_melt_for_dashboard_exposes_model_type_and_distribution_metrics():
     assert model_types == ["forecaster", "interpolator"]
     # only wall_time_s and gpu_hours are present in the test records
     metrics_present = {r["metric"] for r in rows}
-    assert metrics_present == {"Wall Time (s)", "GPU Hours"}
+    assert metrics_present == {"Wall Time (min)", "GPU Hours"}
     # model_type must be present in every row
     assert all("model_type" in r for r in rows)