From 4e3fae0f99084414dadceea1f4f0e3cb1564f939 Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Wed, 6 May 2026 09:55:45 +0200 Subject: [PATCH 01/16] Initial suggestion from our dear friend --- pyproject.toml | 3 +- resources/report/dashboard/script.js | 66 +++++++ .../report/dashboard/template.html.jinja2 | 24 +++ src/diagnostics/__init__.py | 162 ++++++++++++++++++ tests/unit/test_parse_inference_logs.py | 81 +++++++++ workflow/rules/report.smk | 63 +++++++ workflow/scripts/parse_inference_logs.py | 47 +++++ .../scripts/report_experiment_dashboard.py | 25 +++ 8 files changed, 470 insertions(+), 1 deletion(-) create mode 100644 src/diagnostics/__init__.py create mode 100644 tests/unit/test_parse_inference_logs.py create mode 100644 workflow/scripts/parse_inference_logs.py diff --git a/pyproject.toml b/pyproject.toml index f754384d..66214d57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,8 @@ markers = [ packages = [ "src/evalml", "src/verification", - "src/data_input" + "src/data_input", + "src/diagnostics" ] [tool.uv.sources] diff --git a/resources/report/dashboard/script.js b/resources/report/dashboard/script.js index 9eed0ba3..2561d7bf 100644 --- a/resources/report/dashboard/script.js +++ b/resources/report/dashboard/script.js @@ -185,3 +185,69 @@ function updateChart() { // Initial chart updateChart() + + +// ---- System metrics tab ---- + +const sysDataEl = document.getElementById("sysmetrics-data"); +const sysData = sysDataEl ? JSON.parse(sysDataEl.textContent) : []; + +if (sysData.length > 0) { + choicesInstances["sys-source-select"] = new Choices("#sys-source-select", { + searchEnabled: false, + removeItemButton: true, + shouldSort: false, + itemSelectText: "", + placeholder: false, + }); + document.getElementById("sys-source-select").addEventListener("change", updateSysChart); + + const sysSpec = { + "data": { "values": sysData }, + "facet": { + "column": { "field": "metric", "type": "nominal", "title": null }, + }, + "resolve": { "scale": { "y": "independent" } }, + "spec": { + "width": 220, + "height": 220, + "layer": [ + { + "mark": { "type": "bar", "opacity": 0.7 }, + "encoding": { + "x": { "field": "source", "type": "nominal", "axis": { "labelAngle": -30, "title": null } }, + "y": { "field": "value", "aggregate": "mean", "type": "quantitative", "title": "mean" }, + "color": { "field": "source", "type": "nominal", "legend": { "orient": "top", "title": "Source" } }, + }, + }, + { + "mark": { "type": "point", "filled": true, "size": 40, "opacity": 0.9 }, + "encoding": { + "x": { "field": "source", "type": "nominal" }, + "y": { "field": "value", "type": "quantitative" }, + "color": { "field": "source", "type": "nominal" }, + "tooltip": [ + { "field": "source", "type": "nominal", "title": "Source" }, + { "field": "init_time", "type": "nominal", "title": "Init time" }, + { "field": "metric", "type": "nominal", "title": "Metric" }, + { "field": "value", "type": "quantitative", "title": "Value", "format": ".3f" }, + { "field": "n_gpu", "type": "quantitative", "title": "GPUs" }, + { "field": "job_id", "type": "nominal", "title": "Job ID" }, + ], + }, + }, + ], + }, + }; + + function updateSysChart() { + const selectedSources = getSelectedValues("sys-source-select"); + const newSpec = JSON.parse(JSON.stringify(sysSpec)); + if (selectedSources.length > 0) { + newSpec.transform = [{ filter: { field: "source", oneOf: selectedSources } }]; + } + vegaEmbed("#sys-vis", newSpec, { actions: false }); + } + + updateSysChart(); +} diff --git a/resources/report/dashboard/template.html.jinja2 b/resources/report/dashboard/template.html.jinja2 index 78404974..d0ba035c 100644 --- a/resources/report/dashboard/template.html.jinja2 +++ b/resources/report/dashboard/template.html.jinja2 @@ -95,6 +95,9 @@
+ {% if sysmetrics_sources %} + + {% endif %}
@@ -155,6 +158,23 @@ + + {% if sysmetrics_sources %} +
+
+
+ + +
+
+
+
+ {% endif %} +
@@ -163,6 +183,10 @@
     
+ + diff --git a/src/diagnostics/__init__.py b/src/diagnostics/__init__.py new file mode 100644 index 00000000..f9720b3c --- /dev/null +++ b/src/diagnostics/__init__.py @@ -0,0 +1,162 @@ +"""System-performance diagnostics for inference jobs.""" + +import logging +import re +from datetime import datetime +from pathlib import Path + +LOG = logging.getLogger(__name__) + +_TIMESTAMP = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})") +_JOB_ID = re.compile(r"srun: job (\d+) queued") +_CHECKPOINT_SIZE = re.compile(r"Checkpoint size: ([\d.]+) GiB") +_N_STEPS = re.compile(r"Forecasting (\d+) steps") +_STEP_TIME = re.compile(r"Forecast\. Model call \d+:.+?: (\d+) seconds\.") + +# Human-readable names for wide-format columns used in the dashboard +SYSMETRICS_COLS = { + "wall_time_s": "Wall Time (s)", + "gpu_hours": "GPU Hours", + "mean_step_time_s": "Mean Step Time (s)", + "max_step_time_s": "Max Step Time (s)", + "checkpoint_size_gib": "Checkpoint Size (GiB)", + "n_steps": "No. Steps", +} + + +def parse_single_log(log_path: str) -> dict: + """Return raw metric values extracted from one inference log file.""" + job_id = None + first_ts = last_ts = None + checkpoint_gib = None + n_steps = None + step_times: list[int] = [] + + with open(log_path) as fh: + for line in fh: + if job_id is None: + m = _JOB_ID.search(line) + if m: + job_id = m.group(1) + + m = _TIMESTAMP.match(line) + if m: + ts = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S") + if first_ts is None: + first_ts = ts + last_ts = ts + + if checkpoint_gib is None: + m = _CHECKPOINT_SIZE.search(line) + if m: + checkpoint_gib = float(m.group(1)) + + if n_steps is None: + m = _N_STEPS.search(line) + if m: + n_steps = int(m.group(1)) + + m = _STEP_TIME.search(line) + if m: + step_times.append(int(m.group(1))) + + wall_time_s = ( + (last_ts - first_ts).total_seconds() + if first_ts is not None and last_ts is not None + else None + ) + return { + "job_id": job_id, + "wall_time_s": wall_time_s, + "n_steps": n_steps if n_steps is not None else len(step_times), + "mean_step_time_s": ( + round(sum(step_times) / len(step_times), 2) if step_times else None + ), + "max_step_time_s": max(step_times) if step_times else None, + "checkpoint_size_gib": checkpoint_gib, + } + + +def parse_logs( + log_files: list[str], + label_map: dict[str, str], + gpu_map: dict[str, int], + log_dir: str, +) -> list[dict]: + """Parse inference log files and return one record per (run, init_time). + + Parameters + ---------- + log_files : paths to .log files to parse. + label_map : {run_id: human-readable label} — supplied by Snakemake rule params. + gpu_map : {run_id: n_gpu} — GPU count used for each run. + log_dir : root of the inference_execute logs directory; used to derive run_id. + """ + log_dir_path = Path(log_dir) + records: list[dict] = [] + + for log_file in log_files: + log_path = Path(log_file) + if not log_path.exists(): + LOG.warning("Log file not found, skipping: %s", log_file) + continue + + # Derive run_id and init_time from the path. + # Relative path structure: "{run_id}-{init_time}.log" + # init_time is always 12 digits (YYYYMMDDHHM). + try: + stem = str(log_path.relative_to(log_dir_path).with_suffix("")) + init_time_str = stem[-12:] + run_id = stem[:-13] # strip trailing "-YYYYMMDDHHM" + except Exception: + LOG.warning("Cannot derive run_id from path, skipping: %s", log_file) + continue + + label = label_map.get(run_id, run_id) + n_gpu = int(gpu_map.get(run_id, 1)) + + try: + raw = parse_single_log(str(log_path)) + except Exception as exc: + LOG.warning("Failed to parse %s: %s", log_file, exc) + continue + + wall_s = raw.get("wall_time_s") + gpu_hours = round(wall_s / 3600 * n_gpu, 4) if wall_s is not None else None + + try: + init_iso = datetime.strptime(init_time_str, "%Y%m%d%H%M").isoformat() + except ValueError: + init_iso = init_time_str + + records.append( + { + "source": label, + "run_id": run_id, + "init_time": init_iso, + "n_gpu": n_gpu, + "gpu_hours": gpu_hours, + **raw, + } + ) + + LOG.info("Parsed %d log files → %d records", len(log_files), len(records)) + return records + + +def melt_for_dashboard(records: list[dict]) -> tuple[str, list[str]]: + """Convert wide-format system metrics records to long format for Vega-Lite. + + Returns (json_string, sorted_source_list). + """ + import json + + long_records = [] + for r in records: + base = {k: r.get(k) for k in ("source", "init_time", "n_gpu", "job_id")} + for col, label in SYSMETRICS_COLS.items(): + if r.get(col) is not None: + long_records.append({**base, "metric": label, "value": r[col]}) + + sources = sorted({r["source"] for r in records}) + return json.dumps(long_records), sources diff --git a/tests/unit/test_parse_inference_logs.py b/tests/unit/test_parse_inference_logs.py new file mode 100644 index 00000000..ef76014e --- /dev/null +++ b/tests/unit/test_parse_inference_logs.py @@ -0,0 +1,81 @@ +import pytest + +from diagnostics import parse_logs, parse_single_log + +LOG_CONTENT = """\ +srun: job 4242140 queued and waiting for resources +srun: job 4242140 has been allocated resources +2026-05-01 11:10:19 INFO Loading multi-dataset metadata +2026-05-01 11:10:52 INFO Checkpoint size: 1.4 GiB +2026-05-01 11:10:52 INFO Lead time: 5 days, 0:00:00 Forecasting 3 steps through 3 autoregressive steps +2026-05-01 11:11:07 INFO Forecast. Model call 1: horizon 6:00:00, freq. 6:00:00 (2025-03-01 06:00:00): 6 seconds. +2026-05-01 11:11:12 INFO Forecast. Model call 2: horizon 12:00:00, freq. 6:00:00 (2025-03-01 12:00:00): 2 seconds. +2026-05-01 11:11:17 INFO Forecast. Model call 3: horizon 18:00:00, freq. 6:00:00 (2025-03-01 18:00:00): 2 seconds. +2026-05-01 11:11:20 INFO Done. +""" + + +def test_parse_single_log(tmp_path): + log_file = tmp_path / "test.log" + log_file.write_text(LOG_CONTENT) + + result = parse_single_log(str(log_file)) + + assert result["job_id"] == "4242140" + assert result["checkpoint_size_gib"] == 1.4 + assert result["n_steps"] == 3 + assert result["max_step_time_s"] == 6 + assert result["mean_step_time_s"] == pytest.approx(round((6 + 2 + 2) / 3, 2)) + # wall time: 11:11:20 - 11:10:19 = 61 seconds + assert result["wall_time_s"] == pytest.approx(61.0) + + +def test_parse_logs_extracts_run_id_and_init_time(tmp_path): + log_dir = tmp_path / "inference_execute" / "forecaster-c304-1e7e" + log_dir.mkdir(parents=True) + log_file = log_dir / "253b-202503010000.log" + log_file.write_text(LOG_CONTENT) + + label_map = {"forecaster-c304-1e7e/253b": "My Model"} + gpu_map = {"forecaster-c304-1e7e/253b": 2} + + records = parse_logs( + log_files=[str(log_file)], + label_map=label_map, + gpu_map=gpu_map, + log_dir=str(tmp_path / "inference_execute"), + ) + + assert len(records) == 1 + r = records[0] + assert r["source"] == "My Model" + assert r["run_id"] == "forecaster-c304-1e7e/253b" + assert r["init_time"] == "2025-03-01T00:00:00" + assert r["n_gpu"] == 2 + assert r["gpu_hours"] == pytest.approx(61.0 / 3600 * 2, rel=1e-3) + + +def test_parse_logs_missing_file_is_skipped(tmp_path): + records = parse_logs( + log_files=[str(tmp_path / "does_not_exist.log")], + label_map={}, + gpu_map={}, + log_dir=str(tmp_path), + ) + assert records == [] + + +def test_parse_logs_fallback_label_is_run_id(tmp_path): + log_dir = tmp_path / "inference_execute" / "env-abc" + log_dir.mkdir(parents=True) + log_file = log_dir / "1234-202503020000.log" + log_file.write_text(LOG_CONTENT) + + records = parse_logs( + log_files=[str(log_file)], + label_map={}, # no label provided + gpu_map={}, + log_dir=str(tmp_path / "inference_execute"), + ) + + assert records[0]["source"] == "env-abc/1234" diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index b0acc44b..66843b5e 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -15,6 +15,67 @@ def make_header_text(): return f"Verification against {truth} with initializations from {dates.get('start')} to {dates.get('end')} by {dates.get('frequency')}" +def _candidate_gpu(run_cfg: dict) -> int: + """Return the GPU count for a run, defaulting to 1.""" + ir = run_cfg.get("inference_resources") + if ir is None: + return 1 + if isinstance(ir, dict): + return int(ir.get("gpu", 1) or 1) + return int(getattr(ir, "gpu", 1) or 1) + + +rule collect_system_metrics: + localrule: True + input: + okfiles=[ + OUT_ROOT / f"logs/inference_execute/{run_id}-{t.strftime('%Y%m%d%H%M')}.ok" + for run_id, run_cfg in RUN_CONFIGS.items() + if run_cfg.get("_is_candidate", False) + for t in REFTIMES + ], + output: + OUT_ROOT / "results/{experiment}/system_metrics.json", + params: + log_files=[ + str( + OUT_ROOT + / f"logs/inference_execute/{run_id}-{t.strftime('%Y%m%d%H%M')}.log" + ) + for run_id, run_cfg in RUN_CONFIGS.items() + if run_cfg.get("_is_candidate", False) + for t in REFTIMES + ], + label_map={ + run_id: run_cfg.get("label", run_id) + for run_id, run_cfg in RUN_CONFIGS.items() + if run_cfg.get("_is_candidate", False) + }, + gpu_map={ + run_id: _candidate_gpu(run_cfg) + for run_id, run_cfg in RUN_CONFIGS.items() + if run_cfg.get("_is_candidate", False) + }, + log_dir=str(OUT_ROOT / "logs/inference_execute"), + run: + import json + import sys + from pathlib import Path + + sys.path.insert(0, str(Path(workflow.snakefile).parent / "scripts")) + from parse_inference_logs import parse_logs + + records = parse_logs( + log_files=params.log_files, + label_map=params.label_map, + gpu_map=params.gpu_map, + log_dir=params.log_dir, + ) + out_path = Path(str(output[0])) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(records, indent=2)) + + rule report_experiment_dashboard: localrule: True input: @@ -23,6 +84,7 @@ rule report_experiment_dashboard: template="resources/report/dashboard/template.html.jinja2", js_script="resources/report/dashboard/script.js", configfile={workflow.configfiles[0]}, + sysmetrics=OUT_ROOT / "results/{experiment}/system_metrics.json", output: report( directory(OUT_ROOT / "results/{experiment}/dashboard"), @@ -41,5 +103,6 @@ rule report_experiment_dashboard: --script {input.js_script} \ --header_text "{params.header_text}" \ --configfile "{input.configfile}" \ + --sysmetrics_file "{input.sysmetrics}" \ --output {output} > {log} 2>&1 """ diff --git a/workflow/scripts/parse_inference_logs.py b/workflow/scripts/parse_inference_logs.py new file mode 100644 index 00000000..389df9a2 --- /dev/null +++ b/workflow/scripts/parse_inference_logs.py @@ -0,0 +1,47 @@ +"""CLI wrapper: parse anemoi-inference logs and write system metrics JSON.""" + +import argparse +import json +import logging +from pathlib import Path + +from diagnostics import parse_logs + +LOG = logging.getLogger(__name__) +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) + + +def main(args: argparse.Namespace) -> None: + records = parse_logs( + log_files=args.logs, + label_map=json.loads(args.label_map), + gpu_map=json.loads(args.gpu_map), + log_dir=args.log_dir, + ) + out = Path(args.output) + out.parent.mkdir(parents=True, exist_ok=True) + with open(out, "w") as fh: + json.dump(records, fh, indent=2) + LOG.info("Saved system metrics to %s", args.output) + + +if __name__ == "__main__": + p = argparse.ArgumentParser(description="Parse inference logs for system metrics.") + p.add_argument("--logs", nargs="+", required=True, help="Inference log file paths.") + p.add_argument( + "--label_map", required=True, help="JSON dict mapping run_id → source label." + ) + p.add_argument( + "--gpu_map", + default="{}", + help="JSON dict mapping run_id → GPU count (default: 1).", + ) + p.add_argument( + "--log_dir", + required=True, + help="Root of inference_execute logs; used to extract run_id from file path.", + ) + p.add_argument("--output", required=True, help="Output JSON file path.") + main(p.parse_args()) diff --git a/workflow/scripts/report_experiment_dashboard.py b/workflow/scripts/report_experiment_dashboard.py index 46e132d7..e1872e5a 100644 --- a/workflow/scripts/report_experiment_dashboard.py +++ b/workflow/scripts/report_experiment_dashboard.py @@ -1,4 +1,5 @@ import argparse +import json import logging import sys as _sys from pathlib import Path @@ -6,6 +7,8 @@ import jinja2 import xarray as xr +from diagnostics import melt_for_dashboard + _sys.path.append(str(Path(__file__).parent)) from verification_plot_metrics import _ensure_unique_lead_time from verification_plot_metrics import _select_best_sources @@ -16,6 +19,17 @@ ) +def _load_sysmetrics(sysmetrics_file: Path) -> tuple[str, list[str]]: + """Load system metrics JSON and melt to long format for Vega-Lite.""" + if not sysmetrics_file or not sysmetrics_file.is_file(): + return "[]", [] + with open(sysmetrics_file) as fh: + records = json.load(fh) + sysmetrics_json, sources = melt_for_dashboard(records) + LOG.info("Loaded system metrics for %d source(s)", len(sources)) + return sysmetrics_json, sources + + def program_summary_log(args): """Log a welcome message with the script and template information.""" LOG.info("=" * 80) @@ -75,6 +89,9 @@ def main(args): json_size = len(df_json.encode("utf-8")) LOG.info("Size of embedded JSON data: %d bytes", json_size) + # load system metrics + sysmetrics_json, sysmetrics_sources = _load_sysmetrics(args.sysmetrics_file) + # read script with open(args.script, "r") as f: js_src = f.read() @@ -97,6 +114,8 @@ def main(args): configfile_content=open(args.configfile, "r").read() if args.configfile.is_file() else "", + sysmetrics_data=sysmetrics_json, + sysmetrics_sources=sysmetrics_sources, ) LOG.info("Size of generated HTML: %d bytes", len(html.encode("utf-8"))) @@ -139,6 +158,12 @@ def main(args): type=Path, help="Path to config file for the evalml run.", ) + parser.add_argument( + "--sysmetrics_file", + type=Path, + default=None, + help="Path to system metrics JSON produced by parse_inference_logs.py.", + ) parser.add_argument( "--output", type=Path, From 6755cdab125ac0594afe9886d0ebd2d73868948e Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Wed, 6 May 2026 10:01:04 +0200 Subject: [PATCH 02/16] fix referencing --- workflow/rules/report.smk | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 66843b5e..452e5d7f 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -59,11 +59,9 @@ rule collect_system_metrics: log_dir=str(OUT_ROOT / "logs/inference_execute"), run: import json - import sys from pathlib import Path - sys.path.insert(0, str(Path(workflow.snakefile).parent / "scripts")) - from parse_inference_logs import parse_logs + from diagnostics import parse_logs records = parse_logs( log_files=params.log_files, From d8249d575d4ce976953f97017f038f2e4ee730aa Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Wed, 6 May 2026 10:32:34 +0200 Subject: [PATCH 03/16] exclude rulegraph and dag from being tracked --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 50b2ff7d..346ade17 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,5 @@ uv.lock # evalml .evalml_snakemake_cmd.txt +rulegraph.svg +dag.svg From 1b871a7af9b55237146b21a31c67addf8d8d557a Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Wed, 6 May 2026 13:53:30 +0200 Subject: [PATCH 04/16] expose individual runs --- resources/report/dashboard/script.js | 78 ++++++++++++------- .../report/dashboard/template.html.jinja2 | 8 ++ src/diagnostics/__init__.py | 19 +++-- tests/unit/test_parse_inference_logs.py | 61 ++++++++++++++- workflow/rules/report.smk | 11 +-- .../scripts/report_experiment_dashboard.py | 19 +++-- 6 files changed, 147 insertions(+), 49 deletions(-) diff --git a/resources/report/dashboard/script.js b/resources/report/dashboard/script.js index 2561d7bf..dddc3fd3 100644 --- a/resources/report/dashboard/script.js +++ b/resources/report/dashboard/script.js @@ -193,6 +193,15 @@ const sysDataEl = document.getElementById("sysmetrics-data"); const sysData = sysDataEl ? JSON.parse(sysDataEl.textContent) : []; if (sysData.length > 0) { + choicesInstances["sys-model-type-select"] = new Choices("#sys-model-type-select", { + searchEnabled: false, + removeItemButton: true, + shouldSort: false, + itemSelectText: "", + placeholder: false, + }); + document.getElementById("sys-model-type-select").addEventListener("change", updateSysChart); + choicesInstances["sys-source-select"] = new Choices("#sys-source-select", { searchEnabled: false, removeItemButton: true, @@ -209,42 +218,57 @@ if (sysData.length > 0) { }, "resolve": { "scale": { "y": "independent" } }, "spec": { - "width": 220, - "height": 220, - "layer": [ - { - "mark": { "type": "bar", "opacity": 0.7 }, - "encoding": { - "x": { "field": "source", "type": "nominal", "axis": { "labelAngle": -30, "title": null } }, - "y": { "field": "value", "aggregate": "mean", "type": "quantitative", "title": "mean" }, - "color": { "field": "source", "type": "nominal", "legend": { "orient": "top", "title": "Source" } }, - }, + "width": 280, + "height": 240, + "mark": { "type": "point", "filled": true, "size": 70, "opacity": 0.85 }, + "encoding": { + "x": { + "field": "source", + "type": "nominal", + "axis": { "labelAngle": -30, "title": null } + }, + "y": { + "field": "value", + "type": "quantitative", + "title": null, + "scale": { "zero": true } + }, + "color": { + "field": "model_type", + "type": "nominal", + "legend": { "orient": "top", "title": "Model type" } }, - { - "mark": { "type": "point", "filled": true, "size": 40, "opacity": 0.9 }, - "encoding": { - "x": { "field": "source", "type": "nominal" }, - "y": { "field": "value", "type": "quantitative" }, - "color": { "field": "source", "type": "nominal" }, - "tooltip": [ - { "field": "source", "type": "nominal", "title": "Source" }, - { "field": "init_time", "type": "nominal", "title": "Init time" }, - { "field": "metric", "type": "nominal", "title": "Metric" }, - { "field": "value", "type": "quantitative", "title": "Value", "format": ".3f" }, - { "field": "n_gpu", "type": "quantitative", "title": "GPUs" }, - { "field": "job_id", "type": "nominal", "title": "Job ID" }, - ], - }, + "shape": { + "field": "model_type", + "type": "nominal", + "legend": { "orient": "top", "title": "Model type" } }, - ], + "tooltip": [ + { "field": "source", "type": "nominal", "title": "Source" }, + { "field": "model_type", "type": "nominal", "title": "Model type" }, + { "field": "init_time", "type": "nominal", "title": "Init time" }, + { "field": "metric", "type": "nominal", "title": "Metric" }, + { "field": "value", "type": "quantitative", "title": "Value", "format": ".3f" }, + { "field": "n_gpu", "type": "quantitative", "title": "GPUs" }, + { "field": "job_id", "type": "nominal", "title": "Job ID" }, + ], + }, }, }; function updateSysChart() { + const selectedModelTypes = getSelectedValues("sys-model-type-select"); const selectedSources = getSelectedValues("sys-source-select"); const newSpec = JSON.parse(JSON.stringify(sysSpec)); + const filters = []; + if (selectedModelTypes.length > 0) { + filters.push({ field: "model_type", oneOf: selectedModelTypes }); + } if (selectedSources.length > 0) { - newSpec.transform = [{ filter: { field: "source", oneOf: selectedSources } }]; + filters.push({ field: "source", oneOf: selectedSources }); + } + if (filters.length > 0) { + newSpec.transform = [{ filter: { and: filters } }]; } vegaEmbed("#sys-vis", newSpec, { actions: false }); } diff --git a/resources/report/dashboard/template.html.jinja2 b/resources/report/dashboard/template.html.jinja2 index d0ba035c..6710d011 100644 --- a/resources/report/dashboard/template.html.jinja2 +++ b/resources/report/dashboard/template.html.jinja2 @@ -162,6 +162,14 @@ {% if sysmetrics_sources %}
+
+ + +
+
+ + +
From 2d9f3582a7972a089c61704dbf91688606df2a8e Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Fri, 5 Jun 2026 17:02:27 +0200 Subject: [PATCH 08/16] fix error in inference from merging in main --- workflow/rules/inference.smk | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/workflow/rules/inference.smk b/workflow/rules/inference.smk index 0790dd4a..8ab420bd 100644 --- a/workflow/rules/inference.smk +++ b/workflow/rules/inference.smk @@ -369,16 +369,5 @@ rule inference_execute: fi ) > {log} 2>&1 - srun \ - --unbuffered \ - --partition={resources.slurm_partition} \ - --cpus-per-task={resources.cpus_per_task} \ - --mem-per-cpu={resources.mem_mb_per_cpu} \ - --time={resources.runtime} \ - --gres={resources.gres} \ - --ntasks={resources.ntasks} \ - anemoi-inference run config.yaml "${{CMD_ARGS[@]}}" - ' - ) >{log} 2>&1 """ # fmt: on From 16c4df7cf34eb1de1cbbf1686a9e8736255ca2c9 Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Fri, 5 Jun 2026 17:06:30 +0200 Subject: [PATCH 09/16] linting --- src/diagnostics/__init__.py | 6 +----- tests/unit/test_parse_inference_logs.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/diagnostics/__init__.py b/src/diagnostics/__init__.py index 58a19152..aca5cad8 100644 --- a/src/diagnostics/__init__.py +++ b/src/diagnostics/__init__.py @@ -145,11 +145,7 @@ def _vals(col: str) -> list[float]: idx = col_idx.get(col) if idx is None: return [] - return [ - r[idx] - for r in data_rows - if idx < len(r) and math.isfinite(r[idx]) - ] + return [r[idx] for r in data_rows if idx < len(r) and math.isfinite(r[idx])] result: dict = {} diff --git a/tests/unit/test_parse_inference_logs.py b/tests/unit/test_parse_inference_logs.py index 9c96baa7..cdf58d19 100644 --- a/tests/unit/test_parse_inference_logs.py +++ b/tests/unit/test_parse_inference_logs.py @@ -135,7 +135,9 @@ def test_parse_logs_no_metrics_files_is_skipped(tmp_path): workdir = tmp_path / "empty_run" workdir.mkdir() records = parse_logs( - run_info=[{"workdir": str(workdir), "run_id": "x", "init_time": "202503010000"}], + run_info=[ + {"workdir": str(workdir), "run_id": "x", "init_time": "202503010000"} + ], label_map={}, gpu_map={}, ) @@ -152,7 +154,9 @@ def test_parse_logs_model_type_from_run_id_prefix(tmp_path): (workdir / "slurm_metrics.log").write_text(SACCT_CONTENT) records = parse_logs( - run_info=[{"workdir": str(workdir), "run_id": prefix, "init_time": "202503010000"}], + run_info=[ + {"workdir": str(workdir), "run_id": prefix, "init_time": "202503010000"} + ], label_map={}, gpu_map={}, ) @@ -165,7 +169,9 @@ def test_parse_logs_fallback_label_is_run_id(tmp_path): (workdir / "slurm_metrics.log").write_text(SACCT_CONTENT) records = parse_logs( - run_info=[{"workdir": str(workdir), "run_id": "env-abc", "init_time": "202503020000"}], + run_info=[ + {"workdir": str(workdir), "run_id": "env-abc", "init_time": "202503020000"} + ], label_map={}, gpu_map={}, ) From b9ec45e782e263825b67226224bcab7dfc0a161c Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Sun, 7 Jun 2026 12:24:51 +0200 Subject: [PATCH 10/16] convert to wall time to minutes --- src/diagnostics/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diagnostics/__init__.py b/src/diagnostics/__init__.py index aca5cad8..703c161a 100644 --- a/src/diagnostics/__init__.py +++ b/src/diagnostics/__init__.py @@ -9,7 +9,7 @@ # Columns exposed as distribution metrics in the dashboard SYSMETRICS_COLS = { - "wall_time_s": "Wall Time (s)", + "wall_time_s": "Wall Time (min)", "gpu_hours": "GPU Hours", "max_rss_mb": "Peak CPU Memory (MB)", "gpu_util_mean": "Mean GPU Util (%)", @@ -259,8 +259,9 @@ def melt_for_dashboard(records: list[dict]) -> tuple[str, list[str], list[str]]: } for col, label in SYSMETRICS_COLS.items(): if r.get(col) is not None: - long_records.append({**base, "metric": label, "value": r[col]}) + value = r[col] / 60 if col == "wall_time_s" else r[col] + long_records.append({**base, "metric": label, "value": round(value, 2)}) - sources = sorted({r["source"] for r in records}) - model_types = sorted({r.get("model_type", "unknown") for r in records}) + sources = sorted({r["source"] for r in records if r["source"] is not None}) + model_types = sorted({r.get("model_type") or "unknown" for r in records}) return json.dumps(long_records), sources, model_types From 642443027130e847a648e570c6cff40bbde32642 Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Sun, 7 Jun 2026 12:25:19 +0200 Subject: [PATCH 11/16] fix getSelected error --- resources/report/dashboard/script.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/report/dashboard/script.js b/resources/report/dashboard/script.js index beb8f85e..086fb50a 100644 --- a/resources/report/dashboard/script.js +++ b/resources/report/dashboard/script.js @@ -119,7 +119,7 @@ if (sysData.length > 0) { "field": "value", "type": "quantitative", "title": null, - "scale": { "zero": false } + "scale": { "zero": true } }, "color": { "field": "source", @@ -150,9 +150,9 @@ if (sysData.length > 0) { }; function updateSysChart() { - const selectedModelTypes = getSelectedValues("sys-model-type-select"); - const selectedSources = getSelectedValues("sys-source-select"); - const selectedMetrics = getSelectedValues("sys-metric-select"); + const selectedModelTypes = getSelected("sys-model-type-select"); + const selectedSources = getSelected("sys-source-select"); + const selectedMetrics = getSelected("sys-metric-select"); const newSpec = JSON.parse(JSON.stringify(sysSpec)); const filters = []; if (selectedModelTypes.length > 0) { From 1709ed7096afdd4ab129074d9a25f85a456388c7 Mon Sep 17 00:00:00 2001 From: Jonas Bhend Date: Sun, 7 Jun 2026 12:26:37 +0200 Subject: [PATCH 12/16] remove duplicates --- resources/report/dashboard/template.html.jinja2 | 8 -------- 1 file changed, 8 deletions(-) diff --git a/resources/report/dashboard/template.html.jinja2 b/resources/report/dashboard/template.html.jinja2 index fc39cb02..93d8ec28 100644 --- a/resources/report/dashboard/template.html.jinja2 +++ b/resources/report/dashboard/template.html.jinja2 @@ -253,18 +253,10 @@ {{ sysmetrics_data | safe | indent(8, false) }} - - - -