From e8090bd0f8bf146f2be36dc8a8b1d10d1baf9092 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 11:29:55 -0500 Subject: [PATCH 01/21] benchmarks hardening! --- .github/workflows/bench.yml | 84 ++++++++++++++++++-- .github/workflows/test.yml | 4 +- toolchain/mfc/bench.py | 101 +++++++++++++++++++---- toolchain/mfc/test/test.py | 154 +++++++++++++++++++++++------------- 4 files changed, 263 insertions(+), 80 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 563566788e..131010106a 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -23,7 +23,7 @@ jobs: filters: ".github/file-filter.yml" self: - name: "${{ matrix.name }} (${{ matrix.device }})" + name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }} needs: file-changes strategy: @@ -73,7 +73,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 1400 + timeout-minutes: 480 env: ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true @@ -99,9 +99,83 @@ jobs: - name: Bench (Master v. PR) run: | - (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) & - (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) & - wait %1 && wait %2 + set -e + + # Function to submit and monitor + submit_and_monitor() { + local dir=$1 + local device=$2 + local interface=$3 + local cluster=$4 + + cd "$dir" + + # Submit job + submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ + .github/workflows/$cluster/bench.sh $device $interface 2>&1) + + job_id=$(echo "$submit_output" | grep -oP 'Submitted batch job \K[0-9]+' || echo "") + job_slug="bench-$device-$interface" + output_file="${job_slug}.out" + + if [ -z "$job_id" ]; then + echo "ERROR: Failed to submit job" + echo "$submit_output" + return 1 + fi + + echo "Submitted batch job $job_id" + echo "Monitoring output file: $output_file" + + # Wait for file to appear (check job status if it takes a while) + echo "Waiting for job to start..." + while [ ! -f "$output_file" ]; do + # Check if job failed to start + if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then + echo "ERROR: Job $job_id finished without creating output file" + return 1 + fi + sleep 5 + done + + echo "=== Streaming output for job $job_id ===" + # Stream output while job runs + tail -f "$output_file" & + tail_pid=$! 
+ + # Wait for job to complete (will wait up to GitHub Actions timeout) + while squeue -j "$job_id" &>/dev/null; do + sleep 5 + done + + # Stop tailing + kill $tail_pid 2>/dev/null || true + + echo "" + echo "=== Final output ===" + cat "$output_file" + + # Check exit status + exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+' || echo "0:0") + if [ "$exit_code" != "0" ] && [ "$exit_code" != "0:0" ]; then + echo "ERROR: Job $job_id failed with exit code $exit_code" + return 1 + fi + } + + # Run both jobs with monitoring + (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & + pr_pid=$! + + (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & + master_pid=$! + + wait $pr_pid && pr_exit=$? || pr_exit=$? + wait $master_pid && master_exit=$? || master_exit=$? + + if [ $pr_exit -ne 0 ] || [ $master_exit -ne 0 ]; then + exit 1 + fi - name: Generate & Post Comment run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d7b4703a1e..65707799b2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -93,11 +93,11 @@ jobs: OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }} self: - name: Self Hosted + name: "${{ matrix.lbl == 'gt' && 'Georgia Tech | Phoenix' || 'Oak Ridge | Frontier' }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' needs: file-changes continue-on-error: false - timeout-minutes: 1400 + timeout-minutes: 480 strategy: matrix: device: ['gpu'] diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index eb1b120035..89d5a6fb1b 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -16,6 +16,7 @@ class BenchCase: path: str args: typing.List[str] +# pylint: disable=too-many-locals, too-many-branches, too-many-statements def bench(targets = None): if targets is None: targets = ARG("targets") @@ -36,6 +37,10 @@ def bench(targets = None): case.args = case.args + ARG("--") case.path = os.path.abspath(case.path) + # Validate case file exists early + if not os.path.exists(case.path): + raise MFCException(f"Benchmark case file not found: {case.path}") + results = { "metadata": { "invocation": sys.argv[1:], @@ -44,6 +49,8 @@ def bench(targets = None): "cases": {}, } + failed_cases = [] + for i, case in enumerate(CASES): summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml") log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out") @@ -54,21 +61,82 @@ def bench(targets = None): cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]") cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]") - with open(log_filepath, "w") as log_file: - system( - ["./mfc.sh", "run", case.path, "--case-optimization"] + - ["--targets"] + [t.name for t in targets] + - ["--output-summary", summary_filepath] + - case.args + - ["--", "--gbpp", ARG('mem')], - stdout=log_file, - stderr=subprocess.STDOUT) - - results["cases"][case.slug] = { - "description": dataclasses.asdict(case), - "output_summary": file_load_yaml(summary_filepath), - } + try: + with open(log_filepath, "w") as log_file: + result = system( + ["./mfc.sh", "run", case.path, "--case-optimization"] + + ["--targets"] + [t.name for t in targets] + + ["--output-summary", summary_filepath] + + case.args + + ["--", "--gbpp", ARG('mem')], + stdout=log_file, + stderr=subprocess.STDOUT) + + # Check return 
code + if result.returncode != 0: + cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {result.returncode}") + cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") + failed_cases.append(case.slug) + cons.unindent() + continue + + # Validate summary file exists + if not os.path.exists(summary_filepath): + cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}") + cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]") + failed_cases.append(case.slug) + cons.unindent() + continue + + # Load summary + summary = file_load_yaml(summary_filepath) + + # Validate all targets have required data + validation_failed = False + for target in targets: + if target.name not in summary: + cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}") + validation_failed = True + break + + if "exec" not in summary[target.name]: + cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}") + validation_failed = True + break + + if target.name == "simulation" and "grind" not in summary[target.name]: + cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}") + validation_failed = True + break + + if validation_failed: + failed_cases.append(case.slug) + cons.unindent() + continue + + # Add to results + results["cases"][case.slug] = { + "description": dataclasses.asdict(case), + "output_summary": summary, + } + cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully") + + except Exception as e: + cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") + failed_cases.append(case.slug) + + cons.unindent() + + # Report results + if failed_cases: + cons.print() + cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]") + for slug in failed_cases: + cons.print(f" - {slug}") + cons.print() + raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed") + # Write output file_dump_yaml(ARG("output"), results) cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].") @@ -140,8 +208,9 @@ def diff(): if grind_time_value <0.95: cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") err = 1 - except Exception as _: - pass + except Exception as e: + cons.print(f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}") + err = 1 table.add_row(f"[magenta]{slug}[/magenta]", *speedups) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 0ba6e160f7..c52334d233 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -1,4 +1,4 @@ -import os, typing, shutil, time, itertools +import os, typing, shutil, time, itertools, signal from random import sample, seed import rich, rich.table @@ -23,6 +23,19 @@ total_test_count = 0 errors = [] +# Early abort thresholds +MIN_CASES_BEFORE_ABORT = 20 +FAILURE_RATE_THRESHOLD = 0.3 + +# Per-test timeout (1 hour) +TEST_TIMEOUT_SECONDS = 3600 + +class TestTimeoutError(MFCException): + pass + +def timeout_handler(signum, frame): + raise TestTimeoutError("Test case exceeded 1 hour timeout") + # pylint: disable=too-many-branches, trailing-whitespace def __filter(cases_) -> typing.List[TestCase]: cases = cases_[:] @@ -186,91 +199,106 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): global 
current_test_number start_time = time.time() + # Set timeout alarm + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(TEST_TIMEOUT_SECONDS) + tol = case.compute_tolerance() case.delete_output() case.create_directory() if ARG("dry_run"): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") + signal.alarm(0) # Cancel alarm return - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + try: + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) - out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") + out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") - common.file_write(out_filepath, cmd.stdout) + common.file_write(out_filepath, cmd.stdout) - if cmd.returncode != 0: - cons.print(cmd.stdout) - raise MFCException(f"Test {case}: Failed to execute MFC.") + if cmd.returncode != 0: + cons.print(cmd.stdout) + raise MFCException(f"Test {case}: Failed to execute MFC.") - pack, err = packer.pack(case.get_dirpath()) - if err is not None: - raise MFCException(f"Test {case}: {err}") + pack, err = packer.pack(case.get_dirpath()) + if err is not None: + raise MFCException(f"Test {case}: {err}") - if pack.has_NaNs(): - raise MFCException(f"Test {case}: NaNs detected in the case.") + if pack.has_NaNs(): + raise MFCException(f"Test {case}: NaNs detected in the case.") - golden_filepath = os.path.join(case.get_dirpath(), "golden.txt") - if ARG("generate"): - common.delete_file(golden_filepath) - pack.save(golden_filepath) - else: - if not os.path.isfile(golden_filepath): - raise MFCException(f"Test {case}: The golden file does not exist! To generate golden files, use the '--generate' flag.") + golden_filepath = os.path.join(case.get_dirpath(), "golden.txt") + if ARG("generate"): + common.delete_file(golden_filepath) + pack.save(golden_filepath) + else: + if not os.path.isfile(golden_filepath): + raise MFCException(f"Test {case}: The golden file does not exist! 
To generate golden files, use the '--generate' flag.") - golden = packer.load(golden_filepath) + golden = packer.load(golden_filepath) - if ARG("add_new_variables"): - for pfilepath, pentry in list(pack.entries.items()): - if golden.find(pfilepath) is None: - golden.set(pentry) + if ARG("add_new_variables"): + for pfilepath, pentry in list(pack.entries.items()): + if golden.find(pfilepath) is None: + golden.set(pentry) - for gfilepath, gentry in list(golden.entries.items()): - if pack.find(gfilepath) is None: - golden.remove(gentry) + for gfilepath, gentry in list(golden.entries.items()): + if pack.find(gfilepath) is None: + golden.remove(gentry) - golden.save(golden_filepath) - else: - err, msg = packtol.compare(pack, packer.load(golden_filepath), packtol.Tolerance(tol, tol)) - if msg is not None: - raise MFCException(f"Test {case}: {msg}") + golden.save(golden_filepath) + else: + err, msg = packtol.compare(pack, packer.load(golden_filepath), packtol.Tolerance(tol, tol)) + if msg is not None: + raise MFCException(f"Test {case}: {msg}") - if ARG("test_all"): - case.delete_output() - cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) - out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") - common.file_write(out_filepath, cmd.stdout) + if ARG("test_all"): + case.delete_output() + cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) + out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") + common.file_write(out_filepath, cmd.stdout) - for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')): - silo_filepath = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0', silo_filepath) - h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" + for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')): + silo_filepath = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0', silo_filepath) + h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" - if not os.path.exists(h5dump or ""): - if not does_command_exist("h5dump"): - raise MFCException("h5dump couldn't be found.") + if not os.path.exists(h5dump or ""): + if not does_command_exist("h5dump"): + raise MFCException("h5dump couldn't be found.") - h5dump = shutil.which("h5dump") + h5dump = shutil.which("h5dump") - output, err = get_program_output([h5dump, silo_filepath]) + output, err = get_program_output([h5dump, silo_filepath]) - if err != 0: - raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if err != 0: + raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - if "nan," in output: - raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if "nan," in output: + raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - if "inf," in output: - raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if "inf," in output: + raise MFCException(f"Test {case}: Post Process has detected an Infinity. 
You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - case.delete_output() + case.delete_output() - end_time = time.time() - duration = end_time - start_time + end_time = time.time() + duration = end_time - start_time - current_test_number += 1 - progress_str = f"({current_test_number:3d}/{total_test_count:3d})" - cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}") + current_test_number += 1 + progress_str = f"({current_test_number:3d}/{total_test_count:3d})" + cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}") + + except TestTimeoutError: + raise MFCException( + f"Test {case} exceeded 1 hour timeout. " + f"This may indicate a hung simulation or misconfigured case. " + f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}" + ) + finally: + signal.alarm(0) # Cancel alarm def handle_case(case: TestCase, devices: typing.Set[int]): @@ -301,4 +329,16 @@ def handle_case(case: TestCase, devices: typing.Set[int]): errors.append(f"[bold red]Failed test {case} after {nAttempts} attempt(s).[/bold red]") errors.append(f"{exc}") + # Check if we should abort early due to high failure rate + total_completed = nFAIL + nPASS + if total_completed >= MIN_CASES_BEFORE_ABORT: + failure_rate = nFAIL / total_completed + if failure_rate >= FAILURE_RATE_THRESHOLD: + cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]") + cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") + cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n") + raise MFCException( + f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)" + ) + return From 8c01b99e057605c2ae7a847f2258f397484258a0 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 11:33:35 -0500 Subject: [PATCH 02/21] lint error --- toolchain/mfc/test/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index c52334d233..a63374e3c3 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -291,12 +291,12 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): progress_str = f"({current_test_number:3d}/{total_test_count:3d})" cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}") - except TestTimeoutError: + except TestTimeoutError as exc: raise MFCException( f"Test {case} exceeded 1 hour timeout. " f"This may indicate a hung simulation or misconfigured case. 
" f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}" - ) + ) from exc finally: signal.alarm(0) # Cancel alarm From 2e1eb87e1ffb9dbed5846fe6f47e49ed03606bb6 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 11:44:59 -0500 Subject: [PATCH 03/21] minor --- toolchain/mfc/test/test.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index a63374e3c3..77e4dead30 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -330,15 +330,17 @@ def handle_case(case: TestCase, devices: typing.Set[int]): errors.append(f"{exc}") # Check if we should abort early due to high failure rate - total_completed = nFAIL + nPASS - if total_completed >= MIN_CASES_BEFORE_ABORT: - failure_rate = nFAIL / total_completed - if failure_rate >= FAILURE_RATE_THRESHOLD: - cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]") - cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") - cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n") - raise MFCException( - f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)" - ) + # Skip this check during dry-run (only builds, doesn't run tests) + if not ARG("dry_run"): + total_completed = nFAIL + nPASS + if total_completed >= MIN_CASES_BEFORE_ABORT: + failure_rate = nFAIL / total_completed + if failure_rate >= FAILURE_RATE_THRESHOLD: + cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]") + cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") + cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n") + raise MFCException( + f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)" + ) return From 0668fedefcba5302448c99bcbf8cccbfe519601b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 11:52:07 -0500 Subject: [PATCH 04/21] forks look nicer now i think --- .github/workflows/test.yml | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 65707799b2..251b276ace 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -93,23 +93,40 @@ jobs: OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }} self: - name: "${{ matrix.lbl == 'gt' && 'Georgia Tech | Phoenix' || 'Oak Ridge | Frontier' }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" + name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' needs: file-changes continue-on-error: false timeout-minutes: 480 strategy: matrix: - device: ['gpu'] - interface: ['acc', 'omp'] - lbl: ['gt', 'frontier'] include: - - device: 'cpu' + # Phoenix (GT) + - lbl: 'gt' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'acc' + - lbl: 'gt' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'omp' + - lbl: 'gt' + cluster_name: 'Georgia Tech | Phoenix' + device: 'cpu' interface: 'none' - lbl: 'gt' - - device: 'cpu' + # Frontier (ORNL) + - lbl: 'frontier' + 
cluster_name: 'Oak Ridge | Frontier' + device: 'gpu' + interface: 'acc' + - lbl: 'frontier' + cluster_name: 'Oak Ridge | Frontier' + device: 'gpu' + interface: 'omp' + - lbl: 'frontier' + cluster_name: 'Oak Ridge | Frontier' + device: 'cpu' interface: 'none' - lbl: 'frontier' runs-on: group: phoenix labels: ${{ matrix.lbl }} From f79f4d16969801501643b22f82f8a75658ecb477 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 12:08:11 -0500 Subject: [PATCH 05/21] more fixing! --- .github/scripts/monitor_slurm_job.sh | 65 +++++++++++++++++++++ .github/workflows/bench.yml | 41 +------------- toolchain/mfc/bench.py | 20 ++++--- toolchain/mfc/test/test.py | 84 ++++++++++++++++++---------- 4 files changed, 136 insertions(+), 74 deletions(-) create mode 100755 .github/scripts/monitor_slurm_job.sh diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh new file mode 100755 index 0000000000..b6d43010df --- /dev/null +++ b/.github/scripts/monitor_slurm_job.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Monitor a SLURM job and stream its output in real-time +# Usage: monitor_slurm_job.sh + +set -e + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +job_id="$1" +output_file="$2" + +echo "Submitted batch job $job_id" +echo "Monitoring output file: $output_file" + +# Wait for file to appear (check job status if it takes a while) +echo "Waiting for job to start..." +while [ ! -f "$output_file" ]; do + # Check if job failed to start + if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then + echo "ERROR: Job $job_id finished without creating output file" + exit 1 + fi + sleep 5 +done + +echo "=== Streaming output for job $job_id ===" +# Stream output while job runs +tail -f "$output_file" & +tail_pid=$! + +# Wait for job to complete with retry logic for transient squeue failures +squeue_failures=0 +while true; do + if squeue -j "$job_id" &>/dev/null; then + squeue_failures=0 + else + squeue_failures=$((squeue_failures + 1)) + # Allow a few transient failures before concluding job is done + if [ $squeue_failures -ge 3 ]; then + break + fi + fi + sleep 5 +done + +# Stop tailing +kill $tail_pid 2>/dev/null || true + +echo "" +echo "=== Final output ===" +cat "$output_file" + +# Check exit status +exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+:[0-9]+' || echo "0:0") +if [ "$exit_code" != "0:0" ]; then + echo "ERROR: Job $job_id failed with exit code $exit_code" + exit 1 +fi + +echo "Job $job_id completed successfully" +exit 0 + diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 131010106a..cbe83de6b7 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -101,7 +101,7 @@ jobs: run: | set -e - # Function to submit and monitor + # Function to submit and monitor using extracted script submit_and_monitor() { local dir=$1 local device=$2 @@ -124,43 +124,8 @@ jobs: return 1 fi - echo "Submitted batch job $job_id" - echo "Monitoring output file: $output_file" - - # Wait for file to appear (check job status if it takes a while) - echo "Waiting for job to start..." - while [ ! -f "$output_file" ]; do - # Check if job failed to start - if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then - echo "ERROR: Job $job_id finished without creating output file" - return 1 - fi - sleep 5 - done - - echo "=== Streaming output for job $job_id ===" - # Stream output while job runs - tail -f "$output_file" & - tail_pid=$! 
- - # Wait for job to complete (will wait up to GitHub Actions timeout) - while squeue -j "$job_id" &>/dev/null; do - sleep 5 - done - - # Stop tailing - kill $tail_pid 2>/dev/null || true - - echo "" - echo "=== Final output ===" - cat "$output_file" - - # Check exit status - exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+' || echo "0:0") - if [ "$exit_code" != "0" ] && [ "$exit_code" != "0:0" ]; then - echo "ERROR: Job $job_id failed with exit code $exit_code" - return 1 - fi + # Use the monitoring script + bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" } # Run both jobs with monitoring diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index 89d5a6fb1b..a9441febc4 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -68,13 +68,14 @@ def bench(targets = None): ["--targets"] + [t.name for t in targets] + ["--output-summary", summary_filepath] + case.args + - ["--", "--gbpp", ARG('mem')], + ["--", "--gbpp", str(ARG('mem'))], stdout=log_file, stderr=subprocess.STDOUT) - # Check return code - if result.returncode != 0: - cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {result.returncode}") + # Check return code (handle CompletedProcess or int defensively) + rc = result.returncode if hasattr(result, "returncode") else result + if rc != 0: + cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}") cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") failed_cases.append(case.slug) cons.unindent() @@ -111,7 +112,6 @@ def bench(targets = None): if validation_failed: failed_cases.append(case.slug) - cons.unindent() continue # Add to results @@ -124,8 +124,8 @@ def bench(targets = None): except Exception as e: cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") failed_cases.append(case.slug) - - cons.unindent() + finally: + cons.unindent() # Report results if failed_cases: @@ -209,7 +209,11 @@ def diff(): cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") err = 1 except Exception as e: - cons.print(f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}") + import traceback + cons.print( + f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n" + f"{traceback.format_exc()}" + ) err = 1 table.add_row(f"[magenta]{slug}[/magenta]", *speedups) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 77e4dead30..cfac342058 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -1,4 +1,4 @@ -import os, typing, shutil, time, itertools, signal +import os, typing, shutil, time, itertools, threading from random import sample, seed import rich, rich.table @@ -30,12 +30,15 @@ # Per-test timeout (1 hour) TEST_TIMEOUT_SECONDS = 3600 +# Global abort flag for thread-safe early termination +# This flag is set when the failure rate exceeds the threshold, signaling +# all worker threads to exit gracefully. This avoids raising exceptions +# from worker threads which could leave the scheduler in an undefined state. 
+abort_tests = threading.Event() + class TestTimeoutError(MFCException): pass -def timeout_handler(signum, frame): - raise TestTimeoutError("Test case exceeded 1 hour timeout") - # pylint: disable=too-many-branches, trailing-whitespace def __filter(cases_) -> typing.List[TestCase]: cases = cases_[:] @@ -173,6 +176,15 @@ def test(): [ sched.Task(ppn=case.ppn, func=handle_case, args=[case], load=case.get_cell_count()) for case in cases ], ARG("jobs"), ARG("gpus")) + # Check if we aborted due to high failure rate + if abort_tests.is_set(): + total_completed = nFAIL + nPASS + cons.print() + cons.unindent() + raise MFCException( + f"Excessive test failures: {nFAIL}/{total_completed} failed ({nFAIL/total_completed*100:.1f}%)" + ) + nSKIP = len(skipped_cases) cons.print() cons.unindent() @@ -199,9 +211,12 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): global current_test_number start_time = time.time() - # Set timeout alarm - signal.signal(signal.SIGALRM, timeout_handler) - signal.alarm(TEST_TIMEOUT_SECONDS) + # Set timeout using threading.Timer (works in worker threads) + # Note: signal.alarm() only works in the main thread, so we use + # threading.Timer which works correctly in worker threads spawned by sched.sched + timeout_flag = threading.Event() + timeout_timer = threading.Timer(TEST_TIMEOUT_SECONDS, timeout_flag.set) + timeout_timer.start() tol = case.compute_tolerance() case.delete_output() @@ -209,12 +224,19 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("dry_run"): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") - signal.alarm(0) # Cancel alarm + timeout_timer.cancel() return try: + # Check timeout before starting + if timeout_flag.is_set(): + raise TestTimeoutError("Test case exceeded 1 hour timeout") cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + # Check timeout after simulation + if timeout_flag.is_set(): + raise TestTimeoutError("Test case exceeded 1 hour timeout") + out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") common.file_write(out_filepath, cmd.stdout) @@ -261,26 +283,28 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") common.file_write(out_filepath, cmd.stdout) - for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')): - silo_filepath = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0', silo_filepath) - h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" + silo_dir = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0') + if os.path.isdir(silo_dir): + for silo_filename in os.listdir(silo_dir): + silo_filepath = os.path.join(silo_dir, silo_filename) + h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" - if not os.path.exists(h5dump or ""): - if not does_command_exist("h5dump"): - raise MFCException("h5dump couldn't be found.") + if not os.path.exists(h5dump or ""): + if not does_command_exist("h5dump"): + raise MFCException("h5dump couldn't be found.") - h5dump = shutil.which("h5dump") + h5dump = shutil.which("h5dump") - output, err = get_program_output([h5dump, silo_filepath]) + output, err = get_program_output([h5dump, silo_filepath]) - if err != 0: - raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if err != 0: + raise MFCException(f"Test {case}: Failed to run h5dump. 
You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - if "nan," in output: - raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if "nan," in output: + raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - if "inf," in output: - raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if "inf," in output: + raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") case.delete_output() @@ -298,7 +322,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}" ) from exc finally: - signal.alarm(0) # Cancel alarm + timeout_timer.cancel() # Cancel timeout timer def handle_case(case: TestCase, devices: typing.Set[int]): @@ -306,6 +330,10 @@ def handle_case(case: TestCase, devices: typing.Set[int]): global nFAIL, nPASS, nSKIP global errors + # Check if we should abort before processing this case + if abort_tests.is_set(): + return # Exit gracefully if abort was requested + nAttempts = 0 if ARG('single'): max_attempts = max(ARG('max_attempts'), 3) @@ -337,10 +365,10 @@ def handle_case(case: TestCase, devices: typing.Set[int]): failure_rate = nFAIL / total_completed if failure_rate >= FAILURE_RATE_THRESHOLD: cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]") - cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") - cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n") - raise MFCException( - f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)" - ) + cons.print("[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") + cons.print("[bold red]Aborting remaining tests to fail fast.[/bold red]\n") + # Set abort flag instead of raising exception from worker thread + abort_tests.set() + return # Exit gracefully return From 76a6e63a5d7dcd5a8ca069c560d858e0639ec2f8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 12:14:01 -0500 Subject: [PATCH 06/21] clean linter --- toolchain/mfc/bench.py | 3 +-- toolchain/mfc/test/test.py | 49 ++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index a9441febc4..ec54cc9cf2 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -1,4 +1,4 @@ -import os, sys, uuid, subprocess, dataclasses, typing, math +import os, sys, uuid, subprocess, dataclasses, typing, math, traceback import rich.table @@ -209,7 +209,6 @@ def diff(): cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") err = 1 except Exception as e: - import traceback cons.print( f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n" f"{traceback.format_exc()}" diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 
cfac342058..f1b04ba2fd 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -206,6 +206,36 @@ def test(): # pylint: disable=too-many-locals, too-many-branches, too-many-statements, trailing-whitespace +def _process_silo_file(silo_filepath: str, case: TestCase, out_filepath: str): + """Process a single silo file with h5dump and check for NaNs/Infinities.""" + h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" + + if not os.path.exists(h5dump or ""): + if not does_command_exist("h5dump"): + raise MFCException("h5dump couldn't be found.") + h5dump = shutil.which("h5dump") + + output, err = get_program_output([h5dump, silo_filepath]) + + if err != 0: + raise MFCException( + f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, " + f"and the case dictionary in {case.get_filepath()}." + ) + + if "nan," in output: + raise MFCException( + f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, " + f"and the case dictionary in {case.get_filepath()}." + ) + + if "inf," in output: + raise MFCException( + f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, " + f"and the case dictionary in {case.get_filepath()}." + ) + + def _handle_case(case: TestCase, devices: typing.Set[int]): # pylint: disable=global-statement, global-variable-not-assigned global current_test_number @@ -287,24 +317,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if os.path.isdir(silo_dir): for silo_filename in os.listdir(silo_dir): silo_filepath = os.path.join(silo_dir, silo_filename) - h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" - - if not os.path.exists(h5dump or ""): - if not does_command_exist("h5dump"): - raise MFCException("h5dump couldn't be found.") - - h5dump = shutil.which("h5dump") - - output, err = get_program_output([h5dump, silo_filepath]) - - if err != 0: - raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - - if "nan," in output: - raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - - if "inf," in output: - raise MFCException(f"Test {case}: Post Process has detected an Infinity. 
You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + _process_silo_file(silo_filepath, case, out_filepath) case.delete_output() From 7333098b1563322b050334cd53369f6c4e1e05b9 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 12:17:21 -0500 Subject: [PATCH 07/21] fix some more suggestions --- toolchain/mfc/bench.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index ec54cc9cf2..8ee9a256d9 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -78,7 +78,6 @@ def bench(targets = None): cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}") cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") failed_cases.append(case.slug) - cons.unindent() continue # Validate summary file exists @@ -86,7 +85,6 @@ def bench(targets = None): cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}") cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]") failed_cases.append(case.slug) - cons.unindent() continue # Load summary From b62854c42c6c65c3c2069681ad93016b07fa8e64 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 12:37:32 -0500 Subject: [PATCH 08/21] more hardening :O --- .github/scripts/monitor_slurm_job.sh | 94 +++++++++++++++++++++++++--- .github/workflows/bench.yml | 20 ++++-- toolchain/mfc/bench.py | 3 +- toolchain/mfc/test/test.py | 26 ++++++-- 4 files changed, 122 insertions(+), 21 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index b6d43010df..7e66377956 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -15,15 +15,30 @@ output_file="$2" echo "Submitted batch job $job_id" echo "Monitoring output file: $output_file" -# Wait for file to appear (check job status if it takes a while) +# Wait for file to appear with retry logic for transient squeue failures echo "Waiting for job to start..." +squeue_retries=0 +max_squeue_retries=5 while [ ! -f "$output_file" ]; do - # Check if job failed to start - if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then - echo "ERROR: Job $job_id finished without creating output file" - exit 1 + # Check if job is still queued/running + if squeue -j "$job_id" &>/dev/null; then + squeue_retries=0 # Reset on success + sleep 5 + else + squeue_retries=$((squeue_retries + 1)) + if [ $squeue_retries -ge $max_squeue_retries ]; then + # Job not in queue and output file doesn't exist + if [ ! -f "$output_file" ]; then + echo "ERROR: Job $job_id not in queue and output file not created" + exit 1 + fi + break + fi + # Exponential backoff + sleep_time=$((2 ** squeue_retries)) + echo "Warning: squeue check failed, retrying in ${sleep_time}s..." 
+ sleep $sleep_time fi - sleep 5 done echo "=== Streaming output for job $job_id ===" @@ -38,14 +53,49 @@ while true; do squeue_failures=0 else squeue_failures=$((squeue_failures + 1)) - # Allow a few transient failures before concluding job is done + # Check if job actually completed using sacct (if available) if [ $squeue_failures -ge 3 ]; then - break + if command -v sacct >/dev/null 2>&1; then + state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') + # Consider job done only if it reached a terminal state + case "$state" in + COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) + break + ;; + *) + # treat as transient failure, reset failures and continue polling + squeue_failures=0 + ;; + esac + else + # No sacct: avoid false positive by doing an extra check cycle + squeue_failures=2 + fi fi fi sleep 5 done +# Wait for output file to finish growing (stabilize) before stopping tail +if [ -f "$output_file" ]; then + last_size=-1 + same_count=0 + while true; do + size=$(stat -c%s "$output_file" 2>/dev/null || echo -1) + if [ "$size" -eq "$last_size" ] && [ "$size" -ge 0 ]; then + same_count=$((same_count + 1)) + else + same_count=0 + last_size=$size + fi + # two consecutive stable checks (~10s) implies file likely flushed + if [ $same_count -ge 2 ]; then + break + fi + sleep 5 + done +fi + # Stop tailing kill $tail_pid 2>/dev/null || true @@ -53,8 +103,32 @@ echo "" echo "=== Final output ===" cat "$output_file" -# Check exit status -exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+:[0-9]+' || echo "0:0") +# Check exit status with sacct fallback +exit_code="" + +# Try scontrol first (works for recent jobs) +scontrol_output=$(scontrol show job "$job_id" 2>/dev/null || echo "") +if [ -n "$scontrol_output" ]; then + exit_code=$(echo "$scontrol_output" | grep -oE 'ExitCode=[0-9]+:[0-9]+' | cut -d= -f2 || echo "") +fi + +# If scontrol failed or returned invalid job, try sacct (for completed/aged-out jobs) +if [ -z "$exit_code" ]; then + echo "Warning: scontrol failed to get exit code, trying sacct..." + sacct_output=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 || echo "") + if [ -n "$sacct_output" ]; then + exit_code="$sacct_output" + fi +fi + +# If we still can't determine exit code, fail explicitly +if [ -z "$exit_code" ]; then + echo "ERROR: Unable to determine exit status for job $job_id" + echo "Both scontrol and sacct failed to return valid exit code" + exit 1 +fi + +# Check if job succeeded if [ "$exit_code" != "0:0" ]; then echo "ERROR: Job $job_id failed with exit code $exit_code" exit 1 diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index cbe83de6b7..fbb6ba7072 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -114,7 +114,7 @@ jobs: submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ .github/workflows/$cluster/bench.sh $device $interface 2>&1) - job_id=$(echo "$submit_output" | grep -oP 'Submitted batch job \K[0-9]+' || echo "") + job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p') job_slug="bench-$device-$interface" output_file="${job_slug}.out" @@ -135,10 +135,20 @@ jobs: (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & master_pid=$! - wait $pr_pid && pr_exit=$? || pr_exit=$? - wait $master_pid && master_exit=$? || master_exit=$? 
- - if [ $pr_exit -ne 0 ] || [ $master_exit -ne 0 ]; then + # Wait and capture exit codes reliably + pr_exit=0 + master_exit=0 + + if ! wait "$pr_pid"; then + pr_exit=$? + fi + if ! wait "$master_pid"; then + master_exit=$? + fi + + # Explicitly check and quote to avoid test errors + if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then + echo "One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" exit 1 fi diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index 8ee9a256d9..feb33198ca 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -121,6 +121,7 @@ def bench(targets = None): except Exception as e: cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") + cons.print(f"[dim]{traceback.format_exc()}[/dim]") failed_cases.append(case.slug) finally: cons.unindent() @@ -203,7 +204,7 @@ def diff(): grind_time_value = lhs_summary[target.name]["grind"] / rhs_summary[target.name]["grind"] speedups[i] += f" & Grind: {grind_time_value:.2f}" - if grind_time_value <0.95: + if grind_time_value < 0.95: cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") err = 1 except Exception as e: diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index f1b04ba2fd..78d40d9eba 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -107,7 +107,7 @@ def __filter(cases_) -> typing.List[TestCase]: return selected_cases, skipped_cases def test(): - # pylint: disable=global-statement, global-variable-not-assigned + # pylint: disable=global-statement, global-variable-not-assigned, too-many-statements global nFAIL, nPASS, nSKIP, total_test_count global errors @@ -181,8 +181,13 @@ def test(): total_completed = nFAIL + nPASS cons.print() cons.unindent() + if total_completed > 0: + raise MFCException( + f"Excessive test failures: {nFAIL}/{total_completed} " + f"failed ({nFAIL/total_completed*100:.1f}%)" + ) raise MFCException( - f"Excessive test failures: {nFAIL}/{total_completed} failed ({nFAIL/total_completed*100:.1f}%)" + f"Excessive test failures: {nFAIL} failed, but no tests were completed." ) nSKIP = len(skipped_cases) @@ -242,8 +247,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): start_time = time.time() # Set timeout using threading.Timer (works in worker threads) - # Note: signal.alarm() only works in the main thread, so we use - # threading.Timer which works correctly in worker threads spawned by sched.sched + # Note: we intentionally do not use signal.alarm() here because signals + # only work in the main thread; sched.sched runs tests in worker threads. + # threading.Timer works correctly in this threaded context. 
timeout_flag = threading.Event() timeout_timer = threading.Timer(TEST_TIMEOUT_SECONDS, timeout_flag.set) timeout_timer.start() @@ -309,6 +315,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("test_all"): case.delete_output() + # Check timeout before launching the (potentially long) post-process run + if timeout_flag.is_set(): + raise TestTimeoutError("Test case exceeded 1 hour timeout") cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") common.file_write(out_filepath, cmd.stdout) @@ -329,10 +338,17 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}") except TestTimeoutError as exc: + log_path = os.path.join(case.get_dirpath(), 'out_pre_sim.txt') + if os.path.exists(log_path): + log_msg = f"Check the log at: {log_path}" + else: + log_msg = ( + f"Log file ({log_path}) may not exist if the timeout occurred early." + ) raise MFCException( f"Test {case} exceeded 1 hour timeout. " f"This may indicate a hung simulation or misconfigured case. " - f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}" + f"{log_msg}" ) from exc finally: timeout_timer.cancel() # Cancel timeout timer From db91d6ae244611adca7bd52fafac04f41037d810 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 8 Dec 2025 15:56:41 -0500 Subject: [PATCH 09/21] cleanup on the monitoring --- .github/scripts/monitor_slurm_job.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 7e66377956..e6bd4472d0 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -2,7 +2,15 @@ # Monitor a SLURM job and stream its output in real-time # Usage: monitor_slurm_job.sh -set -e +set -euo pipefail + +# Cleanup handler to prevent orphaned tail processes +cleanup() { + if [ -n "${tail_pid:-}" ]; then + kill "${tail_pid}" 2>/dev/null || true + fi +} +trap cleanup EXIT if [ $# -ne 2 ]; then echo "Usage: $0 " @@ -96,8 +104,8 @@ if [ -f "$output_file" ]; then done fi -# Stop tailing -kill $tail_pid 2>/dev/null || true +# Stop tailing (trap will also handle this on exit) +kill "${tail_pid}" 2>/dev/null || true echo "" echo "=== Final output ===" From 87fcd74b6222086089a19306640a917f0e9c5493 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 8 Dec 2025 16:12:31 -0500 Subject: [PATCH 10/21] better monitoring --- .github/scripts/monitor_slurm_job.sh | 14 +- .github/workflows/bench.yml | 28 +++- toolchain/mfc/bench.py | 203 ++++++++++++++------------- 3 files changed, 141 insertions(+), 104 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index e6bd4472d0..05eaf5a48b 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -50,15 +50,24 @@ while [ ! -f "$output_file" ]; do done echo "=== Streaming output for job $job_id ===" -# Stream output while job runs -tail -f "$output_file" & +# Stream output while job runs (explicitly redirect to ensure output visibility) +tail -f "$output_file" 2>&1 & tail_pid=$! 
+# Give tail a moment to start and show initial output +sleep 2 + # Wait for job to complete with retry logic for transient squeue failures squeue_failures=0 +heartbeat_counter=0 while true; do if squeue -j "$job_id" &>/dev/null; then squeue_failures=0 + # Print heartbeat every 60 seconds (12 iterations * 5 sec) + heartbeat_counter=$((heartbeat_counter + 1)) + if [ $((heartbeat_counter % 12)) -eq 0 ]; then + echo "[$(date +%H:%M:%S)] Job $job_id still running..." + fi else squeue_failures=$((squeue_failures + 1)) # Check if job actually completed using sacct (if available) @@ -68,6 +77,7 @@ while true; do # Consider job done only if it reached a terminal state case "$state" in COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) + echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state" break ;; *) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index fbb6ba7072..1c485c93a5 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -108,6 +108,7 @@ jobs: local interface=$3 local cluster=$4 + echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" # Submit job @@ -119,21 +120,33 @@ jobs: output_file="${job_slug}.out" if [ -z "$job_id" ]; then - echo "ERROR: Failed to submit job" + echo "[$dir] ERROR: Failed to submit job" echo "$submit_output" return 1 fi + echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" + # Use the monitoring script bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" + + echo "[$dir] Monitoring complete for job $job_id" } # Run both jobs with monitoring + echo "==========================================" + echo "Starting parallel benchmark jobs..." + echo "==========================================" + (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & pr_pid=$! + echo "PR job started in background (PID: $pr_pid)" (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & master_pid=$! + echo "Master job started in background (PID: $master_pid)" + + echo "Waiting for both jobs to complete..." # Wait and capture exit codes reliably pr_exit=0 @@ -141,16 +154,27 @@ jobs: if ! wait "$pr_pid"; then pr_exit=$? + echo "PR job exited with code: $pr_exit" + else + echo "PR job completed successfully" fi + if ! wait "$master_pid"; then master_exit=$? + echo "Master job exited with code: $master_exit" + else + echo "Master job completed successfully" fi # Explicitly check and quote to avoid test errors if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then - echo "One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" + echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" exit 1 fi + + echo "==========================================" + echo "Both benchmark jobs completed successfully!" 
+ echo "==========================================" - name: Generate & Post Comment run: | diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index feb33198ca..78b9b0fc65 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -29,118 +29,121 @@ def bench(targets = None): cons.print() cons.print(f"[bold]Benchmarking {format_list_to_string(ARG('targets'), 'magenta')} ([magenta]{os.path.relpath(bench_dirpath)}[/magenta]):[/bold]") cons.indent() - cons.print() - CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ] + try: + cons.print() - for case in CASES: - case.args = case.args + ARG("--") - case.path = os.path.abspath(case.path) + CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ] - # Validate case file exists early - if not os.path.exists(case.path): - raise MFCException(f"Benchmark case file not found: {case.path}") + for case in CASES: + case.args = case.args + ARG("--") + case.path = os.path.abspath(case.path) - results = { - "metadata": { - "invocation": sys.argv[1:], - "lock": dataclasses.asdict(CFG()) - }, - "cases": {}, - } + # Validate case file exists early + if not os.path.exists(case.path): + raise MFCException(f"Benchmark case file not found: {case.path}") - failed_cases = [] + results = { + "metadata": { + "invocation": sys.argv[1:], + "lock": dataclasses.asdict(CFG()) + }, + "cases": {}, + } - for i, case in enumerate(CASES): - summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml") - log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out") + failed_cases = [] - cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.slug} @ [bold]{os.path.relpath(case.path)}[/bold]") - cons.indent() - cons.print() - cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]") - cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]") - - try: - with open(log_filepath, "w") as log_file: - result = system( - ["./mfc.sh", "run", case.path, "--case-optimization"] + - ["--targets"] + [t.name for t in targets] + - ["--output-summary", summary_filepath] + - case.args + - ["--", "--gbpp", str(ARG('mem'))], - stdout=log_file, - stderr=subprocess.STDOUT) - - # Check return code (handle CompletedProcess or int defensively) - rc = result.returncode if hasattr(result, "returncode") else result - if rc != 0: - cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}") - cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") - failed_cases.append(case.slug) - continue + for i, case in enumerate(CASES): + summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml") + log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out") - # Validate summary file exists - if not os.path.exists(summary_filepath): - cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}") - cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]") - failed_cases.append(case.slug) - continue - - # Load summary - summary = file_load_yaml(summary_filepath) - - # Validate all targets have required data - validation_failed = False - for target in targets: - if target.name not in summary: - cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}") - validation_failed = True - break - - if "exec" not in summary[target.name]: - cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}") - validation_failed = True - break - - if target.name == "simulation" 
and "grind" not in summary[target.name]: - cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}") - validation_failed = True - break - - if validation_failed: + cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.slug} @ [bold]{os.path.relpath(case.path)}[/bold]") + cons.indent() + cons.print() + cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]") + cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]") + + try: + with open(log_filepath, "w") as log_file: + result = system( + ["./mfc.sh", "run", case.path, "--case-optimization"] + + ["--targets"] + [t.name for t in targets] + + ["--output-summary", summary_filepath] + + case.args + + ["--", "--gbpp", str(ARG('mem'))], + stdout=log_file, + stderr=subprocess.STDOUT) + + # Check return code (handle CompletedProcess or int defensively) + rc = result.returncode if hasattr(result, "returncode") else result + if rc != 0: + cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}") + cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") + failed_cases.append(case.slug) + continue + + # Validate summary file exists + if not os.path.exists(summary_filepath): + cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}") + cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]") + failed_cases.append(case.slug) + continue + + # Load summary + summary = file_load_yaml(summary_filepath) + + # Validate all targets have required data + validation_failed = False + for target in targets: + if target.name not in summary: + cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}") + validation_failed = True + break + + if "exec" not in summary[target.name]: + cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}") + validation_failed = True + break + + if target.name == "simulation" and "grind" not in summary[target.name]: + cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}") + validation_failed = True + break + + if validation_failed: + failed_cases.append(case.slug) + continue + + # Add to results + results["cases"][case.slug] = { + "description": dataclasses.asdict(case), + "output_summary": summary, + } + cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully") + + except Exception as e: + cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") + cons.print(f"[dim]{traceback.format_exc()}[/dim]") failed_cases.append(case.slug) - continue - - # Add to results - results["cases"][case.slug] = { - "description": dataclasses.asdict(case), - "output_summary": summary, - } - cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully") - - except Exception as e: - cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") - cons.print(f"[dim]{traceback.format_exc()}[/dim]") - failed_cases.append(case.slug) - finally: - cons.unindent() - - # Report results - if failed_cases: - cons.print() - cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]") - for slug in failed_cases: - cons.print(f" - {slug}") - cons.print() - raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed") + finally: + cons.unindent() + + # Report results + if failed_cases: + cons.print() + cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]") + 
for slug in failed_cases: + cons.print(f" - {slug}") + cons.print() + raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed") - # Write output - file_dump_yaml(ARG("output"), results) + # Write output + file_dump_yaml(ARG("output"), results) - cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].") + cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].") - cons.unindent() + finally: + cons.unindent() # TODO: This function is too long and not nicely written at all. Someone should From f468fe5403200981068a5070206c71a9cc8c0df3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 8 Dec 2025 23:09:11 -0500 Subject: [PATCH 11/21] fixup --- .github/scripts/run_parallel_benchmarks.sh | 78 +++++++++++++++++++++ .github/scripts/submit_and_monitor_bench.sh | 51 ++++++++++++++ .github/workflows/bench.yml | 78 +-------------------- 3 files changed, 130 insertions(+), 77 deletions(-) create mode 100755 .github/scripts/run_parallel_benchmarks.sh create mode 100755 .github/scripts/submit_and_monitor_bench.sh diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh new file mode 100755 index 0000000000..eaa333b003 --- /dev/null +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Run PR and master benchmarks in parallel and verify outputs +# Usage: run_parallel_benchmarks.sh + +set -euo pipefail + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi + +device="$1" +interface="$2" +cluster="$3" + +echo "==========================================" +echo "Starting parallel benchmark jobs..." +echo "==========================================" + +# Run both jobs with monitoring using dedicated script +(bash .github/scripts/submit_and_monitor_bench.sh pr "$device" "$interface" "$cluster") & +pr_pid=$! +echo "PR job started in background (PID: $pr_pid)" + +(bash .github/scripts/submit_and_monitor_bench.sh master "$device" "$interface" "$cluster") & +master_pid=$! +echo "Master job started in background (PID: $master_pid)" + +echo "Waiting for both jobs to complete..." + +# Wait and capture exit codes reliably +pr_exit=0 +master_exit=0 + +if ! wait "$pr_pid"; then + pr_exit=$? + echo "PR job exited with code: $pr_exit" +else + echo "PR job completed successfully" +fi + +if ! wait "$master_pid"; then + master_exit=$? + echo "Master job exited with code: $master_exit" +else + echo "Master job completed successfully" +fi + +# Check if either job failed +if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then + echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" + exit 1 +fi + +echo "==========================================" +echo "Both benchmark jobs completed successfully!" +echo "==========================================" + +# Final verification that output files exist before proceeding +pr_yaml="pr/bench-${device}-${interface}.yaml" +master_yaml="master/bench-${device}-${interface}.yaml" + +if [ ! -f "$pr_yaml" ]; then + echo "ERROR: PR benchmark output not found: $pr_yaml" + ls -la pr/ || true + exit 1 +fi + +if [ ! 
-f "$master_yaml" ]; then + echo "ERROR: Master benchmark output not found: $master_yaml" + ls -la master/ || true + exit 1 +fi + +echo "Verified both YAML files exist:" +echo " - $pr_yaml" +echo " - $master_yaml" + diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh new file mode 100755 index 0000000000..6ce68909f2 --- /dev/null +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Submit and monitor a benchmark job on a SLURM cluster +# Usage: submit_and_monitor_bench.sh + +set -euo pipefail + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +dir="$1" +device="$2" +interface="$3" +cluster="$4" + +echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." +cd "$dir" + +# Submit job +submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ + .github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1) + +job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p') +job_slug="bench-$device-$interface" +output_file="${job_slug}.out" + +if [ -z "$job_id" ]; then + echo "[$dir] ERROR: Failed to submit job" + echo "$submit_output" + exit 1 +fi + +echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" + +# Use the monitoring script +bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" + +echo "[$dir] Monitoring complete for job $job_id" + +# Verify the YAML output file was created +yaml_file="${job_slug}.yaml" +if [ ! -f "$yaml_file" ]; then + echo "[$dir] ERROR: Expected output file not found: $yaml_file" + echo "[$dir] Directory contents:" + ls -la *.yaml 2>/dev/null || echo " No YAML files found" + exit 1 +fi + +echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)" + diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 1c485c93a5..cc3099de00 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -98,83 +98,7 @@ jobs: wait %1 && wait %2 - name: Bench (Master v. PR) - run: | - set -e - - # Function to submit and monitor using extracted script - submit_and_monitor() { - local dir=$1 - local device=$2 - local interface=$3 - local cluster=$4 - - echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." - cd "$dir" - - # Submit job - submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ - .github/workflows/$cluster/bench.sh $device $interface 2>&1) - - job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p') - job_slug="bench-$device-$interface" - output_file="${job_slug}.out" - - if [ -z "$job_id" ]; then - echo "[$dir] ERROR: Failed to submit job" - echo "$submit_output" - return 1 - fi - - echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" - - # Use the monitoring script - bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" - - echo "[$dir] Monitoring complete for job $job_id" - } - - # Run both jobs with monitoring - echo "==========================================" - echo "Starting parallel benchmark jobs..." - echo "==========================================" - - (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & - pr_pid=$! - echo "PR job started in background (PID: $pr_pid)" - - (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & - master_pid=$! 
- echo "Master job started in background (PID: $master_pid)" - - echo "Waiting for both jobs to complete..." - - # Wait and capture exit codes reliably - pr_exit=0 - master_exit=0 - - if ! wait "$pr_pid"; then - pr_exit=$? - echo "PR job exited with code: $pr_exit" - else - echo "PR job completed successfully" - fi - - if ! wait "$master_pid"; then - master_exit=$? - echo "Master job exited with code: $master_exit" - else - echo "Master job completed successfully" - fi - - # Explicitly check and quote to avoid test errors - if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then - echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" - exit 1 - fi - - echo "==========================================" - echo "Both benchmark jobs completed successfully!" - echo "==========================================" + run: bash .github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Generate & Post Comment run: | From d9e06f6f8b45c7994362342f7342e122d26dfc55 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 00:21:26 -0500 Subject: [PATCH 12/21] fix --- .github/scripts/run_parallel_benchmarks.sh | 9 ++++++--- .github/scripts/submit_and_monitor_bench.sh | 7 +++++-- .github/workflows/bench.yml | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index eaa333b003..eadafe29ef 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -13,16 +13,19 @@ device="$1" interface="$2" cluster="$3" +# Get the directory where this script lives (pr/.github/scripts/) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + echo "==========================================" echo "Starting parallel benchmark jobs..." echo "==========================================" -# Run both jobs with monitoring using dedicated script -(bash .github/scripts/submit_and_monitor_bench.sh pr "$device" "$interface" "$cluster") & +# Run both jobs with monitoring using dedicated script from PR +(bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster") & pr_pid=$! echo "PR job started in background (PID: $pr_pid)" -(bash .github/scripts/submit_and_monitor_bench.sh master "$device" "$interface" "$cluster") & +(bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster") & master_pid=$! echo "Master job started in background (PID: $master_pid)" diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 6ce68909f2..32061cf2f9 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -14,6 +14,9 @@ device="$2" interface="$3" cluster="$4" +# Get the directory where this script lives +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." 
cd "$dir" @@ -33,8 +36,8 @@ fi echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" -# Use the monitoring script -bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" +# Use the monitoring script from PR (where this script lives) +bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" echo "[$dir] Monitoring complete for job $job_id" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index cc3099de00..2ccdfca87a 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -98,7 +98,7 @@ jobs: wait %1 && wait %2 - name: Bench (Master v. PR) - run: bash .github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} + run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Generate & Post Comment run: | From e17eb2751ac6bc7433d6af6da9cd8c234060d4f4 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 00:51:40 -0500 Subject: [PATCH 13/21] fixup --- .github/scripts/monitor_slurm_job.sh | 3 ++- .github/scripts/run_parallel_benchmarks.sh | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 05eaf5a48b..5b65d5c0a4 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -51,7 +51,8 @@ done echo "=== Streaming output for job $job_id ===" # Stream output while job runs (explicitly redirect to ensure output visibility) -tail -f "$output_file" 2>&1 & +# Use stdbuf for unbuffered output to ensure immediate display in CI logs +stdbuf -oL -eL tail -f "$output_file" 2>&1 & tail_pid=$! # Give tail a moment to start and show initial output diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index eadafe29ef..442f740eac 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -21,11 +21,12 @@ echo "Starting parallel benchmark jobs..." echo "==========================================" # Run both jobs with monitoring using dedicated script from PR -(bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster") & +# Use stdbuf for line-buffered output and prefix each line for clarity +(stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) & pr_pid=$! echo "PR job started in background (PID: $pr_pid)" -(bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster") & +(stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) & master_pid=$! echo "Master job started in background (PID: $master_pid)" From db77fb9f70aa093f2f08263169ba5ec677f3abf6 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 01:08:59 -0500 Subject: [PATCH 14/21] fix --- .github/scripts/submit_and_monitor_bench.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 32061cf2f9..8e4b124107 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -47,6 +47,11 @@ if [ ! 
-f "$yaml_file" ]; then echo "[$dir] ERROR: Expected output file not found: $yaml_file" echo "[$dir] Directory contents:" ls -la *.yaml 2>/dev/null || echo " No YAML files found" + echo "" + echo "[$dir] Last 100 lines of job output ($output_file):" + echo "----------------------------------------" + tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file" + echo "----------------------------------------" exit 1 fi From 09cda88a3a1525dab505e7c32730ba3c36321024 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 08:41:43 -0500 Subject: [PATCH 15/21] fix up some printing --- .github/scripts/monitor_slurm_job.sh | 70 +++++++++++++++++------ .github/workflows/phoenix/submit-bench.sh | 2 +- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 5b65d5c0a4..27472e01ef 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -50,26 +50,33 @@ while [ ! -f "$output_file" ]; do done echo "=== Streaming output for job $job_id ===" -# Stream output while job runs (explicitly redirect to ensure output visibility) -# Use stdbuf for unbuffered output to ensure immediate display in CI logs -stdbuf -oL -eL tail -f "$output_file" 2>&1 & -tail_pid=$! -# Give tail a moment to start and show initial output -sleep 2 +# Start tail and redirect its output to file descriptor 3 for multiplexing +# This allows us to stream tail output while also printing heartbeat messages +exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1) +tail_pid=$! -# Wait for job to complete with retry logic for transient squeue failures +# Monitor job status and stream output simultaneously squeue_failures=0 -heartbeat_counter=0 +last_heartbeat=$(date +%s) + while true; do - if squeue -j "$job_id" &>/dev/null; then - squeue_failures=0 - # Print heartbeat every 60 seconds (12 iterations * 5 sec) - heartbeat_counter=$((heartbeat_counter + 1)) - if [ $((heartbeat_counter % 12)) -eq 0 ]; then - echo "[$(date +%H:%M:%S)] Job $job_id still running..." + # Try to read from tail output (non-blocking via timeout) + # Read multiple lines if available to avoid falling behind + lines_read=0 + while IFS= read -r -t 0.1 line <&3 2>/dev/null; do + echo "$line" + lines_read=$((lines_read + 1)) + last_heartbeat=$(date +%s) + # Limit burst reads to avoid starving the status check + if [ $lines_read -ge 100 ]; then + break fi - else + done + + # Check job status + current_time=$(date +%s) + if ! squeue -j "$job_id" &>/dev/null; then squeue_failures=$((squeue_failures + 1)) # Check if job actually completed using sacct (if available) if [ $squeue_failures -ge 3 ]; then @@ -87,14 +94,41 @@ while true; do ;; esac else - # No sacct: avoid false positive by doing an extra check cycle - squeue_failures=2 + # No sacct: assume job completed after 3 failures + echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue" + break fi fi + else + squeue_failures=0 + # Print heartbeat if no output for 60 seconds + if [ $((current_time - last_heartbeat)) -ge 60 ]; then + echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..." + last_heartbeat=$current_time + fi + fi + + # Sleep briefly between status checks + sleep 1 +done + +# Drain any remaining output from tail after job completes +echo "Draining remaining output..." 
+drain_count=0 +while IFS= read -r -t 0.5 line <&3 2>/dev/null; do + echo "$line" + drain_count=$((drain_count + 1)) + # Safety limit to avoid infinite loop + if [ $drain_count -ge 10000 ]; then + echo "Warning: Truncating remaining output after 10000 lines" + break fi - sleep 5 done +# Close the file descriptor and kill tail +exec 3<&- +kill "${tail_pid}" 2>/dev/null || true + # Wait for output file to finish growing (stabilize) before stopping tail if [ -f "$output_file" ]; then last_size=-1 diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 21c1a557ab..3310df6886 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -42,7 +42,7 @@ sbatch < Date: Tue, 9 Dec 2025 09:41:47 -0500 Subject: [PATCH 16/21] no -W wait --- .github/workflows/frontier/submit-bench.sh | 1 - .github/workflows/phoenix/submit-bench.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/frontier/submit-bench.sh b/.github/workflows/frontier/submit-bench.sh index 9dbc5e5817..4374231eca 100644 --- a/.github/workflows/frontier/submit-bench.sh +++ b/.github/workflows/frontier/submit-bench.sh @@ -35,7 +35,6 @@ $sbatch_device_opts #SBATCH -t 02:59:00 # Duration of the job (Ex: 15 mins) #SBATCH -o$job_slug.out # Combined output and error messages file #SBATCH -p extended # Extended partition for shorter queues -#SBATCH -W # Do not exit until the submitted job terminates. set -e set -x diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 3310df6886..7ae85e66fe 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -45,7 +45,6 @@ $sbatch_device_opts #SBATCH -t 04:00:00 # Duration of the job (Ex: 15 mins) #SBATCH -q embers # QOS Name #SBATCH -o$job_slug.out # Combined output and error messages file -#SBATCH -W # Do not exit until the submitted job terminates. set -e set -x From 8e164b6369cb01bbed809da2a3640b0423996071 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 21:34:14 -0500 Subject: [PATCH 17/21] fix --- .github/scripts/run_parallel_benchmarks.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index 442f740eac..a0080931ba 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -73,6 +73,9 @@ fi if [ ! 
-f "$master_yaml" ]; then echo "ERROR: Master benchmark output not found: $master_yaml" ls -la master/ || true + echo "" + echo "Last 100 lines of master log:" + tail -n 100 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log" exit 1 fi From cfad09966c474215b769b03c3d8effc9508143ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 21:44:21 -0500 Subject: [PATCH 18/21] tweak --- .github/scripts/run_parallel_benchmarks.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index a0080931ba..050568a73e 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -22,11 +22,11 @@ echo "==========================================" # Run both jobs with monitoring using dedicated script from PR # Use stdbuf for line-buffered output and prefix each line for clarity -(stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) & +(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) & pr_pid=$! echo "PR job started in background (PID: $pr_pid)" -(stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) & +(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) & master_pid=$! echo "Master job started in background (PID: $master_pid)" From 4a45ef68fdd6db7609383265c76a4bf03e21b3a1 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 10 Dec 2025 09:45:56 -0500 Subject: [PATCH 19/21] more debug output --- .github/scripts/run_parallel_benchmarks.sh | 3 +++ .gitignore | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index 050568a73e..e3557b999b 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -67,6 +67,9 @@ master_yaml="master/bench-${device}-${interface}.yaml" if [ ! -f "$pr_yaml" ]; then echo "ERROR: PR benchmark output not found: $pr_yaml" ls -la pr/ || true + echo "" + echo "Last 100 lines of PR log:" + tail -n 100 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log" exit 1 fi diff --git a/.gitignore b/.gitignore index 2a6eea8203..c8b2c016b4 100644 --- a/.gitignore +++ b/.gitignore @@ -91,5 +91,5 @@ benchmarks/*.png *.mkv *.avi -packaging/spack/spack-test -.spack \ No newline at end of file +**isolation_rules/ +**.supercode/ \ No newline at end of file From b5cb016b9015547083cd838499de32511815c149 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 10 Dec 2025 09:58:59 -0500 Subject: [PATCH 20/21] minor fixup --- .github/scripts/run_parallel_benchmarks.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index e3557b999b..06b6eaa8d3 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -36,15 +36,17 @@ echo "Waiting for both jobs to complete..." 
pr_exit=0 master_exit=0 -if ! wait "$pr_pid"; then - pr_exit=$? +wait "$pr_pid" +pr_exit=$? +if [ "$pr_exit" -ne 0 ]; then echo "PR job exited with code: $pr_exit" else echo "PR job completed successfully" fi -if ! wait "$master_pid"; then - master_exit=$? +wait "$master_pid" +master_exit=$? +if [ "$master_exit" -ne 0 ]; then echo "Master job exited with code: $master_exit" else echo "Master job completed successfully" From 66176e74fa543502df4f95e905b46509fdf86f6e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 10 Dec 2025 15:54:29 -0500 Subject: [PATCH 21/21] fix python --- toolchain/bootstrap/python.sh | 56 ++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/toolchain/bootstrap/python.sh b/toolchain/bootstrap/python.sh index cc2be64be1..dc6f0c1de9 100644 --- a/toolchain/bootstrap/python.sh +++ b/toolchain/bootstrap/python.sh @@ -29,7 +29,8 @@ if [ -f "$(pwd)/build/venv/bin/activate" ]; then fi fi -if ! command -v pip3 > /dev/null 2>&1 && [ ! -f "$(pwd)/build/venv/bin/activate" ]; then +# Only bootstrap pip if we don't already have a venv +if [ ! -f "$(pwd)/build/venv/bin/activate" ]; then # Check whether python3 is in the $PATH / is accessible. if ! command -v python3 > /dev/null 2>&1; then error "Couldn't find$MAGENTA Python$COLOR_RESET. Please ensure it is discoverable." @@ -39,26 +40,41 @@ if ! command -v pip3 > /dev/null 2>&1 && [ ! -f "$(pwd)/build/venv/bin/activate" assert_python_compatible - get_pip_url="https://bootstrap.pypa.io/pip/get-pip.py" - - warn "$MAGENTA""Python$COLOR_RESET's$MAGENTA PIP$COLOR_RESET is not installed." - log "Downloading$MAGENTA Python$COLOR_RESET's$MAGENTA PIP$COLOR_RESET from $get_pip_url..." - - if ! wget -O "$(pwd)/build/get-pip.py" "$get_pip_url"; then - error "Couldn't download get-pip.py." - - exit 1 - fi - - # Suppress PIP version warning (out of date) - export PIP_DISABLE_PIP_VERSION_CHECK=1 - if ! python3 "$(pwd)/build/get-pip.py" --user; then - error "Couldn't install$MAGENTA pip$COLOR_RESET with get-pip.py" - - exit 1 + # Check if pip is already available as a Python module + # This works on both laptops and HPC systems with module-loaded Python + if ! python3 -c "import pip" > /dev/null 2>&1; then + warn "$MAGENTA""Python$COLOR_RESET's$MAGENTA PIP$COLOR_RESET is not installed." + + # Try ensurepip first (standard library, safe) + log "Attempting to install pip via ensurepip..." + if python3 -m ensurepip --upgrade 2>/dev/null; then + ok "Installed pip via ensurepip." + else + # Fall back to get-pip.py only if ensurepip fails + get_pip_url="https://bootstrap.pypa.io/pip/get-pip.py" + log "Downloading$MAGENTA Python$COLOR_RESET's$MAGENTA PIP$COLOR_RESET from $get_pip_url..." + + if ! wget -O "$(pwd)/build/get-pip.py" "$get_pip_url"; then + error "Couldn't download get-pip.py." + exit 1 + fi + + # Suppress PIP version warning (out of date) + export PIP_DISABLE_PIP_VERSION_CHECK=1 + if ! python3 "$(pwd)/build/get-pip.py" --user; then + error "Couldn't install$MAGENTA pip$COLOR_RESET with get-pip.py" + exit 1 + fi + + ok "Installed pip via get-pip.py." + + # Ensure user-site bin directory is on PATH for this session + user_base_bin="$(python3 -m site --user-base)/bin" + if [ -d "$user_base_bin" ]; then + export PATH="$user_base_bin:$PATH" + fi + fi fi - - ok "Installed pip." fi
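
Taken together, the patches above settle on one submit-and-poll pattern for SLURM-backed CI: submit with sbatch, recover the job ID from its "Submitted batch job" line, stream the job's output file while the job runs, and treat the job as finished only once sacct reports a terminal state. The sketch below is a minimal, standalone illustration of that pattern; it is not part of the patch series, and the script argument, output-file naming, and polling intervals are illustrative assumptions only.

    #!/bin/bash
    # Minimal sketch (not part of the patch series): submit one SLURM batch job,
    # stream its output file, and poll sacct until the job reaches a terminal state.
    # Script name, output-file naming, and intervals are illustrative assumptions.
    set -euo pipefail

    batch_script="${1:?usage: $0 <batch-script> [output-file]}"
    out_file="${2:-job.out}"   # must match the #SBATCH -o setting in the batch script

    # sbatch prints "Submitted batch job <id>" on success.
    job_id=$(sbatch "$batch_script" | grep -oE '[0-9]+' | head -n 1)
    echo "Submitted job $job_id; waiting for $out_file"

    # Wait for the scheduler to start the job and create its output file.
    # (The real monitor script also bounds this wait and checks squeue.)
    until [ -f "$out_file" ]; do sleep 5; done

    # Stream output while the job runs.
    tail -f "$out_file" &
    tail_pid=$!

    # Trust sacct's terminal states rather than squeue alone, since squeue can
    # fail transiently or stop listing a job immediately after it finishes.
    state=""
    while true; do
        state=$(sacct -j "$job_id" -n -X -o State 2>/dev/null | awk 'NR==1 {print $1}' || true)
        case "$state" in
            COMPLETED|FAILED|CANCELLED*|TIMEOUT|OUT_OF_MEMORY) break ;;
            *) sleep 10 ;;
        esac
    done

    kill "$tail_pid" 2>/dev/null || true
    echo "Job $job_id finished with state: $state"
    [ "$state" = "COMPLETED" ]

The scripts under .github/scripts/ in the patches above additionally retry transient squeue failures, print heartbeats while waiting, and verify afterwards that the expected bench-<device>-<interface>.yaml summary was produced; the sketch omits those details for brevity.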