From e8090bd0f8bf146f2be36dc8a8b1d10d1baf9092 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 11:29:55 -0500 Subject: [PATCH 01/21] benchmarks hardening! --- .github/workflows/bench.yml | 84 ++++++++++++++++++-- .github/workflows/test.yml | 4 +- toolchain/mfc/bench.py | 101 +++++++++++++++++++---- toolchain/mfc/test/test.py | 154 +++++++++++++++++++++++------------- 4 files changed, 263 insertions(+), 80 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 563566788e..131010106a 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -23,7 +23,7 @@ jobs: filters: ".github/file-filter.yml" self: - name: "${{ matrix.name }} (${{ matrix.device }})" + name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }} needs: file-changes strategy: @@ -73,7 +73,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 1400 + timeout-minutes: 480 env: ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true @@ -99,9 +99,83 @@ jobs: - name: Bench (Master v. PR) run: | - (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) & - (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) & - wait %1 && wait %2 + set -e + + # Function to submit and monitor + submit_and_monitor() { + local dir=$1 + local device=$2 + local interface=$3 + local cluster=$4 + + cd "$dir" + + # Submit job + submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ + .github/workflows/$cluster/bench.sh $device $interface 2>&1) + + job_id=$(echo "$submit_output" | grep -oP 'Submitted batch job \K[0-9]+' || echo "") + job_slug="bench-$device-$interface" + output_file="${job_slug}.out" + + if [ -z "$job_id" ]; then + echo "ERROR: Failed to submit job" + echo "$submit_output" + return 1 + fi + + echo "Submitted batch job $job_id" + echo "Monitoring output file: $output_file" + + # Wait for file to appear (check job status if it takes a while) + echo "Waiting for job to start..." + while [ ! -f "$output_file" ]; do + # Check if job failed to start + if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then + echo "ERROR: Job $job_id finished without creating output file" + return 1 + fi + sleep 5 + done + + echo "=== Streaming output for job $job_id ===" + # Stream output while job runs + tail -f "$output_file" & + tail_pid=$! 
+ + # Wait for job to complete (will wait up to GitHub Actions timeout) + while squeue -j "$job_id" &>/dev/null; do + sleep 5 + done + + # Stop tailing + kill $tail_pid 2>/dev/null || true + + echo "" + echo "=== Final output ===" + cat "$output_file" + + # Check exit status + exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+' || echo "0:0") + if [ "$exit_code" != "0" ] && [ "$exit_code" != "0:0" ]; then + echo "ERROR: Job $job_id failed with exit code $exit_code" + return 1 + fi + } + + # Run both jobs with monitoring + (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & + pr_pid=$! + + (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & + master_pid=$! + + wait $pr_pid && pr_exit=$? || pr_exit=$? + wait $master_pid && master_exit=$? || master_exit=$? + + if [ $pr_exit -ne 0 ] || [ $master_exit -ne 0 ]; then + exit 1 + fi - name: Generate & Post Comment run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d7b4703a1e..65707799b2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -93,11 +93,11 @@ jobs: OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }} self: - name: Self Hosted + name: "${{ matrix.lbl == 'gt' && 'Georgia Tech | Phoenix' || 'Oak Ridge | Frontier' }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' needs: file-changes continue-on-error: false - timeout-minutes: 1400 + timeout-minutes: 480 strategy: matrix: device: ['gpu'] diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index eb1b120035..89d5a6fb1b 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -16,6 +16,7 @@ class BenchCase: path: str args: typing.List[str] +# pylint: disable=too-many-locals, too-many-branches, too-many-statements def bench(targets = None): if targets is None: targets = ARG("targets") @@ -36,6 +37,10 @@ def bench(targets = None): case.args = case.args + ARG("--") case.path = os.path.abspath(case.path) + # Validate case file exists early + if not os.path.exists(case.path): + raise MFCException(f"Benchmark case file not found: {case.path}") + results = { "metadata": { "invocation": sys.argv[1:], @@ -44,6 +49,8 @@ def bench(targets = None): "cases": {}, } + failed_cases = [] + for i, case in enumerate(CASES): summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml") log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out") @@ -54,21 +61,82 @@ def bench(targets = None): cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]") cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]") - with open(log_filepath, "w") as log_file: - system( - ["./mfc.sh", "run", case.path, "--case-optimization"] + - ["--targets"] + [t.name for t in targets] + - ["--output-summary", summary_filepath] + - case.args + - ["--", "--gbpp", ARG('mem')], - stdout=log_file, - stderr=subprocess.STDOUT) - - results["cases"][case.slug] = { - "description": dataclasses.asdict(case), - "output_summary": file_load_yaml(summary_filepath), - } + try: + with open(log_filepath, "w") as log_file: + result = system( + ["./mfc.sh", "run", case.path, "--case-optimization"] + + ["--targets"] + [t.name for t in targets] + + ["--output-summary", summary_filepath] + + case.args + + ["--", "--gbpp", ARG('mem')], + stdout=log_file, + stderr=subprocess.STDOUT) + + # Check return 
code + if result.returncode != 0: + cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {result.returncode}") + cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") + failed_cases.append(case.slug) + cons.unindent() + continue + + # Validate summary file exists + if not os.path.exists(summary_filepath): + cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}") + cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]") + failed_cases.append(case.slug) + cons.unindent() + continue + + # Load summary + summary = file_load_yaml(summary_filepath) + + # Validate all targets have required data + validation_failed = False + for target in targets: + if target.name not in summary: + cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}") + validation_failed = True + break + + if "exec" not in summary[target.name]: + cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}") + validation_failed = True + break + + if target.name == "simulation" and "grind" not in summary[target.name]: + cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}") + validation_failed = True + break + + if validation_failed: + failed_cases.append(case.slug) + cons.unindent() + continue + + # Add to results + results["cases"][case.slug] = { + "description": dataclasses.asdict(case), + "output_summary": summary, + } + cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully") + + except Exception as e: + cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") + failed_cases.append(case.slug) + + cons.unindent() + + # Report results + if failed_cases: + cons.print() + cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]") + for slug in failed_cases: + cons.print(f" - {slug}") + cons.print() + raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed") + # Write output file_dump_yaml(ARG("output"), results) cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].") @@ -140,8 +208,9 @@ def diff(): if grind_time_value <0.95: cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") err = 1 - except Exception as _: - pass + except Exception as e: + cons.print(f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}") + err = 1 table.add_row(f"[magenta]{slug}[/magenta]", *speedups) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 0ba6e160f7..c52334d233 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -1,4 +1,4 @@ -import os, typing, shutil, time, itertools +import os, typing, shutil, time, itertools, signal from random import sample, seed import rich, rich.table @@ -23,6 +23,19 @@ total_test_count = 0 errors = [] +# Early abort thresholds +MIN_CASES_BEFORE_ABORT = 20 +FAILURE_RATE_THRESHOLD = 0.3 + +# Per-test timeout (1 hour) +TEST_TIMEOUT_SECONDS = 3600 + +class TestTimeoutError(MFCException): + pass + +def timeout_handler(signum, frame): + raise TestTimeoutError("Test case exceeded 1 hour timeout") + # pylint: disable=too-many-branches, trailing-whitespace def __filter(cases_) -> typing.List[TestCase]: cases = cases_[:] @@ -186,91 +199,106 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): global 
current_test_number start_time = time.time() + # Set timeout alarm + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(TEST_TIMEOUT_SECONDS) + tol = case.compute_tolerance() case.delete_output() case.create_directory() if ARG("dry_run"): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") + signal.alarm(0) # Cancel alarm return - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + try: + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) - out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") + out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") - common.file_write(out_filepath, cmd.stdout) + common.file_write(out_filepath, cmd.stdout) - if cmd.returncode != 0: - cons.print(cmd.stdout) - raise MFCException(f"Test {case}: Failed to execute MFC.") + if cmd.returncode != 0: + cons.print(cmd.stdout) + raise MFCException(f"Test {case}: Failed to execute MFC.") - pack, err = packer.pack(case.get_dirpath()) - if err is not None: - raise MFCException(f"Test {case}: {err}") + pack, err = packer.pack(case.get_dirpath()) + if err is not None: + raise MFCException(f"Test {case}: {err}") - if pack.has_NaNs(): - raise MFCException(f"Test {case}: NaNs detected in the case.") + if pack.has_NaNs(): + raise MFCException(f"Test {case}: NaNs detected in the case.") - golden_filepath = os.path.join(case.get_dirpath(), "golden.txt") - if ARG("generate"): - common.delete_file(golden_filepath) - pack.save(golden_filepath) - else: - if not os.path.isfile(golden_filepath): - raise MFCException(f"Test {case}: The golden file does not exist! To generate golden files, use the '--generate' flag.") + golden_filepath = os.path.join(case.get_dirpath(), "golden.txt") + if ARG("generate"): + common.delete_file(golden_filepath) + pack.save(golden_filepath) + else: + if not os.path.isfile(golden_filepath): + raise MFCException(f"Test {case}: The golden file does not exist! 
To generate golden files, use the '--generate' flag.") - golden = packer.load(golden_filepath) + golden = packer.load(golden_filepath) - if ARG("add_new_variables"): - for pfilepath, pentry in list(pack.entries.items()): - if golden.find(pfilepath) is None: - golden.set(pentry) + if ARG("add_new_variables"): + for pfilepath, pentry in list(pack.entries.items()): + if golden.find(pfilepath) is None: + golden.set(pentry) - for gfilepath, gentry in list(golden.entries.items()): - if pack.find(gfilepath) is None: - golden.remove(gentry) + for gfilepath, gentry in list(golden.entries.items()): + if pack.find(gfilepath) is None: + golden.remove(gentry) - golden.save(golden_filepath) - else: - err, msg = packtol.compare(pack, packer.load(golden_filepath), packtol.Tolerance(tol, tol)) - if msg is not None: - raise MFCException(f"Test {case}: {msg}") + golden.save(golden_filepath) + else: + err, msg = packtol.compare(pack, packer.load(golden_filepath), packtol.Tolerance(tol, tol)) + if msg is not None: + raise MFCException(f"Test {case}: {msg}") - if ARG("test_all"): - case.delete_output() - cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) - out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") - common.file_write(out_filepath, cmd.stdout) + if ARG("test_all"): + case.delete_output() + cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) + out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") + common.file_write(out_filepath, cmd.stdout) - for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')): - silo_filepath = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0', silo_filepath) - h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" + for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')): + silo_filepath = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0', silo_filepath) + h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" - if not os.path.exists(h5dump or ""): - if not does_command_exist("h5dump"): - raise MFCException("h5dump couldn't be found.") + if not os.path.exists(h5dump or ""): + if not does_command_exist("h5dump"): + raise MFCException("h5dump couldn't be found.") - h5dump = shutil.which("h5dump") + h5dump = shutil.which("h5dump") - output, err = get_program_output([h5dump, silo_filepath]) + output, err = get_program_output([h5dump, silo_filepath]) - if err != 0: - raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if err != 0: + raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - if "nan," in output: - raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if "nan," in output: + raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - if "inf," in output: - raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if "inf," in output: + raise MFCException(f"Test {case}: Post Process has detected an Infinity. 
You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - case.delete_output() + case.delete_output() - end_time = time.time() - duration = end_time - start_time + end_time = time.time() + duration = end_time - start_time - current_test_number += 1 - progress_str = f"({current_test_number:3d}/{total_test_count:3d})" - cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}") + current_test_number += 1 + progress_str = f"({current_test_number:3d}/{total_test_count:3d})" + cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}") + + except TestTimeoutError: + raise MFCException( + f"Test {case} exceeded 1 hour timeout. " + f"This may indicate a hung simulation or misconfigured case. " + f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}" + ) + finally: + signal.alarm(0) # Cancel alarm def handle_case(case: TestCase, devices: typing.Set[int]): @@ -301,4 +329,16 @@ def handle_case(case: TestCase, devices: typing.Set[int]): errors.append(f"[bold red]Failed test {case} after {nAttempts} attempt(s).[/bold red]") errors.append(f"{exc}") + # Check if we should abort early due to high failure rate + total_completed = nFAIL + nPASS + if total_completed >= MIN_CASES_BEFORE_ABORT: + failure_rate = nFAIL / total_completed + if failure_rate >= FAILURE_RATE_THRESHOLD: + cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]") + cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") + cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n") + raise MFCException( + f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)" + ) + return From 8c01b99e057605c2ae7a847f2258f397484258a0 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 11:33:35 -0500 Subject: [PATCH 02/21] lint error --- toolchain/mfc/test/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index c52334d233..a63374e3c3 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -291,12 +291,12 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): progress_str = f"({current_test_number:3d}/{total_test_count:3d})" cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}") - except TestTimeoutError: + except TestTimeoutError as exc: raise MFCException( f"Test {case} exceeded 1 hour timeout. " f"This may indicate a hung simulation or misconfigured case. 
" f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}" - ) + ) from exc finally: signal.alarm(0) # Cancel alarm From 2e1eb87e1ffb9dbed5846fe6f47e49ed03606bb6 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 11:44:59 -0500 Subject: [PATCH 03/21] minor --- toolchain/mfc/test/test.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index a63374e3c3..77e4dead30 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -330,15 +330,17 @@ def handle_case(case: TestCase, devices: typing.Set[int]): errors.append(f"{exc}") # Check if we should abort early due to high failure rate - total_completed = nFAIL + nPASS - if total_completed >= MIN_CASES_BEFORE_ABORT: - failure_rate = nFAIL / total_completed - if failure_rate >= FAILURE_RATE_THRESHOLD: - cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]") - cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") - cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n") - raise MFCException( - f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)" - ) + # Skip this check during dry-run (only builds, doesn't run tests) + if not ARG("dry_run"): + total_completed = nFAIL + nPASS + if total_completed >= MIN_CASES_BEFORE_ABORT: + failure_rate = nFAIL / total_completed + if failure_rate >= FAILURE_RATE_THRESHOLD: + cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]") + cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") + cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n") + raise MFCException( + f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)" + ) return From 0668fedefcba5302448c99bcbf8cccbfe519601b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 11:52:07 -0500 Subject: [PATCH 04/21] forks look nicer now i think --- .github/workflows/test.yml | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 65707799b2..251b276ace 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -93,23 +93,40 @@ jobs: OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }} self: - name: "${{ matrix.lbl == 'gt' && 'Georgia Tech | Phoenix' || 'Oak Ridge | Frontier' }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" + name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' needs: file-changes continue-on-error: false timeout-minutes: 480 strategy: matrix: - device: ['gpu'] - interface: ['acc', 'omp'] - lbl: ['gt', 'frontier'] include: - - device: 'cpu' + # Phoenix (GT) + - lbl: 'gt' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'acc' + - lbl: 'gt' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'omp' + - lbl: 'gt' + cluster_name: 'Georgia Tech | Phoenix' + device: 'cpu' interface: 'none' - lbl: 'gt' - - device: 'cpu' + # Frontier (ORNL) + - lbl: 'frontier' + 
cluster_name: 'Oak Ridge | Frontier' + device: 'gpu' + interface: 'acc' + - lbl: 'frontier' + cluster_name: 'Oak Ridge | Frontier' + device: 'gpu' + interface: 'omp' + - lbl: 'frontier' + cluster_name: 'Oak Ridge | Frontier' + device: 'cpu' interface: 'none' - lbl: 'frontier' runs-on: group: phoenix labels: ${{ matrix.lbl }} From f79f4d16969801501643b22f82f8a75658ecb477 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 12:08:11 -0500 Subject: [PATCH 05/21] more fixing! --- .github/scripts/monitor_slurm_job.sh | 65 +++++++++++++++++++++ .github/workflows/bench.yml | 41 +------------- toolchain/mfc/bench.py | 20 ++++--- toolchain/mfc/test/test.py | 84 ++++++++++++++++++---------- 4 files changed, 136 insertions(+), 74 deletions(-) create mode 100755 .github/scripts/monitor_slurm_job.sh diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh new file mode 100755 index 0000000000..b6d43010df --- /dev/null +++ b/.github/scripts/monitor_slurm_job.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Monitor a SLURM job and stream its output in real-time +# Usage: monitor_slurm_job.sh + +set -e + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +job_id="$1" +output_file="$2" + +echo "Submitted batch job $job_id" +echo "Monitoring output file: $output_file" + +# Wait for file to appear (check job status if it takes a while) +echo "Waiting for job to start..." +while [ ! -f "$output_file" ]; do + # Check if job failed to start + if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then + echo "ERROR: Job $job_id finished without creating output file" + exit 1 + fi + sleep 5 +done + +echo "=== Streaming output for job $job_id ===" +# Stream output while job runs +tail -f "$output_file" & +tail_pid=$! + +# Wait for job to complete with retry logic for transient squeue failures +squeue_failures=0 +while true; do + if squeue -j "$job_id" &>/dev/null; then + squeue_failures=0 + else + squeue_failures=$((squeue_failures + 1)) + # Allow a few transient failures before concluding job is done + if [ $squeue_failures -ge 3 ]; then + break + fi + fi + sleep 5 +done + +# Stop tailing +kill $tail_pid 2>/dev/null || true + +echo "" +echo "=== Final output ===" +cat "$output_file" + +# Check exit status +exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+:[0-9]+' || echo "0:0") +if [ "$exit_code" != "0:0" ]; then + echo "ERROR: Job $job_id failed with exit code $exit_code" + exit 1 +fi + +echo "Job $job_id completed successfully" +exit 0 + diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 131010106a..cbe83de6b7 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -101,7 +101,7 @@ jobs: run: | set -e - # Function to submit and monitor + # Function to submit and monitor using extracted script submit_and_monitor() { local dir=$1 local device=$2 @@ -124,43 +124,8 @@ jobs: return 1 fi - echo "Submitted batch job $job_id" - echo "Monitoring output file: $output_file" - - # Wait for file to appear (check job status if it takes a while) - echo "Waiting for job to start..." - while [ ! -f "$output_file" ]; do - # Check if job failed to start - if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then - echo "ERROR: Job $job_id finished without creating output file" - return 1 - fi - sleep 5 - done - - echo "=== Streaming output for job $job_id ===" - # Stream output while job runs - tail -f "$output_file" & - tail_pid=$! 
- - # Wait for job to complete (will wait up to GitHub Actions timeout) - while squeue -j "$job_id" &>/dev/null; do - sleep 5 - done - - # Stop tailing - kill $tail_pid 2>/dev/null || true - - echo "" - echo "=== Final output ===" - cat "$output_file" - - # Check exit status - exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+' || echo "0:0") - if [ "$exit_code" != "0" ] && [ "$exit_code" != "0:0" ]; then - echo "ERROR: Job $job_id failed with exit code $exit_code" - return 1 - fi + # Use the monitoring script + bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" } # Run both jobs with monitoring diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index 89d5a6fb1b..a9441febc4 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -68,13 +68,14 @@ def bench(targets = None): ["--targets"] + [t.name for t in targets] + ["--output-summary", summary_filepath] + case.args + - ["--", "--gbpp", ARG('mem')], + ["--", "--gbpp", str(ARG('mem'))], stdout=log_file, stderr=subprocess.STDOUT) - # Check return code - if result.returncode != 0: - cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {result.returncode}") + # Check return code (handle CompletedProcess or int defensively) + rc = result.returncode if hasattr(result, "returncode") else result + if rc != 0: + cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}") cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") failed_cases.append(case.slug) cons.unindent() @@ -111,7 +112,6 @@ def bench(targets = None): if validation_failed: failed_cases.append(case.slug) - cons.unindent() continue # Add to results @@ -124,8 +124,8 @@ def bench(targets = None): except Exception as e: cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") failed_cases.append(case.slug) - - cons.unindent() + finally: + cons.unindent() # Report results if failed_cases: @@ -209,7 +209,11 @@ def diff(): cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") err = 1 except Exception as e: - cons.print(f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}") + import traceback + cons.print( + f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n" + f"{traceback.format_exc()}" + ) err = 1 table.add_row(f"[magenta]{slug}[/magenta]", *speedups) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 77e4dead30..cfac342058 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -1,4 +1,4 @@ -import os, typing, shutil, time, itertools, signal +import os, typing, shutil, time, itertools, threading from random import sample, seed import rich, rich.table @@ -30,12 +30,15 @@ # Per-test timeout (1 hour) TEST_TIMEOUT_SECONDS = 3600 +# Global abort flag for thread-safe early termination +# This flag is set when the failure rate exceeds the threshold, signaling +# all worker threads to exit gracefully. This avoids raising exceptions +# from worker threads which could leave the scheduler in an undefined state. 
+abort_tests = threading.Event() + class TestTimeoutError(MFCException): pass -def timeout_handler(signum, frame): - raise TestTimeoutError("Test case exceeded 1 hour timeout") - # pylint: disable=too-many-branches, trailing-whitespace def __filter(cases_) -> typing.List[TestCase]: cases = cases_[:] @@ -173,6 +176,15 @@ def test(): [ sched.Task(ppn=case.ppn, func=handle_case, args=[case], load=case.get_cell_count()) for case in cases ], ARG("jobs"), ARG("gpus")) + # Check if we aborted due to high failure rate + if abort_tests.is_set(): + total_completed = nFAIL + nPASS + cons.print() + cons.unindent() + raise MFCException( + f"Excessive test failures: {nFAIL}/{total_completed} failed ({nFAIL/total_completed*100:.1f}%)" + ) + nSKIP = len(skipped_cases) cons.print() cons.unindent() @@ -199,9 +211,12 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): global current_test_number start_time = time.time() - # Set timeout alarm - signal.signal(signal.SIGALRM, timeout_handler) - signal.alarm(TEST_TIMEOUT_SECONDS) + # Set timeout using threading.Timer (works in worker threads) + # Note: signal.alarm() only works in the main thread, so we use + # threading.Timer which works correctly in worker threads spawned by sched.sched + timeout_flag = threading.Event() + timeout_timer = threading.Timer(TEST_TIMEOUT_SECONDS, timeout_flag.set) + timeout_timer.start() tol = case.compute_tolerance() case.delete_output() @@ -209,12 +224,19 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("dry_run"): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") - signal.alarm(0) # Cancel alarm + timeout_timer.cancel() return try: + # Check timeout before starting + if timeout_flag.is_set(): + raise TestTimeoutError("Test case exceeded 1 hour timeout") cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + # Check timeout after simulation + if timeout_flag.is_set(): + raise TestTimeoutError("Test case exceeded 1 hour timeout") + out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") common.file_write(out_filepath, cmd.stdout) @@ -261,26 +283,28 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") common.file_write(out_filepath, cmd.stdout) - for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')): - silo_filepath = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0', silo_filepath) - h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" + silo_dir = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0') + if os.path.isdir(silo_dir): + for silo_filename in os.listdir(silo_dir): + silo_filepath = os.path.join(silo_dir, silo_filename) + h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" - if not os.path.exists(h5dump or ""): - if not does_command_exist("h5dump"): - raise MFCException("h5dump couldn't be found.") + if not os.path.exists(h5dump or ""): + if not does_command_exist("h5dump"): + raise MFCException("h5dump couldn't be found.") - h5dump = shutil.which("h5dump") + h5dump = shutil.which("h5dump") - output, err = get_program_output([h5dump, silo_filepath]) + output, err = get_program_output([h5dump, silo_filepath]) - if err != 0: - raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if err != 0: + raise MFCException(f"Test {case}: Failed to run h5dump. 
You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - if "nan," in output: - raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if "nan," in output: + raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - if "inf," in output: - raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + if "inf," in output: + raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") case.delete_output() @@ -298,7 +322,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}" ) from exc finally: - signal.alarm(0) # Cancel alarm + timeout_timer.cancel() # Cancel timeout timer def handle_case(case: TestCase, devices: typing.Set[int]): @@ -306,6 +330,10 @@ def handle_case(case: TestCase, devices: typing.Set[int]): global nFAIL, nPASS, nSKIP global errors + # Check if we should abort before processing this case + if abort_tests.is_set(): + return # Exit gracefully if abort was requested + nAttempts = 0 if ARG('single'): max_attempts = max(ARG('max_attempts'), 3) @@ -337,10 +365,10 @@ def handle_case(case: TestCase, devices: typing.Set[int]): failure_rate = nFAIL / total_completed if failure_rate >= FAILURE_RATE_THRESHOLD: cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]") - cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") - cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n") - raise MFCException( - f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)" - ) + cons.print("[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]") + cons.print("[bold red]Aborting remaining tests to fail fast.[/bold red]\n") + # Set abort flag instead of raising exception from worker thread + abort_tests.set() + return # Exit gracefully return From 76a6e63a5d7dcd5a8ca069c560d858e0639ec2f8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 12:14:01 -0500 Subject: [PATCH 06/21] clean linter --- toolchain/mfc/bench.py | 3 +-- toolchain/mfc/test/test.py | 49 ++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index a9441febc4..ec54cc9cf2 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -1,4 +1,4 @@ -import os, sys, uuid, subprocess, dataclasses, typing, math +import os, sys, uuid, subprocess, dataclasses, typing, math, traceback import rich.table @@ -209,7 +209,6 @@ def diff(): cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") err = 1 except Exception as e: - import traceback cons.print( f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n" f"{traceback.format_exc()}" diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 
cfac342058..f1b04ba2fd 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -206,6 +206,36 @@ def test(): # pylint: disable=too-many-locals, too-many-branches, too-many-statements, trailing-whitespace +def _process_silo_file(silo_filepath: str, case: TestCase, out_filepath: str): + """Process a single silo file with h5dump and check for NaNs/Infinities.""" + h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" + + if not os.path.exists(h5dump or ""): + if not does_command_exist("h5dump"): + raise MFCException("h5dump couldn't be found.") + h5dump = shutil.which("h5dump") + + output, err = get_program_output([h5dump, silo_filepath]) + + if err != 0: + raise MFCException( + f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, " + f"and the case dictionary in {case.get_filepath()}." + ) + + if "nan," in output: + raise MFCException( + f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, " + f"and the case dictionary in {case.get_filepath()}." + ) + + if "inf," in output: + raise MFCException( + f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, " + f"and the case dictionary in {case.get_filepath()}." + ) + + def _handle_case(case: TestCase, devices: typing.Set[int]): # pylint: disable=global-statement, global-variable-not-assigned global current_test_number @@ -287,24 +317,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if os.path.isdir(silo_dir): for silo_filename in os.listdir(silo_dir): silo_filepath = os.path.join(silo_dir, silo_filename) - h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump" - - if not os.path.exists(h5dump or ""): - if not does_command_exist("h5dump"): - raise MFCException("h5dump couldn't be found.") - - h5dump = shutil.which("h5dump") - - output, err = get_program_output([h5dump, silo_filepath]) - - if err != 0: - raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - - if "nan," in output: - raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") - - if "inf," in output: - raise MFCException(f"Test {case}: Post Process has detected an Infinity. 
You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.") + _process_silo_file(silo_filepath, case, out_filepath) case.delete_output() From 7333098b1563322b050334cd53369f6c4e1e05b9 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 12:17:21 -0500 Subject: [PATCH 07/21] fix some more suggestions --- toolchain/mfc/bench.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index ec54cc9cf2..8ee9a256d9 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -78,7 +78,6 @@ def bench(targets = None): cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}") cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") failed_cases.append(case.slug) - cons.unindent() continue # Validate summary file exists @@ -86,7 +85,6 @@ def bench(targets = None): cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}") cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]") failed_cases.append(case.slug) - cons.unindent() continue # Load summary From b62854c42c6c65c3c2069681ad93016b07fa8e64 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 12:37:32 -0500 Subject: [PATCH 08/21] more hardening :O --- .github/scripts/monitor_slurm_job.sh | 94 +++++++++++++++++++++++++--- .github/workflows/bench.yml | 20 ++++-- toolchain/mfc/bench.py | 3 +- toolchain/mfc/test/test.py | 26 ++++++-- 4 files changed, 122 insertions(+), 21 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index b6d43010df..7e66377956 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -15,15 +15,30 @@ output_file="$2" echo "Submitted batch job $job_id" echo "Monitoring output file: $output_file" -# Wait for file to appear (check job status if it takes a while) +# Wait for file to appear with retry logic for transient squeue failures echo "Waiting for job to start..." +squeue_retries=0 +max_squeue_retries=5 while [ ! -f "$output_file" ]; do - # Check if job failed to start - if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then - echo "ERROR: Job $job_id finished without creating output file" - exit 1 + # Check if job is still queued/running + if squeue -j "$job_id" &>/dev/null; then + squeue_retries=0 # Reset on success + sleep 5 + else + squeue_retries=$((squeue_retries + 1)) + if [ $squeue_retries -ge $max_squeue_retries ]; then + # Job not in queue and output file doesn't exist + if [ ! -f "$output_file" ]; then + echo "ERROR: Job $job_id not in queue and output file not created" + exit 1 + fi + break + fi + # Exponential backoff + sleep_time=$((2 ** squeue_retries)) + echo "Warning: squeue check failed, retrying in ${sleep_time}s..." 
+ sleep $sleep_time fi - sleep 5 done echo "=== Streaming output for job $job_id ===" @@ -38,14 +53,49 @@ while true; do squeue_failures=0 else squeue_failures=$((squeue_failures + 1)) - # Allow a few transient failures before concluding job is done + # Check if job actually completed using sacct (if available) if [ $squeue_failures -ge 3 ]; then - break + if command -v sacct >/dev/null 2>&1; then + state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') + # Consider job done only if it reached a terminal state + case "$state" in + COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) + break + ;; + *) + # treat as transient failure, reset failures and continue polling + squeue_failures=0 + ;; + esac + else + # No sacct: avoid false positive by doing an extra check cycle + squeue_failures=2 + fi fi fi sleep 5 done +# Wait for output file to finish growing (stabilize) before stopping tail +if [ -f "$output_file" ]; then + last_size=-1 + same_count=0 + while true; do + size=$(stat -c%s "$output_file" 2>/dev/null || echo -1) + if [ "$size" -eq "$last_size" ] && [ "$size" -ge 0 ]; then + same_count=$((same_count + 1)) + else + same_count=0 + last_size=$size + fi + # two consecutive stable checks (~10s) implies file likely flushed + if [ $same_count -ge 2 ]; then + break + fi + sleep 5 + done +fi + # Stop tailing kill $tail_pid 2>/dev/null || true @@ -53,8 +103,32 @@ echo "" echo "=== Final output ===" cat "$output_file" -# Check exit status -exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+:[0-9]+' || echo "0:0") +# Check exit status with sacct fallback +exit_code="" + +# Try scontrol first (works for recent jobs) +scontrol_output=$(scontrol show job "$job_id" 2>/dev/null || echo "") +if [ -n "$scontrol_output" ]; then + exit_code=$(echo "$scontrol_output" | grep -oE 'ExitCode=[0-9]+:[0-9]+' | cut -d= -f2 || echo "") +fi + +# If scontrol failed or returned invalid job, try sacct (for completed/aged-out jobs) +if [ -z "$exit_code" ]; then + echo "Warning: scontrol failed to get exit code, trying sacct..." + sacct_output=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 || echo "") + if [ -n "$sacct_output" ]; then + exit_code="$sacct_output" + fi +fi + +# If we still can't determine exit code, fail explicitly +if [ -z "$exit_code" ]; then + echo "ERROR: Unable to determine exit status for job $job_id" + echo "Both scontrol and sacct failed to return valid exit code" + exit 1 +fi + +# Check if job succeeded if [ "$exit_code" != "0:0" ]; then echo "ERROR: Job $job_id failed with exit code $exit_code" exit 1 diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index cbe83de6b7..fbb6ba7072 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -114,7 +114,7 @@ jobs: submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ .github/workflows/$cluster/bench.sh $device $interface 2>&1) - job_id=$(echo "$submit_output" | grep -oP 'Submitted batch job \K[0-9]+' || echo "") + job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p') job_slug="bench-$device-$interface" output_file="${job_slug}.out" @@ -135,10 +135,20 @@ jobs: (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & master_pid=$! - wait $pr_pid && pr_exit=$? || pr_exit=$? - wait $master_pid && master_exit=$? || master_exit=$? 
- - if [ $pr_exit -ne 0 ] || [ $master_exit -ne 0 ]; then + # Wait and capture exit codes reliably + pr_exit=0 + master_exit=0 + + if ! wait "$pr_pid"; then + pr_exit=$? + fi + if ! wait "$master_pid"; then + master_exit=$? + fi + + # Explicitly check and quote to avoid test errors + if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then + echo "One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" exit 1 fi diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index 8ee9a256d9..feb33198ca 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -121,6 +121,7 @@ def bench(targets = None): except Exception as e: cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") + cons.print(f"[dim]{traceback.format_exc()}[/dim]") failed_cases.append(case.slug) finally: cons.unindent() @@ -203,7 +204,7 @@ def diff(): grind_time_value = lhs_summary[target.name]["grind"] / rhs_summary[target.name]["grind"] speedups[i] += f" & Grind: {grind_time_value:.2f}" - if grind_time_value <0.95: + if grind_time_value < 0.95: cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") err = 1 except Exception as e: diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index f1b04ba2fd..78d40d9eba 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -107,7 +107,7 @@ def __filter(cases_) -> typing.List[TestCase]: return selected_cases, skipped_cases def test(): - # pylint: disable=global-statement, global-variable-not-assigned + # pylint: disable=global-statement, global-variable-not-assigned, too-many-statements global nFAIL, nPASS, nSKIP, total_test_count global errors @@ -181,8 +181,13 @@ def test(): total_completed = nFAIL + nPASS cons.print() cons.unindent() + if total_completed > 0: + raise MFCException( + f"Excessive test failures: {nFAIL}/{total_completed} " + f"failed ({nFAIL/total_completed*100:.1f}%)" + ) raise MFCException( - f"Excessive test failures: {nFAIL}/{total_completed} failed ({nFAIL/total_completed*100:.1f}%)" + f"Excessive test failures: {nFAIL} failed, but no tests were completed." ) nSKIP = len(skipped_cases) @@ -242,8 +247,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): start_time = time.time() # Set timeout using threading.Timer (works in worker threads) - # Note: signal.alarm() only works in the main thread, so we use - # threading.Timer which works correctly in worker threads spawned by sched.sched + # Note: we intentionally do not use signal.alarm() here because signals + # only work in the main thread; sched.sched runs tests in worker threads. + # threading.Timer works correctly in this threaded context. 
timeout_flag = threading.Event() timeout_timer = threading.Timer(TEST_TIMEOUT_SECONDS, timeout_flag.set) timeout_timer.start() @@ -309,6 +315,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("test_all"): case.delete_output() + # Check timeout before launching the (potentially long) post-process run + if timeout_flag.is_set(): + raise TestTimeoutError("Test case exceeded 1 hour timeout") cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") common.file_write(out_filepath, cmd.stdout) @@ -329,10 +338,17 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}") except TestTimeoutError as exc: + log_path = os.path.join(case.get_dirpath(), 'out_pre_sim.txt') + if os.path.exists(log_path): + log_msg = f"Check the log at: {log_path}" + else: + log_msg = ( + f"Log file ({log_path}) may not exist if the timeout occurred early." + ) raise MFCException( f"Test {case} exceeded 1 hour timeout. " f"This may indicate a hung simulation or misconfigured case. " - f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}" + f"{log_msg}" ) from exc finally: timeout_timer.cancel() # Cancel timeout timer From db91d6ae244611adca7bd52fafac04f41037d810 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 8 Dec 2025 15:56:41 -0500 Subject: [PATCH 09/21] cleanup on the monitoring --- .github/scripts/monitor_slurm_job.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 7e66377956..e6bd4472d0 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -2,7 +2,15 @@ # Monitor a SLURM job and stream its output in real-time # Usage: monitor_slurm_job.sh -set -e +set -euo pipefail + +# Cleanup handler to prevent orphaned tail processes +cleanup() { + if [ -n "${tail_pid:-}" ]; then + kill "${tail_pid}" 2>/dev/null || true + fi +} +trap cleanup EXIT if [ $# -ne 2 ]; then echo "Usage: $0 " @@ -96,8 +104,8 @@ if [ -f "$output_file" ]; then done fi -# Stop tailing -kill $tail_pid 2>/dev/null || true +# Stop tailing (trap will also handle this on exit) +kill "${tail_pid}" 2>/dev/null || true echo "" echo "=== Final output ===" From 87fcd74b6222086089a19306640a917f0e9c5493 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 8 Dec 2025 16:12:31 -0500 Subject: [PATCH 10/21] better monitoring --- .github/scripts/monitor_slurm_job.sh | 14 +- .github/workflows/bench.yml | 28 +++- toolchain/mfc/bench.py | 203 ++++++++++++++------------- 3 files changed, 141 insertions(+), 104 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index e6bd4472d0..05eaf5a48b 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -50,15 +50,24 @@ while [ ! -f "$output_file" ]; do done echo "=== Streaming output for job $job_id ===" -# Stream output while job runs -tail -f "$output_file" & +# Stream output while job runs (explicitly redirect to ensure output visibility) +tail -f "$output_file" 2>&1 & tail_pid=$! 
+# Give tail a moment to start and show initial output +sleep 2 + # Wait for job to complete with retry logic for transient squeue failures squeue_failures=0 +heartbeat_counter=0 while true; do if squeue -j "$job_id" &>/dev/null; then squeue_failures=0 + # Print heartbeat every 60 seconds (12 iterations * 5 sec) + heartbeat_counter=$((heartbeat_counter + 1)) + if [ $((heartbeat_counter % 12)) -eq 0 ]; then + echo "[$(date +%H:%M:%S)] Job $job_id still running..." + fi else squeue_failures=$((squeue_failures + 1)) # Check if job actually completed using sacct (if available) @@ -68,6 +77,7 @@ while true; do # Consider job done only if it reached a terminal state case "$state" in COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) + echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state" break ;; *) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index fbb6ba7072..1c485c93a5 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -108,6 +108,7 @@ jobs: local interface=$3 local cluster=$4 + echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" # Submit job @@ -119,21 +120,33 @@ jobs: output_file="${job_slug}.out" if [ -z "$job_id" ]; then - echo "ERROR: Failed to submit job" + echo "[$dir] ERROR: Failed to submit job" echo "$submit_output" return 1 fi + echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" + # Use the monitoring script bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" + + echo "[$dir] Monitoring complete for job $job_id" } # Run both jobs with monitoring + echo "==========================================" + echo "Starting parallel benchmark jobs..." + echo "==========================================" + (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & pr_pid=$! + echo "PR job started in background (PID: $pr_pid)" (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & master_pid=$! + echo "Master job started in background (PID: $master_pid)" + + echo "Waiting for both jobs to complete..." # Wait and capture exit codes reliably pr_exit=0 @@ -141,16 +154,27 @@ jobs: if ! wait "$pr_pid"; then pr_exit=$? + echo "PR job exited with code: $pr_exit" + else + echo "PR job completed successfully" fi + if ! wait "$master_pid"; then master_exit=$? + echo "Master job exited with code: $master_exit" + else + echo "Master job completed successfully" fi # Explicitly check and quote to avoid test errors if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then - echo "One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" + echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" exit 1 fi + + echo "==========================================" + echo "Both benchmark jobs completed successfully!" 
+ echo "==========================================" - name: Generate & Post Comment run: | diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index feb33198ca..78b9b0fc65 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -29,118 +29,121 @@ def bench(targets = None): cons.print() cons.print(f"[bold]Benchmarking {format_list_to_string(ARG('targets'), 'magenta')} ([magenta]{os.path.relpath(bench_dirpath)}[/magenta]):[/bold]") cons.indent() - cons.print() - CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ] + try: + cons.print() - for case in CASES: - case.args = case.args + ARG("--") - case.path = os.path.abspath(case.path) + CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ] - # Validate case file exists early - if not os.path.exists(case.path): - raise MFCException(f"Benchmark case file not found: {case.path}") + for case in CASES: + case.args = case.args + ARG("--") + case.path = os.path.abspath(case.path) - results = { - "metadata": { - "invocation": sys.argv[1:], - "lock": dataclasses.asdict(CFG()) - }, - "cases": {}, - } + # Validate case file exists early + if not os.path.exists(case.path): + raise MFCException(f"Benchmark case file not found: {case.path}") - failed_cases = [] + results = { + "metadata": { + "invocation": sys.argv[1:], + "lock": dataclasses.asdict(CFG()) + }, + "cases": {}, + } - for i, case in enumerate(CASES): - summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml") - log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out") + failed_cases = [] - cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.slug} @ [bold]{os.path.relpath(case.path)}[/bold]") - cons.indent() - cons.print() - cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]") - cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]") - - try: - with open(log_filepath, "w") as log_file: - result = system( - ["./mfc.sh", "run", case.path, "--case-optimization"] + - ["--targets"] + [t.name for t in targets] + - ["--output-summary", summary_filepath] + - case.args + - ["--", "--gbpp", str(ARG('mem'))], - stdout=log_file, - stderr=subprocess.STDOUT) - - # Check return code (handle CompletedProcess or int defensively) - rc = result.returncode if hasattr(result, "returncode") else result - if rc != 0: - cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}") - cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") - failed_cases.append(case.slug) - continue + for i, case in enumerate(CASES): + summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml") + log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out") - # Validate summary file exists - if not os.path.exists(summary_filepath): - cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}") - cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]") - failed_cases.append(case.slug) - continue - - # Load summary - summary = file_load_yaml(summary_filepath) - - # Validate all targets have required data - validation_failed = False - for target in targets: - if target.name not in summary: - cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}") - validation_failed = True - break - - if "exec" not in summary[target.name]: - cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}") - validation_failed = True - break - - if target.name == "simulation" 
and "grind" not in summary[target.name]: - cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}") - validation_failed = True - break - - if validation_failed: + cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.slug} @ [bold]{os.path.relpath(case.path)}[/bold]") + cons.indent() + cons.print() + cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]") + cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]") + + try: + with open(log_filepath, "w") as log_file: + result = system( + ["./mfc.sh", "run", case.path, "--case-optimization"] + + ["--targets"] + [t.name for t in targets] + + ["--output-summary", summary_filepath] + + case.args + + ["--", "--gbpp", str(ARG('mem'))], + stdout=log_file, + stderr=subprocess.STDOUT) + + # Check return code (handle CompletedProcess or int defensively) + rc = result.returncode if hasattr(result, "returncode") else result + if rc != 0: + cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}") + cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]") + failed_cases.append(case.slug) + continue + + # Validate summary file exists + if not os.path.exists(summary_filepath): + cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}") + cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]") + failed_cases.append(case.slug) + continue + + # Load summary + summary = file_load_yaml(summary_filepath) + + # Validate all targets have required data + validation_failed = False + for target in targets: + if target.name not in summary: + cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}") + validation_failed = True + break + + if "exec" not in summary[target.name]: + cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}") + validation_failed = True + break + + if target.name == "simulation" and "grind" not in summary[target.name]: + cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}") + validation_failed = True + break + + if validation_failed: + failed_cases.append(case.slug) + continue + + # Add to results + results["cases"][case.slug] = { + "description": dataclasses.asdict(case), + "output_summary": summary, + } + cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully") + + except Exception as e: + cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") + cons.print(f"[dim]{traceback.format_exc()}[/dim]") failed_cases.append(case.slug) - continue - - # Add to results - results["cases"][case.slug] = { - "description": dataclasses.asdict(case), - "output_summary": summary, - } - cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully") - - except Exception as e: - cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}") - cons.print(f"[dim]{traceback.format_exc()}[/dim]") - failed_cases.append(case.slug) - finally: - cons.unindent() - - # Report results - if failed_cases: - cons.print() - cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]") - for slug in failed_cases: - cons.print(f" - {slug}") - cons.print() - raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed") + finally: + cons.unindent() + + # Report results + if failed_cases: + cons.print() + cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]") + 
for slug in failed_cases: + cons.print(f" - {slug}") + cons.print() + raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed") - # Write output - file_dump_yaml(ARG("output"), results) + # Write output + file_dump_yaml(ARG("output"), results) - cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].") + cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].") - cons.unindent() + finally: + cons.unindent() # TODO: This function is too long and not nicely written at all. Someone should From f468fe5403200981068a5070206c71a9cc8c0df3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 8 Dec 2025 23:09:11 -0500 Subject: [PATCH 11/21] fixup --- .github/scripts/run_parallel_benchmarks.sh | 78 +++++++++++++++++++++ .github/scripts/submit_and_monitor_bench.sh | 51 ++++++++++++++ .github/workflows/bench.yml | 78 +-------------------- 3 files changed, 130 insertions(+), 77 deletions(-) create mode 100755 .github/scripts/run_parallel_benchmarks.sh create mode 100755 .github/scripts/submit_and_monitor_bench.sh diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh new file mode 100755 index 0000000000..eaa333b003 --- /dev/null +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Run PR and master benchmarks in parallel and verify outputs +# Usage: run_parallel_benchmarks.sh + +set -euo pipefail + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi + +device="$1" +interface="$2" +cluster="$3" + +echo "==========================================" +echo "Starting parallel benchmark jobs..." +echo "==========================================" + +# Run both jobs with monitoring using dedicated script +(bash .github/scripts/submit_and_monitor_bench.sh pr "$device" "$interface" "$cluster") & +pr_pid=$! +echo "PR job started in background (PID: $pr_pid)" + +(bash .github/scripts/submit_and_monitor_bench.sh master "$device" "$interface" "$cluster") & +master_pid=$! +echo "Master job started in background (PID: $master_pid)" + +echo "Waiting for both jobs to complete..." + +# Wait and capture exit codes reliably +pr_exit=0 +master_exit=0 + +if ! wait "$pr_pid"; then + pr_exit=$? + echo "PR job exited with code: $pr_exit" +else + echo "PR job completed successfully" +fi + +if ! wait "$master_pid"; then + master_exit=$? + echo "Master job exited with code: $master_exit" +else + echo "Master job completed successfully" +fi + +# Check if either job failed +if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then + echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" + exit 1 +fi + +echo "==========================================" +echo "Both benchmark jobs completed successfully!" +echo "==========================================" + +# Final verification that output files exist before proceeding +pr_yaml="pr/bench-${device}-${interface}.yaml" +master_yaml="master/bench-${device}-${interface}.yaml" + +if [ ! -f "$pr_yaml" ]; then + echo "ERROR: PR benchmark output not found: $pr_yaml" + ls -la pr/ || true + exit 1 +fi + +if [ ! 
-f "$master_yaml" ]; then + echo "ERROR: Master benchmark output not found: $master_yaml" + ls -la master/ || true + exit 1 +fi + +echo "Verified both YAML files exist:" +echo " - $pr_yaml" +echo " - $master_yaml" + diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh new file mode 100755 index 0000000000..6ce68909f2 --- /dev/null +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Submit and monitor a benchmark job on a SLURM cluster +# Usage: submit_and_monitor_bench.sh + +set -euo pipefail + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +dir="$1" +device="$2" +interface="$3" +cluster="$4" + +echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." +cd "$dir" + +# Submit job +submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ + .github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1) + +job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p') +job_slug="bench-$device-$interface" +output_file="${job_slug}.out" + +if [ -z "$job_id" ]; then + echo "[$dir] ERROR: Failed to submit job" + echo "$submit_output" + exit 1 +fi + +echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" + +# Use the monitoring script +bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" + +echo "[$dir] Monitoring complete for job $job_id" + +# Verify the YAML output file was created +yaml_file="${job_slug}.yaml" +if [ ! -f "$yaml_file" ]; then + echo "[$dir] ERROR: Expected output file not found: $yaml_file" + echo "[$dir] Directory contents:" + ls -la *.yaml 2>/dev/null || echo " No YAML files found" + exit 1 +fi + +echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)" + diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 1c485c93a5..cc3099de00 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -98,83 +98,7 @@ jobs: wait %1 && wait %2 - name: Bench (Master v. PR) - run: | - set -e - - # Function to submit and monitor using extracted script - submit_and_monitor() { - local dir=$1 - local device=$2 - local interface=$3 - local cluster=$4 - - echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." - cd "$dir" - - # Submit job - submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ - .github/workflows/$cluster/bench.sh $device $interface 2>&1) - - job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p') - job_slug="bench-$device-$interface" - output_file="${job_slug}.out" - - if [ -z "$job_id" ]; then - echo "[$dir] ERROR: Failed to submit job" - echo "$submit_output" - return 1 - fi - - echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" - - # Use the monitoring script - bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" - - echo "[$dir] Monitoring complete for job $job_id" - } - - # Run both jobs with monitoring - echo "==========================================" - echo "Starting parallel benchmark jobs..." - echo "==========================================" - - (submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & - pr_pid=$! - echo "PR job started in background (PID: $pr_pid)" - - (submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) & - master_pid=$! 
- echo "Master job started in background (PID: $master_pid)" - - echo "Waiting for both jobs to complete..." - - # Wait and capture exit codes reliably - pr_exit=0 - master_exit=0 - - if ! wait "$pr_pid"; then - pr_exit=$? - echo "PR job exited with code: $pr_exit" - else - echo "PR job completed successfully" - fi - - if ! wait "$master_pid"; then - master_exit=$? - echo "Master job exited with code: $master_exit" - else - echo "Master job completed successfully" - fi - - # Explicitly check and quote to avoid test errors - if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then - echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}" - exit 1 - fi - - echo "==========================================" - echo "Both benchmark jobs completed successfully!" - echo "==========================================" + run: bash .github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Generate & Post Comment run: | From d9e06f6f8b45c7994362342f7342e122d26dfc55 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 00:21:26 -0500 Subject: [PATCH 12/21] fix --- .github/scripts/run_parallel_benchmarks.sh | 9 ++++++--- .github/scripts/submit_and_monitor_bench.sh | 7 +++++-- .github/workflows/bench.yml | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index eaa333b003..eadafe29ef 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -13,16 +13,19 @@ device="$1" interface="$2" cluster="$3" +# Get the directory where this script lives (pr/.github/scripts/) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + echo "==========================================" echo "Starting parallel benchmark jobs..." echo "==========================================" -# Run both jobs with monitoring using dedicated script -(bash .github/scripts/submit_and_monitor_bench.sh pr "$device" "$interface" "$cluster") & +# Run both jobs with monitoring using dedicated script from PR +(bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster") & pr_pid=$! echo "PR job started in background (PID: $pr_pid)" -(bash .github/scripts/submit_and_monitor_bench.sh master "$device" "$interface" "$cluster") & +(bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster") & master_pid=$! echo "Master job started in background (PID: $master_pid)" diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 6ce68909f2..32061cf2f9 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -14,6 +14,9 @@ device="$2" interface="$3" cluster="$4" +# Get the directory where this script lives +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." 
cd "$dir" @@ -33,8 +36,8 @@ fi echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" -# Use the monitoring script -bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file" +# Use the monitoring script from PR (where this script lives) +bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" echo "[$dir] Monitoring complete for job $job_id" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index cc3099de00..2ccdfca87a 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -98,7 +98,7 @@ jobs: wait %1 && wait %2 - name: Bench (Master v. PR) - run: bash .github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} + run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Generate & Post Comment run: | From e17eb2751ac6bc7433d6af6da9cd8c234060d4f4 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 00:51:40 -0500 Subject: [PATCH 13/21] fixup --- .github/scripts/monitor_slurm_job.sh | 3 ++- .github/scripts/run_parallel_benchmarks.sh | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 05eaf5a48b..5b65d5c0a4 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -51,7 +51,8 @@ done echo "=== Streaming output for job $job_id ===" # Stream output while job runs (explicitly redirect to ensure output visibility) -tail -f "$output_file" 2>&1 & +# Use stdbuf for unbuffered output to ensure immediate display in CI logs +stdbuf -oL -eL tail -f "$output_file" 2>&1 & tail_pid=$! # Give tail a moment to start and show initial output diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index eadafe29ef..442f740eac 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -21,11 +21,12 @@ echo "Starting parallel benchmark jobs..." echo "==========================================" # Run both jobs with monitoring using dedicated script from PR -(bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster") & +# Use stdbuf for line-buffered output and prefix each line for clarity +(stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) & pr_pid=$! echo "PR job started in background (PID: $pr_pid)" -(bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster") & +(stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) & master_pid=$! echo "Master job started in background (PID: $master_pid)" From db77fb9f70aa093f2f08263169ba5ec677f3abf6 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 01:08:59 -0500 Subject: [PATCH 14/21] fix --- .github/scripts/submit_and_monitor_bench.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 32061cf2f9..8e4b124107 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -47,6 +47,11 @@ if [ ! 
-f "$yaml_file" ]; then echo "[$dir] ERROR: Expected output file not found: $yaml_file" echo "[$dir] Directory contents:" ls -la *.yaml 2>/dev/null || echo " No YAML files found" + echo "" + echo "[$dir] Last 100 lines of job output ($output_file):" + echo "----------------------------------------" + tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file" + echo "----------------------------------------" exit 1 fi From 09cda88a3a1525dab505e7c32730ba3c36321024 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 08:41:43 -0500 Subject: [PATCH 15/21] fix up some printing --- .github/scripts/monitor_slurm_job.sh | 70 +++++++++++++++++------ .github/workflows/phoenix/submit-bench.sh | 2 +- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 5b65d5c0a4..27472e01ef 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -50,26 +50,33 @@ while [ ! -f "$output_file" ]; do done echo "=== Streaming output for job $job_id ===" -# Stream output while job runs (explicitly redirect to ensure output visibility) -# Use stdbuf for unbuffered output to ensure immediate display in CI logs -stdbuf -oL -eL tail -f "$output_file" 2>&1 & -tail_pid=$! -# Give tail a moment to start and show initial output -sleep 2 +# Start tail and redirect its output to file descriptor 3 for multiplexing +# This allows us to stream tail output while also printing heartbeat messages +exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1) +tail_pid=$! -# Wait for job to complete with retry logic for transient squeue failures +# Monitor job status and stream output simultaneously squeue_failures=0 -heartbeat_counter=0 +last_heartbeat=$(date +%s) + while true; do - if squeue -j "$job_id" &>/dev/null; then - squeue_failures=0 - # Print heartbeat every 60 seconds (12 iterations * 5 sec) - heartbeat_counter=$((heartbeat_counter + 1)) - if [ $((heartbeat_counter % 12)) -eq 0 ]; then - echo "[$(date +%H:%M:%S)] Job $job_id still running..." + # Try to read from tail output (non-blocking via timeout) + # Read multiple lines if available to avoid falling behind + lines_read=0 + while IFS= read -r -t 0.1 line <&3 2>/dev/null; do + echo "$line" + lines_read=$((lines_read + 1)) + last_heartbeat=$(date +%s) + # Limit burst reads to avoid starving the status check + if [ $lines_read -ge 100 ]; then + break fi - else + done + + # Check job status + current_time=$(date +%s) + if ! squeue -j "$job_id" &>/dev/null; then squeue_failures=$((squeue_failures + 1)) # Check if job actually completed using sacct (if available) if [ $squeue_failures -ge 3 ]; then @@ -87,14 +94,41 @@ while true; do ;; esac else - # No sacct: avoid false positive by doing an extra check cycle - squeue_failures=2 + # No sacct: assume job completed after 3 failures + echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue" + break fi fi + else + squeue_failures=0 + # Print heartbeat if no output for 60 seconds + if [ $((current_time - last_heartbeat)) -ge 60 ]; then + echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..." + last_heartbeat=$current_time + fi + fi + + # Sleep briefly between status checks + sleep 1 +done + +# Drain any remaining output from tail after job completes +echo "Draining remaining output..." 
+drain_count=0 +while IFS= read -r -t 0.5 line <&3 2>/dev/null; do + echo "$line" + drain_count=$((drain_count + 1)) + # Safety limit to avoid infinite loop + if [ $drain_count -ge 10000 ]; then + echo "Warning: Truncating remaining output after 10000 lines" + break fi - sleep 5 done +# Close the file descriptor and kill tail +exec 3<&- +kill "${tail_pid}" 2>/dev/null || true + # Wait for output file to finish growing (stabilize) before stopping tail if [ -f "$output_file" ]; then last_size=-1 diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 21c1a557ab..3310df6886 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -42,7 +42,7 @@ sbatch < Date: Tue, 9 Dec 2025 09:41:47 -0500 Subject: [PATCH 16/21] no -W wait --- .github/workflows/frontier/submit-bench.sh | 1 - .github/workflows/phoenix/submit-bench.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/frontier/submit-bench.sh b/.github/workflows/frontier/submit-bench.sh index 9dbc5e5817..4374231eca 100644 --- a/.github/workflows/frontier/submit-bench.sh +++ b/.github/workflows/frontier/submit-bench.sh @@ -35,7 +35,6 @@ $sbatch_device_opts #SBATCH -t 02:59:00 # Duration of the job (Ex: 15 mins) #SBATCH -o$job_slug.out # Combined output and error messages file #SBATCH -p extended # Extended partition for shorter queues -#SBATCH -W # Do not exit until the submitted job terminates. set -e set -x diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 3310df6886..7ae85e66fe 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -45,7 +45,6 @@ $sbatch_device_opts #SBATCH -t 04:00:00 # Duration of the job (Ex: 15 mins) #SBATCH -q embers # QOS Name #SBATCH -o$job_slug.out # Combined output and error messages file -#SBATCH -W # Do not exit until the submitted job terminates. set -e set -x From 8e164b6369cb01bbed809da2a3640b0423996071 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 21:34:14 -0500 Subject: [PATCH 17/21] fix --- .github/scripts/run_parallel_benchmarks.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index 442f740eac..a0080931ba 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -73,6 +73,9 @@ fi if [ ! 
-f "$master_yaml" ]; then echo "ERROR: Master benchmark output not found: $master_yaml" ls -la master/ || true + echo "" + echo "Last 100 lines of master log:" + tail -n 100 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log" exit 1 fi From cfad09966c474215b769b03c3d8effc9508143ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 9 Dec 2025 21:44:21 -0500 Subject: [PATCH 18/21] tweak --- .github/scripts/run_parallel_benchmarks.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index a0080931ba..050568a73e 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -22,11 +22,11 @@ echo "==========================================" # Run both jobs with monitoring using dedicated script from PR # Use stdbuf for line-buffered output and prefix each line for clarity -(stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) & +(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) & pr_pid=$! echo "PR job started in background (PID: $pr_pid)" -(stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) & +(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) & master_pid=$! echo "Master job started in background (PID: $master_pid)" From 4a45ef68fdd6db7609383265c76a4bf03e21b3a1 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 10 Dec 2025 09:45:56 -0500 Subject: [PATCH 19/21] more debug output --- .github/scripts/run_parallel_benchmarks.sh | 3 +++ .gitignore | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index 050568a73e..e3557b999b 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -67,6 +67,9 @@ master_yaml="master/bench-${device}-${interface}.yaml" if [ ! -f "$pr_yaml" ]; then echo "ERROR: PR benchmark output not found: $pr_yaml" ls -la pr/ || true + echo "" + echo "Last 100 lines of PR log:" + tail -n 100 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log" exit 1 fi diff --git a/.gitignore b/.gitignore index 2a6eea8203..c8b2c016b4 100644 --- a/.gitignore +++ b/.gitignore @@ -91,5 +91,5 @@ benchmarks/*.png *.mkv *.avi -packaging/spack/spack-test -.spack \ No newline at end of file +**isolation_rules/ +**.supercode/ \ No newline at end of file From b5cb016b9015547083cd838499de32511815c149 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 10 Dec 2025 09:58:59 -0500 Subject: [PATCH 20/21] minor fixup --- .github/scripts/run_parallel_benchmarks.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index e3557b999b..06b6eaa8d3 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -36,15 +36,17 @@ echo "Waiting for both jobs to complete..." 
pr_exit=0 master_exit=0 -if ! wait "$pr_pid"; then - pr_exit=$? +wait "$pr_pid" +pr_exit=$? +if [ "$pr_exit" -ne 0 ]; then echo "PR job exited with code: $pr_exit" else echo "PR job completed successfully" fi -if ! wait "$master_pid"; then - master_exit=$? +wait "$master_pid" +master_exit=$? +if [ "$master_exit" -ne 0 ]; then echo "Master job exited with code: $master_exit" else echo "Master job completed successfully" From 66176e74fa543502df4f95e905b46509fdf86f6e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 10 Dec 2025 15:54:29 -0500 Subject: [PATCH 21/21] fix python --- toolchain/bootstrap/python.sh | 56 ++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/toolchain/bootstrap/python.sh b/toolchain/bootstrap/python.sh index cc2be64be1..dc6f0c1de9 100644 --- a/toolchain/bootstrap/python.sh +++ b/toolchain/bootstrap/python.sh @@ -29,7 +29,8 @@ if [ -f "$(pwd)/build/venv/bin/activate" ]; then fi fi -if ! command -v pip3 > /dev/null 2>&1 && [ ! -f "$(pwd)/build/venv/bin/activate" ]; then +# Only bootstrap pip if we don't already have a venv +if [ ! -f "$(pwd)/build/venv/bin/activate" ]; then # Check whether python3 is in the $PATH / is accessible. if ! command -v python3 > /dev/null 2>&1; then error "Couldn't find$MAGENTA Python$COLOR_RESET. Please ensure it is discoverable." @@ -39,26 +40,41 @@ if ! command -v pip3 > /dev/null 2>&1 && [ ! -f "$(pwd)/build/venv/bin/activate" assert_python_compatible - get_pip_url="https://bootstrap.pypa.io/pip/get-pip.py" - - warn "$MAGENTA""Python$COLOR_RESET's$MAGENTA PIP$COLOR_RESET is not installed." - log "Downloading$MAGENTA Python$COLOR_RESET's$MAGENTA PIP$COLOR_RESET from $get_pip_url..." - - if ! wget -O "$(pwd)/build/get-pip.py" "$get_pip_url"; then - error "Couldn't download get-pip.py." - - exit 1 - fi - - # Suppress PIP version warning (out of date) - export PIP_DISABLE_PIP_VERSION_CHECK=1 - if ! python3 "$(pwd)/build/get-pip.py" --user; then - error "Couldn't install$MAGENTA pip$COLOR_RESET with get-pip.py" - - exit 1 + # Check if pip is already available as a Python module + # This works on both laptops and HPC systems with module-loaded Python + if ! python3 -c "import pip" > /dev/null 2>&1; then + warn "$MAGENTA""Python$COLOR_RESET's$MAGENTA PIP$COLOR_RESET is not installed." + + # Try ensurepip first (standard library, safe) + log "Attempting to install pip via ensurepip..." + if python3 -m ensurepip --upgrade 2>/dev/null; then + ok "Installed pip via ensurepip." + else + # Fall back to get-pip.py only if ensurepip fails + get_pip_url="https://bootstrap.pypa.io/pip/get-pip.py" + log "Downloading$MAGENTA Python$COLOR_RESET's$MAGENTA PIP$COLOR_RESET from $get_pip_url..." + + if ! wget -O "$(pwd)/build/get-pip.py" "$get_pip_url"; then + error "Couldn't download get-pip.py." + exit 1 + fi + + # Suppress PIP version warning (out of date) + export PIP_DISABLE_PIP_VERSION_CHECK=1 + if ! python3 "$(pwd)/build/get-pip.py" --user; then + error "Couldn't install$MAGENTA pip$COLOR_RESET with get-pip.py" + exit 1 + fi + + ok "Installed pip via get-pip.py." + + # Ensure user-site bin directory is on PATH for this session + user_base_bin="$(python3 -m site --user-base)/bin" + if [ -d "$user_base_bin" ]; then + export PATH="$user_base_bin:$PATH" + fi + fi fi - - ok "Installed pip." fi
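
Taken together, the patches above settle on one submit-and-poll pattern for SLURM-backed CI: submit with sbatch, recover the job ID from its "Submitted batch job" line, stream the job's output file while the job runs, and treat the job as finished only once sacct reports a terminal state. The sketch below is a minimal, standalone illustration of that pattern; it is not part of the patch series, and the script argument, output-file naming, and polling intervals are illustrative assumptions only.

    #!/bin/bash
    # Minimal sketch (not part of the patch series): submit one SLURM batch job,
    # stream its output file, and poll sacct until the job reaches a terminal state.
    # Script name, output-file naming, and intervals are illustrative assumptions.
    set -euo pipefail

    batch_script="${1:?usage: $0 <batch-script> [output-file]}"
    out_file="${2:-job.out}"   # must match the #SBATCH -o setting in the batch script

    # sbatch prints "Submitted batch job <id>" on success.
    job_id=$(sbatch "$batch_script" | grep -oE '[0-9]+' | head -n 1)
    echo "Submitted job $job_id; waiting for $out_file"

    # Wait for the scheduler to start the job and create its output file.
    # (The real monitor script also bounds this wait and checks squeue.)
    until [ -f "$out_file" ]; do sleep 5; done

    # Stream output while the job runs.
    tail -f "$out_file" &
    tail_pid=$!

    # Trust sacct's terminal states rather than squeue alone, since squeue can
    # fail transiently or stop listing a job immediately after it finishes.
    state=""
    while true; do
        state=$(sacct -j "$job_id" -n -X -o State 2>/dev/null | awk 'NR==1 {print $1}' || true)
        case "$state" in
            COMPLETED|FAILED|CANCELLED*|TIMEOUT|OUT_OF_MEMORY) break ;;
            *) sleep 10 ;;
        esac
    done

    kill "$tail_pid" 2>/dev/null || true
    echo "Job $job_id finished with state: $state"
    [ "$state" = "COMPLETED" ]

The scripts under .github/scripts/ in the patches above additionally retry transient squeue failures, print heartbeats while waiting, and verify afterwards that the expected bench-<device>-<interface>.yaml summary was produced; the sketch omits those details for brevity.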