benchmarks hardening! #1078
Merged: sbryngelson merged 23 commits into MFlowCode:master from sbryngelson:benchmark-hardening on Dec 11, 2025.
Changes from all commits (23 commits, all authored by sbryngelson):
- e8090bd  benchmarks hardening!
- 8c01b99  lint error
- 2e1eb87  minor
- 0668fed  forks look nicer now i think
- f79f4d1  more fixing!
- 76a6e63  clean linter
- 7333098  fix some more suggestions
- b62854c  more hardening :O
- db91d6a  cleanup on the monitoring
- 87fcd74  better monitoring
- f468fe5  fixup
- d9e06f6  fix
- e17eb27  fixup
- db77fb9  fix
- 09cda88  fix up some printing
- 3a77897  no -W wait
- 8e164b6  fix
- cfad099  tweak
- 53dce4a  Merge branch 'master' into benchmark-hardening
- 4a45ef6  more debug output
- 9dd52cc  Merge branch 'benchmark-hardening' of https://github.com/sbryngelson/…
- b5cb016  minor fixup
- 66176e7  fix python
.github/scripts/monitor_slurm_job.sh (new file):

```bash
#!/bin/bash
# Monitor a SLURM job and stream its output in real-time
# Usage: monitor_slurm_job.sh <job_id> <output_file>

set -euo pipefail

# Cleanup handler to prevent orphaned tail processes
cleanup() {
    if [ -n "${tail_pid:-}" ]; then
        kill "${tail_pid}" 2>/dev/null || true
    fi
}
trap cleanup EXIT

if [ $# -ne 2 ]; then
    echo "Usage: $0 <job_id> <output_file>"
    exit 1
fi

job_id="$1"
output_file="$2"

echo "Submitted batch job $job_id"
echo "Monitoring output file: $output_file"

# Wait for file to appear with retry logic for transient squeue failures
echo "Waiting for job to start..."
squeue_retries=0
max_squeue_retries=5
while [ ! -f "$output_file" ]; do
    # Check if job is still queued/running
    if squeue -j "$job_id" &>/dev/null; then
        squeue_retries=0  # Reset on success
        sleep 5
    else
        squeue_retries=$((squeue_retries + 1))
        if [ $squeue_retries -ge $max_squeue_retries ]; then
            # Job not in queue and output file doesn't exist
            if [ ! -f "$output_file" ]; then
                echo "ERROR: Job $job_id not in queue and output file not created"
                exit 1
            fi
            break
        fi
        # Exponential backoff
        sleep_time=$((2 ** squeue_retries))
        echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
        sleep $sleep_time
    fi
done

echo "=== Streaming output for job $job_id ==="

# Start tail and redirect its output to file descriptor 3 for multiplexing
# This allows us to stream tail output while also printing heartbeat messages
exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
tail_pid=$!

# Monitor job status and stream output simultaneously
squeue_failures=0
last_heartbeat=$(date +%s)

while true; do
    # Try to read from tail output (non-blocking via timeout)
    # Read multiple lines if available to avoid falling behind
    lines_read=0
    while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
        echo "$line"
        lines_read=$((lines_read + 1))
        last_heartbeat=$(date +%s)
        # Limit burst reads to avoid starving the status check
        if [ $lines_read -ge 100 ]; then
            break
        fi
    done

    # Check job status
    current_time=$(date +%s)
    if ! squeue -j "$job_id" &>/dev/null; then
        squeue_failures=$((squeue_failures + 1))
        # Check if job actually completed using sacct (if available)
        if [ $squeue_failures -ge 3 ]; then
            if command -v sacct >/dev/null 2>&1; then
                state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
                # Consider job done only if it reached a terminal state
                case "$state" in
                    COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
                        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
                        break
                        ;;
                    *)
                        # treat as transient failure, reset failures and continue polling
                        squeue_failures=0
                        ;;
                esac
            else
                # No sacct: assume job completed after 3 failures
                echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
                break
            fi
        fi
    else
        squeue_failures=0
        # Print heartbeat if no output for 60 seconds
        if [ $((current_time - last_heartbeat)) -ge 60 ]; then
            echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
            last_heartbeat=$current_time
        fi
    fi

    # Sleep briefly between status checks
    sleep 1
done

# Drain any remaining output from tail after job completes
echo "Draining remaining output..."
drain_count=0
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
    echo "$line"
    drain_count=$((drain_count + 1))
    # Safety limit to avoid infinite loop
    if [ $drain_count -ge 10000 ]; then
        echo "Warning: Truncating remaining output after 10000 lines"
        break
    fi
done

# Close the file descriptor and kill tail
exec 3<&-
kill "${tail_pid}" 2>/dev/null || true

# Wait for output file to finish growing (stabilize) before stopping tail
if [ -f "$output_file" ]; then
    last_size=-1
    same_count=0
    while true; do
        size=$(stat -c%s "$output_file" 2>/dev/null || echo -1)
        if [ "$size" -eq "$last_size" ] && [ "$size" -ge 0 ]; then
            same_count=$((same_count + 1))
        else
            same_count=0
            last_size=$size
        fi
        # two consecutive stable checks (~10s) implies file likely flushed
        if [ $same_count -ge 2 ]; then
            break
        fi
        sleep 5
    done
fi

# Stop tailing (trap will also handle this on exit)
kill "${tail_pid}" 2>/dev/null || true

echo ""
echo "=== Final output ==="
cat "$output_file"

# Check exit status with sacct fallback
exit_code=""

# Try scontrol first (works for recent jobs)
scontrol_output=$(scontrol show job "$job_id" 2>/dev/null || echo "")
if [ -n "$scontrol_output" ]; then
    exit_code=$(echo "$scontrol_output" | grep -oE 'ExitCode=[0-9]+:[0-9]+' | cut -d= -f2 || echo "")
fi

# If scontrol failed or returned invalid job, try sacct (for completed/aged-out jobs)
if [ -z "$exit_code" ]; then
    echo "Warning: scontrol failed to get exit code, trying sacct..."
    sacct_output=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 || echo "")
    if [ -n "$sacct_output" ]; then
        exit_code="$sacct_output"
    fi
fi

# If we still can't determine exit code, fail explicitly
if [ -z "$exit_code" ]; then
    echo "ERROR: Unable to determine exit status for job $job_id"
    echo "Both scontrol and sacct failed to return valid exit code"
    exit 1
fi

# Check if job succeeded
if [ "$exit_code" != "0:0" ]; then
    echo "ERROR: Job $job_id failed with exit code $exit_code"
    exit 1
fi

echo "Job $job_id completed successfully"
exit 0
```
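For orientation, here is a minimal sketch of how this monitor could be driven by hand outside the workflow, assuming a SLURM cluster where `sbatch --parsable` prints just the job id. The job script name and output file name below are illustrative assumptions, not values taken from the PR; in CI, submission is handled by submit_and_monitor_bench.sh, shown further down.

```bash
# Hypothetical manual invocation; my_bench.sbatch and the .out file name are assumptions.
job_id=$(sbatch --parsable my_bench.sbatch)
bash .github/scripts/monitor_slurm_job.sh "$job_id" "bench-gpu-mpi.out"
echo "monitor exited with status $?"
```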
.github/scripts/run_parallel_benchmarks.sh (new file):

```bash
#!/bin/bash
# Run PR and master benchmarks in parallel and verify outputs
# Usage: run_parallel_benchmarks.sh <device> <interface> <cluster>

set -euo pipefail

if [ $# -ne 3 ]; then
    echo "Usage: $0 <device> <interface> <cluster>"
    exit 1
fi

device="$1"
interface="$2"
cluster="$3"

# Get the directory where this script lives (pr/.github/scripts/)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "=========================================="
echo "Starting parallel benchmark jobs..."
echo "=========================================="

# Run both jobs with monitoring using dedicated script from PR
# Use stdbuf for line-buffered output and prefix each line for clarity
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
pr_pid=$!
echo "PR job started in background (PID: $pr_pid)"

(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) &
master_pid=$!
echo "Master job started in background (PID: $master_pid)"

echo "Waiting for both jobs to complete..."

# Wait and capture exit codes reliably
pr_exit=0
master_exit=0

wait "$pr_pid"
pr_exit=$?
if [ "$pr_exit" -ne 0 ]; then
    echo "PR job exited with code: $pr_exit"
else
    echo "PR job completed successfully"
fi

wait "$master_pid"
master_exit=$?
if [ "$master_exit" -ne 0 ]; then
    echo "Master job exited with code: $master_exit"
else
    echo "Master job completed successfully"
fi

# Check if either job failed
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
    echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
    exit 1
fi

echo "=========================================="
echo "Both benchmark jobs completed successfully!"
echo "=========================================="

# Final verification that output files exist before proceeding
pr_yaml="pr/bench-${device}-${interface}.yaml"
master_yaml="master/bench-${device}-${interface}.yaml"

if [ ! -f "$pr_yaml" ]; then
    echo "ERROR: PR benchmark output not found: $pr_yaml"
    ls -la pr/ || true
    echo ""
    echo "Last 100 lines of PR log:"
    tail -n 100 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo "  Could not read PR log"
    exit 1
fi

if [ ! -f "$master_yaml" ]; then
    echo "ERROR: Master benchmark output not found: $master_yaml"
    ls -la master/ || true
    echo ""
    echo "Last 100 lines of master log:"
    tail -n 100 "master/bench-${device}-${interface}.out" 2>/dev/null || echo "  Could not read master log"
    exit 1
fi

echo "Verified both YAML files exist:"
echo "  - $pr_yaml"
echo "  - $master_yaml"
```
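A hedged example of how the CI job might invoke this wrapper, assuming the runner has already checked out the PR into `pr/` and master into `master/` (directory names inferred from the script's own `pr_yaml`/`master_yaml` paths). The `gpu`, `mpi`, and `phoenix` arguments are illustrative placeholders, not values confirmed by this diff:

```bash
# Illustrative values only; device/interface/cluster depend on the workflow matrix.
bash pr/.github/scripts/run_parallel_benchmarks.sh gpu mpi phoenix
```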
.github/scripts/submit_and_monitor_bench.sh (new file):

```bash
#!/bin/bash
# Submit and monitor a benchmark job on a SLURM cluster
# Usage: submit_and_monitor_bench.sh <dir> <device> <interface> <cluster>

set -euo pipefail

if [ $# -ne 4 ]; then
    echo "Usage: $0 <dir> <device> <interface> <cluster>"
    exit 1
fi

dir="$1"
device="$2"
interface="$3"
cluster="$4"

# Get the directory where this script lives
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
cd "$dir"

# Submit job
submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
    .github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1)

job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
job_slug="bench-$device-$interface"
output_file="${job_slug}.out"

if [ -z "$job_id" ]; then
    echo "[$dir] ERROR: Failed to submit job"
    echo "$submit_output"
    exit 1
fi

echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"

# Use the monitoring script from PR (where this script lives)
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"

echo "[$dir] Monitoring complete for job $job_id"

# Verify the YAML output file was created
yaml_file="${job_slug}.yaml"
if [ ! -f "$yaml_file" ]; then
    echo "[$dir] ERROR: Expected output file not found: $yaml_file"
    echo "[$dir] Directory contents:"
    ls -la *.yaml 2>/dev/null || echo "  No YAML files found"
    echo ""
    echo "[$dir] Last 100 lines of job output ($output_file):"
    echo "----------------------------------------"
    tail -n 100 "$output_file" 2>/dev/null || echo "  Could not read output file"
    echo "----------------------------------------"
    exit 1
fi

echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)"
```
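The job id is recovered from the submit script's output with the `sed` expression above. A quick sanity check of that pattern against the standard `sbatch` message, using a made-up job number:

```bash
# The job number here is made up; any digit string after "Submitted batch job" is captured.
echo "Submitted batch job 123456" \
  | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p'
# prints: 123456
```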
Bug: Infinite loop when sacct command unavailable in monitoring script
When `sacct` is not available on the system and `squeue` fails to find a completed job, the script enters an infinite loop. After 3 consecutive `squeue` failures, the code checks for `sacct`. If it is unavailable, it sets `squeue_failures=2`, which means the next iteration increments the counter back to 3, triggers the same check, and resets it to 2 again, forever. This causes the benchmark workflow to hang indefinitely on systems without `sacct`.
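Note that the version of monitor_slurm_job.sh shown above already breaks out of the loop when `sacct` is missing, so this comment appears to describe an earlier revision. A minimal sketch of the kind of change the reviewer is asking for, assuming the counter-reset behavior described in the comment (variable names follow the script; the surrounding while loop is elided):

```bash
# Hypothetical fix sketch, not the merged code: once squeue has failed three
# times in a row and sacct is not installed, stop polling instead of resetting
# squeue_failures, which would otherwise re-arm the same check forever.
if [ "$squeue_failures" -ge 3 ]; then
    if command -v sacct >/dev/null 2>&1; then
        : # query sacct for a terminal state, as the script above does
    else
        echo "squeue no longer sees job $job_id and sacct is unavailable; assuming it finished"
        break  # leave the monitoring loop rather than looping indefinitely
    fi
fi
```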