
Commit b220ebf: benchmarks hardening! (#1078)

1 parent: c91edb9

File tree: 11 files changed (+686, -144 lines)
.github/scripts/monitor_slurm_job.sh

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
#!/bin/bash
# Monitor a SLURM job and stream its output in real-time
# Usage: monitor_slurm_job.sh <job_id> <output_file>

set -euo pipefail

# Cleanup handler to prevent orphaned tail processes
cleanup() {
    if [ -n "${tail_pid:-}" ]; then
        kill "${tail_pid}" 2>/dev/null || true
    fi
}
trap cleanup EXIT

if [ $# -ne 2 ]; then
    echo "Usage: $0 <job_id> <output_file>"
    exit 1
fi

job_id="$1"
output_file="$2"

echo "Submitted batch job $job_id"
echo "Monitoring output file: $output_file"

# Wait for the file to appear, with retry logic for transient squeue failures
echo "Waiting for job to start..."
squeue_retries=0
max_squeue_retries=5
while [ ! -f "$output_file" ]; do
    # Check if job is still queued/running
    if squeue -j "$job_id" &>/dev/null; then
        squeue_retries=0  # Reset on success
        sleep 5
    else
        squeue_retries=$((squeue_retries + 1))
        if [ $squeue_retries -ge $max_squeue_retries ]; then
            # Job not in queue and output file doesn't exist
            if [ ! -f "$output_file" ]; then
                echo "ERROR: Job $job_id not in queue and output file not created"
                exit 1
            fi
            break
        fi
        # Exponential backoff
        sleep_time=$((2 ** squeue_retries))
        echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
        sleep $sleep_time
    fi
done

echo "=== Streaming output for job $job_id ==="

# Start tail and redirect its output to file descriptor 3 for multiplexing
# This allows us to stream tail output while also printing heartbeat messages
exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
tail_pid=$!

# Monitor job status and stream output simultaneously
squeue_failures=0
last_heartbeat=$(date +%s)

while true; do
    # Try to read from tail output (non-blocking via timeout)
    # Read multiple lines if available to avoid falling behind
    lines_read=0
    while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
        echo "$line"
        lines_read=$((lines_read + 1))
        last_heartbeat=$(date +%s)
        # Limit burst reads to avoid starving the status check
        if [ $lines_read -ge 100 ]; then
            break
        fi
    done

    # Check job status
    current_time=$(date +%s)
    if ! squeue -j "$job_id" &>/dev/null; then
        squeue_failures=$((squeue_failures + 1))
        # Check if the job actually completed using sacct (if available)
        if [ $squeue_failures -ge 3 ]; then
            if command -v sacct >/dev/null 2>&1; then
                state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
                # Consider the job done only if it reached a terminal state
                case "$state" in
                    COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
                        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
                        break
                        ;;
                    *)
                        # Treat as a transient failure: reset the counter and continue polling
                        squeue_failures=0
                        ;;
                esac
            else
                # No sacct: assume the job completed after 3 failures
                echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
                break
            fi
        fi
    else
        squeue_failures=0
        # Print a heartbeat if there has been no output for 60 seconds
        if [ $((current_time - last_heartbeat)) -ge 60 ]; then
            echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
            last_heartbeat=$current_time
        fi
    fi

    # Sleep briefly between status checks
    sleep 1
done

# Drain any remaining output from tail after the job completes
echo "Draining remaining output..."
drain_count=0
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
    echo "$line"
    drain_count=$((drain_count + 1))
    # Safety limit to avoid an infinite loop
    if [ $drain_count -ge 10000 ]; then
        echo "Warning: Truncating remaining output after 10000 lines"
        break
    fi
done

# Close the file descriptor and kill tail
exec 3<&-
kill "${tail_pid}" 2>/dev/null || true

# Wait for the output file to stop growing (stabilize) before printing the final output
if [ -f "$output_file" ]; then
    last_size=-1
    same_count=0
    while true; do
        size=$(stat -c%s "$output_file" 2>/dev/null || echo -1)
        if [ "$size" -eq "$last_size" ] && [ "$size" -ge 0 ]; then
            same_count=$((same_count + 1))
        else
            same_count=0
            last_size=$size
        fi
        # Two consecutive stable checks (~10s) imply the file has likely been flushed
        if [ $same_count -ge 2 ]; then
            break
        fi
        sleep 5
    done
fi

# Ensure tail is stopped (the EXIT trap also covers this as a safety net)
kill "${tail_pid}" 2>/dev/null || true

echo ""
echo "=== Final output ==="
cat "$output_file"

# Check exit status with sacct fallback
exit_code=""

# Try scontrol first (works for recent jobs)
scontrol_output=$(scontrol show job "$job_id" 2>/dev/null || echo "")
if [ -n "$scontrol_output" ]; then
    exit_code=$(echo "$scontrol_output" | grep -oE 'ExitCode=[0-9]+:[0-9]+' | cut -d= -f2 || echo "")
fi

# If scontrol failed or returned an invalid job, try sacct (for completed/aged-out jobs)
if [ -z "$exit_code" ]; then
    echo "Warning: scontrol failed to get exit code, trying sacct..."
    sacct_output=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 || echo "")
    if [ -n "$sacct_output" ]; then
        exit_code="$sacct_output"
    fi
fi

# If we still can't determine the exit code, fail explicitly
if [ -z "$exit_code" ]; then
    echo "ERROR: Unable to determine exit status for job $job_id"
    echo "Both scontrol and sacct failed to return a valid exit code"
    exit 1
fi

# Check if the job succeeded
if [ "$exit_code" != "0:0" ]; then
    echo "ERROR: Job $job_id failed with exit code $exit_code"
    exit 1
fi

echo "Job $job_id completed successfully"
exit 0
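The heart of the script is the descriptor-3 multiplexing: tail -f feeds fd 3 through a process substitution, and read -r -t polls it without blocking the status loop. A minimal, standalone sketch of that pattern, with a hypothetical log file name and a bounded loop standing in for the open-ended monitoring loop above:

#!/bin/bash
# Sketch only: "app.log" is a hypothetical file, and the 10-iteration loop
# replaces the open-ended status/heartbeat loop used by the real script.
set -euo pipefail

exec 3< <(tail -f app.log)   # tail runs in the background, feeding fd 3
tail_pid=$!                  # bash records the process substitution's PID in $!

for _ in $(seq 1 10); do
    # Drain whatever tail has produced so far; -t 0.1 keeps this non-blocking
    while IFS= read -r -t 0.1 line <&3; do
        echo "log: $line"
    done
    # ... periodic work goes here (status checks, heartbeats, ...) ...
    sleep 1
done

exec 3<&-
kill "$tail_pid" 2>/dev/null || true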
.github/scripts/run_parallel_benchmarks.sh

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
#!/bin/bash
# Run PR and master benchmarks in parallel and verify outputs
# Usage: run_parallel_benchmarks.sh <device> <interface> <cluster>

set -euo pipefail

if [ $# -ne 3 ]; then
    echo "Usage: $0 <device> <interface> <cluster>"
    exit 1
fi

device="$1"
interface="$2"
cluster="$3"

# Get the directory where this script lives (pr/.github/scripts/)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "=========================================="
echo "Starting parallel benchmark jobs..."
echo "=========================================="

# Run both jobs with monitoring using the dedicated script from the PR
# Use stdbuf for line-buffered output and prefix each line for clarity
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
pr_pid=$!
echo "PR job started in background (PID: $pr_pid)"

(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) &
master_pid=$!
echo "Master job started in background (PID: $master_pid)"

echo "Waiting for both jobs to complete..."

# Wait and capture exit codes reliably; the || guards keep set -e from
# aborting before the codes are recorded
pr_exit=0
master_exit=0

wait "$pr_pid" || pr_exit=$?
if [ "$pr_exit" -ne 0 ]; then
    echo "PR job exited with code: $pr_exit"
else
    echo "PR job completed successfully"
fi

wait "$master_pid" || master_exit=$?
if [ "$master_exit" -ne 0 ]; then
    echo "Master job exited with code: $master_exit"
else
    echo "Master job completed successfully"
fi

# Check if either job failed
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
    echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
    exit 1
fi

echo "=========================================="
echo "Both benchmark jobs completed successfully!"
echo "=========================================="

# Final verification that the output files exist before proceeding
pr_yaml="pr/bench-${device}-${interface}.yaml"
master_yaml="master/bench-${device}-${interface}.yaml"

if [ ! -f "$pr_yaml" ]; then
    echo "ERROR: PR benchmark output not found: $pr_yaml"
    ls -la pr/ || true
    echo ""
    echo "Last 100 lines of PR log:"
    tail -n 100 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo "  Could not read PR log"
    exit 1
fi

if [ ! -f "$master_yaml" ]; then
    echo "ERROR: Master benchmark output not found: $master_yaml"
    ls -la master/ || true
    echo ""
    echo "Last 100 lines of master log:"
    tail -n 100 "master/bench-${device}-${interface}.out" 2>/dev/null || echo "  Could not read master log"
    exit 1
fi

echo "Verified both YAML files exist:"
echo "  - $pr_yaml"
echo "  - $master_yaml"
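The workflow step further below invokes this wrapper with the matrix values. A hand-run example with assumed arguments (the device and interface names here are illustrative and not taken from this diff; phoenix is one of the clusters whose submit script appears below):

# Assumed example values; the workflow supplies matrix.device,
# matrix.interface, and matrix.cluster.
bash pr/.github/scripts/run_parallel_benchmarks.sh gpu mpi phoenix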
.github/scripts/submit_and_monitor_bench.sh

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
#!/bin/bash
# Submit and monitor a benchmark job on a SLURM cluster
# Usage: submit_and_monitor_bench.sh <dir> <device> <interface> <cluster>

set -euo pipefail

if [ $# -ne 4 ]; then
    echo "Usage: $0 <dir> <device> <interface> <cluster>"
    exit 1
fi

dir="$1"
device="$2"
interface="$3"
cluster="$4"

# Get the directory where this script lives
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
cd "$dir"

# Submit the job (|| true so a failed submission is reported below
# instead of aborting silently under set -e)
submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
    .github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1 || true)

job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
job_slug="bench-$device-$interface"
output_file="${job_slug}.out"

if [ -z "$job_id" ]; then
    echo "[$dir] ERROR: Failed to submit job"
    echo "$submit_output"
    exit 1
fi

echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"

# Use the monitoring script from the PR (where this script lives)
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"

echo "[$dir] Monitoring complete for job $job_id"

# Verify the YAML output file was created
yaml_file="${job_slug}.yaml"
if [ ! -f "$yaml_file" ]; then
    echo "[$dir] ERROR: Expected output file not found: $yaml_file"
    echo "[$dir] Directory contents:"
    ls -la *.yaml 2>/dev/null || echo "  No YAML files found"
    echo ""
    echo "[$dir] Last 100 lines of job output ($output_file):"
    echo "----------------------------------------"
    tail -n 100 "$output_file" 2>/dev/null || echo "  Could not read output file"
    echo "----------------------------------------"
    exit 1
fi

echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)"
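The job ID is pulled out of sbatch's standard submission message by the sed expression above. A small illustration with a made-up job number:

# Hypothetical sbatch output, for illustration only.
submit_output="Submitted batch job 123456"
job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
echo "$job_id"   # prints 123456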

.github/workflows/bench.yml

Lines changed: 3 additions & 6 deletions
@@ -23,7 +23,7 @@ jobs:
 filters: ".github/file-filter.yml"
 
 self:
-name: "${{ matrix.name }} (${{ matrix.device }})"
+name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
 if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }}
 needs: file-changes
 strategy:
@@ -73,7 +73,7 @@
 runs-on:
 group: ${{ matrix.group }}
 labels: ${{ matrix.labels }}
-timeout-minutes: 1400
+timeout-minutes: 480
 env:
 ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
@@ -98,10 +98,7 @@
 wait %1 && wait %2
 
 - name: Bench (Master v. PR)
-run: |
-(cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
-(cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
-wait %1 && wait %2
+run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
 
 - name: Generate & Post Comment
 run: |
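For illustration, assuming matrix values that are not part of this diff, the new job-name expression renders as follows:

# name: Benchmark, device: gpu, interface: mpi   ->  "Benchmark (gpu-mpi)"
# name: Benchmark, device: gpu, interface: none  ->  "Benchmark (gpu)"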

.github/workflows/frontier/submit-bench.sh

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ $sbatch_device_opts
 #SBATCH -t 02:59:00 # Duration of the job (Ex: 15 mins)
 #SBATCH -o$job_slug.out # Combined output and error messages file
 #SBATCH -p extended # Extended partition for shorter queues
-#SBATCH -W # Do not exit until the submitted job terminates.
 
 set -e
 set -x

.github/workflows/phoenix/submit-bench.sh

Lines changed: 1 addition & 2 deletions
@@ -42,10 +42,9 @@ sbatch <<EOT
 #SBATCH --account=gts-sbryngelson3 # charge account
 #SBATCH -N1 # Number of nodes required
 $sbatch_device_opts
-#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
+#SBATCH -t 04:00:00 # Duration of the job (Ex: 15 mins)
 #SBATCH -q embers # QOS Name
 #SBATCH -o$job_slug.out # Combined output and error messages file
-#SBATCH -W # Do not exit until the submitted job terminates.
 
 set -e
 set -x
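Dropping #SBATCH -W from both submit scripts means sbatch no longer blocks until the job finishes: it returns as soon as the job is queued, and monitor_slurm_job.sh takes over polling squeue/sacct. A minimal sketch of that handoff, with hypothetical job-script and output-file names (the real scripts parse the "Submitted batch job N" message rather than using --parsable):

# Hypothetical names, for illustration only.
job_id=$(sbatch --parsable my_bench_job.sh)
bash .github/scripts/monitor_slurm_job.sh "$job_id" "bench-gpu-mpi.out"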
