#!/bin/bash
# Monitor a SLURM job and stream its output in real-time
# Usage: monitor_slurm_job.sh <job_id> <output_file>
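#
# Example (the job script name and output path are illustrative):
#   job_id=$(sbatch --parsable my_job.sbatch)
#   ./monitor_slurm_job.sh "$job_id" "slurm-${job_id}.out"
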
set -euo pipefail

# Cleanup handler to prevent orphaned tail processes
cleanup() {
    if [ -n "${tail_pid:-}" ]; then
        kill "${tail_pid}" 2>/dev/null || true
    fi
}
trap cleanup EXIT
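# Note: in bash the EXIT trap also fires when set -e aborts the script,
# so the background tail is cleaned up on errors as well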

if [ $# -ne 2 ]; then
    echo "Usage: $0 <job_id> <output_file>" >&2
    exit 1
fi

job_id="$1"
output_file="$2"

echo "Monitoring SLURM job $job_id"
echo "Output file: $output_file"

# Wait for the output file to appear, with retry logic for transient squeue failures
echo "Waiting for job to start..."
squeue_retries=0
max_squeue_retries=5
while [ ! -f "$output_file" ]; do
    # Check whether the job is still queued/running; test squeue's output
    # rather than its exit status, since exit codes for jobs that are no
    # longer known vary across SLURM versions
    if [ -n "$(squeue -j "$job_id" -h -o %T 2>/dev/null)" ]; then
        squeue_retries=0  # Reset on success
        sleep 5
    else
        squeue_retries=$((squeue_retries + 1))
        if [ "$squeue_retries" -ge "$max_squeue_retries" ]; then
            # The file may have appeared during the backoff sleeps; re-check
            # before concluding the job vanished without producing output
            if [ ! -f "$output_file" ]; then
                echo "ERROR: Job $job_id not in queue and output file not created" >&2
                exit 1
            fi
            break
        fi
        # Exponential backoff between retries
        sleep_time=$((2 ** squeue_retries))
        echo "Warning: squeue check failed, retrying in ${sleep_time}s..." >&2
        sleep "$sleep_time"
    fi
done

echo "=== Streaming output for job $job_id ==="

# Start tail with its output attached to file descriptor 3 for multiplexing:
# this lets us stream tail's output while also printing heartbeat messages
exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
tail_pid=$!
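# The PID capture above relies on bash setting $! to the PID of the most
# recent process substitution, so tail_pid names the subshell running tail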

# Monitor job status and stream output simultaneously
squeue_failures=0
last_heartbeat=$(date +%s)

while true; do
    # Read any buffered tail output (non-blocking via read's timeout);
    # read multiple lines when available to avoid falling behind
    lines_read=0
    while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
        echo "$line"
        lines_read=$((lines_read + 1))
        last_heartbeat=$(date +%s)
        # Limit burst reads to avoid starving the status check
        if [ "$lines_read" -ge 100 ]; then
            break
        fi
    done
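    # Note: if read times out mid-line it returns nonzero and the partial text
    # is not echoed; stdbuf keeps tail line-buffered, which makes this unlikely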

    # Check job status (again via squeue's output rather than its exit code)
    current_time=$(date +%s)
    if [ -z "$(squeue -j "$job_id" -h -o %T 2>/dev/null)" ]; then
        squeue_failures=$((squeue_failures + 1))
        # After several misses, check whether the job actually finished
        # using sacct (if available)
        if [ "$squeue_failures" -ge 3 ]; then
            if command -v sacct >/dev/null 2>&1; then
                state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
                # Consider the job done only if it reached a terminal state;
                # CANCELLED* also matches sacct's truncated "CANCELLED+" form
                case "$state" in
                    COMPLETED|FAILED|CANCELLED*|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL)
                        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
                        break
                        ;;
                    *)
                        # Treat as a transient failure; reset and keep polling
                        squeue_failures=0
                        ;;
                esac
            else
                # No sacct: assume the job completed after 3 missed checks
                echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
                break
            fi
        fi
    else
        squeue_failures=0
        # Print a heartbeat if there has been no output for 60 seconds
        if [ $((current_time - last_heartbeat)) -ge 60 ]; then
            echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
            last_heartbeat=$current_time
        fi
    fi

    # Sleep briefly between status checks
    sleep 1
done

# Drain any remaining output from tail after the job completes
echo "Draining remaining output..."
drain_count=0
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
    echo "$line"
    drain_count=$((drain_count + 1))
    # Safety limit to avoid an unbounded loop
    if [ "$drain_count" -ge 10000 ]; then
        echo "Warning: Truncating remaining output after 10000 lines"
        break
    fi
done

# Close the multiplexing descriptor and stop the tail process
exec 3<&-
kill "${tail_pid}" 2>/dev/null || true

# SLURM may still be flushing buffered output after the job leaves the queue,
# so wait for the output file to stop growing before printing the final copy
if [ -f "$output_file" ]; then
    last_size=-1
    same_count=0
    while true; do
        size=$(stat -c%s "$output_file" 2>/dev/null || echo -1)  # GNU stat; BSD would need stat -f%z
        if [ "$size" -eq "$last_size" ] && [ "$size" -ge 0 ]; then
            same_count=$((same_count + 1))
        else
            same_count=0
            last_size=$size
        fi
        # Two consecutive stable checks (~10s) suggest the file has been flushed
        if [ "$same_count" -ge 2 ]; then
            break
        fi
        sleep 5
    done
fi

echo ""
echo "=== Final output ==="
cat "$output_file"

# Determine the job's exit status, with an sacct fallback
exit_code=""

# Try scontrol first (works for recently finished jobs)
scontrol_output=$(scontrol show job "$job_id" 2>/dev/null || echo "")
if [ -n "$scontrol_output" ]; then
    exit_code=$(echo "$scontrol_output" | grep -oE 'ExitCode=[0-9]+:[0-9]+' | cut -d= -f2 || echo "")
fi

# If scontrol failed or returned no ExitCode, try sacct (for completed or aged-out jobs)
if [ -z "$exit_code" ]; then
    echo "Warning: scontrol failed to get exit code, trying sacct..." >&2
    sacct_output=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 || echo "")
    if [ -n "$sacct_output" ]; then
        exit_code="$sacct_output"
    fi
fi

# If we still can't determine the exit code, fail explicitly
if [ -z "$exit_code" ]; then
    echo "ERROR: Unable to determine exit status for job $job_id" >&2
    echo "Both scontrol and sacct failed to return a valid exit code" >&2
    exit 1
fi

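# SLURM reports ExitCode as "<exit status>:<signal>", e.g. "0:0" for a clean
# exit and "0:9" for a job killed by signal 9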
# Check if the job succeeded
if [ "$exit_code" != "0:0" ]; then
    echo "ERROR: Job $job_id failed with exit code $exit_code" >&2
    exit 1
fi

echo "Job $job_id completed successfully"
exit 0