Skip to content

Commit 4c173f4

Browse files
authored
Merge pull request #131 from njzjz/slurm
add a situation to retry slurm cmd
2 parents e1830f2 + 3e6120c commit 4c173f4

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

dpdispatcher/slurm.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ def do_submit(self, job, retry=0, max_retry=3):
4242
ret, stdin, stdout, stderr = self.context.block_call('cd %s && %s %s' % (self.context.remote_root, 'sbatch', script_file_name))
4343
if ret != 0:
4444
err_str = stderr.read().decode('utf-8')
45-
if "Socket timed out on send/recv operation" in err_str:
45+
if "Socket timed out on send/recv operation" in err_str or "Unable to contact slurm controller" in err_str:
4646
# server network error, retry 3 times
4747
if retry < max_retry:
4848
dlog.warning("Get error code %d in submitting through ssh with job: %s . message: %s" %
@@ -81,7 +81,7 @@ def check_status(self, job, retry=0, max_retry=3):
8181
return JobStatus.finished
8282
else :
8383
return JobStatus.terminated
84-
elif "Socket timed out on send/recv operation" in err_str:
84+
elif "Socket timed out on send/recv operation" in err_str or "Unable to contact slurm controller" in err_str:
8585
# retry 3 times
8686
if retry < max_retry:
8787
dlog.warning("Get error code %d in checking status through ssh with job: %s . message: %s" %

0 commit comments

Comments
 (0)