Skip to content

Commit 4cab59a

Browse files
fix: fix incorrect error messages (#483)
<!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Enhanced error messages for command execution failures, providing clearer context by including the specific command that failed. - **Bug Fixes** - Improved error reporting in multiple command handling methods, aiding in quicker identification of issues during command execution. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent c5ffb08 commit 4cab59a

File tree

6 files changed

+41
-41
lines changed

6 files changed

+41
-41
lines changed

dpdispatcher/machines/JH_UniScheduler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def check_status(self, job):
105105
elif ret != 0:
106106
# just retry when any unknown error raised.
107107
raise RetrySignal(
108-
"Get error code %d in checking status through ssh with job: %s . message: %s"
108+
"Get error code %d in checking status with job: %s . message: %s"
109109
% (ret, job.job_hash, err_str)
110110
)
111111
status_out = stdout.read().decode("utf-8").split("\n")

dpdispatcher/machines/distributed_shell.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ def do_submit(self, job):
181181
if ret != 0:
182182
err_str = stderr.decode("utf-8")
183183
raise RuntimeError(
184-
"Command squeue fails to execute, error message:%s\nreturn code %d\n"
185-
% (err_str, ret)
184+
"Command %s fails to execute, error message:%s\nreturn code %d\n"
185+
% (cmd, err_str, ret)
186186
)
187187
job_id = int(stdout.decode("utf-8").strip())
188188

dpdispatcher/machines/lsf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def check_status(self, job):
129129
elif ret != 0:
130130
# just retry when any unknown error raised.
131131
raise RetrySignal(
132-
"Get error code %d in checking status through ssh with job: %s . message: %s"
132+
"Get error code %d in checking status with job: %s . message: %s"
133133
% (ret, job.job_hash, err_str)
134134
)
135135
status_out = stdout.read().decode("utf-8").split("\n")

dpdispatcher/machines/pbs.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ def check_status(self, job):
7676
job_id = job.job_id
7777
if job_id == "":
7878
return JobStatus.unsubmitted
79-
ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
79+
command = "qstat -x " + job_id
80+
ret, stdin, stdout, stderr = self.context.block_call(command)
8081
err_str = stderr.read().decode("utf-8")
8182
if ret != 0:
8283
if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -86,8 +87,8 @@ def check_status(self, job):
8687
return JobStatus.terminated
8788
else:
8889
raise RuntimeError(
89-
"status command qstat fails to execute. erro info: %s return code %d"
90-
% (err_str, ret)
90+
"status command %s fails to execute. erro info: %s return code %d"
91+
% (command, err_str, ret)
9192
)
9293
status_line = stdout.read().decode("utf-8").split("\n")[-2]
9394
status_word = status_line.split()[-2]
@@ -126,7 +127,8 @@ def check_status(self, job):
126127
job_id = job.job_id
127128
if job_id == "":
128129
return JobStatus.unsubmitted
129-
ret, stdin, stdout, stderr = self.context.block_call("qstat -l " + job_id)
130+
command = "qstat -l " + job_id
131+
ret, stdin, stdout, stderr = self.context.block_call(command)
130132
err_str = stderr.read().decode("utf-8")
131133
if ret != 0:
132134
if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -136,8 +138,8 @@ def check_status(self, job):
136138
return JobStatus.terminated
137139
else:
138140
raise RuntimeError(
139-
"status command qstat fails to execute. erro info: %s return code %d"
140-
% (err_str, ret)
141+
"status command %s fails to execute. erro info: %s return code %d"
142+
% (command, err_str, ret)
141143
)
142144
status_line = stdout.read().decode("utf-8").split("\n")[-2]
143145
status_word = status_line.split()[-2]
@@ -263,11 +265,12 @@ def check_status(self, job):
263265
status_line = None
264266
if job_id == "":
265267
return JobStatus.unsubmitted
266-
ret, stdin, stdout, stderr = self.context.block_call("qstat")
268+
command = "qstat"
269+
ret, stdin, stdout, stderr = self.context.block_call(command)
267270
err_str = stderr.read().decode("utf-8")
268271
if ret != 0:
269272
raise RuntimeError(
270-
f"status command qstat fails to execute. erro info: {err_str} return code {ret}"
273+
f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
271274
)
272275
status_text_list = stdout.read().decode("utf-8").split("\n")
273276
for txt in status_text_list:

dpdispatcher/machines/shell.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,13 @@ def do_submit(self, job):
3838
script_run_str = self.gen_script_command(job)
3939
script_run_file_name = f"{job.script_file_name}.run"
4040
self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
41-
ret, stdin, stdout, stderr = self.context.block_call(
42-
f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
43-
)
41+
cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
42+
ret, stdin, stdout, stderr = self.context.block_call(cmd)
4443
if ret != 0:
4544
err_str = stderr.read().decode("utf-8")
4645
raise RuntimeError(
47-
"status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
48-
% (err_str, ret)
46+
"status command %s fails to execute\nerror message:%s\nreturn code %d\n"
47+
% (cmd, err_str, ret)
4948
)
5049
job_id = int(stdout.read().decode("utf-8").strip())
5150
self.context.write_file(job_id_name, str(job_id))
@@ -73,15 +72,16 @@ def check_status(self, job):
7372
return JobStatus.unsubmitted
7473

7574
# mark defunct process as terminated
76-
ret, stdin, stdout, stderr = self.context.block_call(
75+
cmd = (
7776
r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
7877
f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
7978
)
79+
ret, stdin, stdout, stderr = self.context.block_call(cmd)
8080
if ret != 0:
8181
err_str = stderr.read().decode("utf-8")
8282
raise RuntimeError(
83-
"status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
84-
% (err_str, ret)
83+
"status command %s fails to execute\nerror message:%s\nreturn code %d\n"
84+
% (cmd, err_str, ret)
8585
)
8686

8787
if_job_exists = bool(stdout.read().decode("utf-8").strip())

dpdispatcher/machines/slurm.py

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,12 @@ def do_submit(self, job):
8383
script_run_file_name = f"{job.script_file_name}.run"
8484
self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
8585
# self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
86-
ret, stdin, stdout, stderr = self.context.block_call(
87-
"cd {} && {} {}".format(
88-
shlex.quote(self.context.remote_root),
89-
"sbatch",
90-
shlex.quote(script_file_name),
91-
)
86+
command = "cd {} && {} {}".format(
87+
shlex.quote(self.context.remote_root),
88+
"sbatch",
89+
shlex.quote(script_file_name),
9290
)
91+
ret, stdin, stdout, stderr = self.context.block_call(command)
9392
if ret != 0:
9493
err_str = stderr.read().decode("utf-8")
9594
if (
@@ -98,7 +97,7 @@ def do_submit(self, job):
9897
):
9998
# server network error, retry 3 times
10099
raise RetrySignal(
101-
"Get error code %d in submitting through ssh with job: %s . message: %s"
100+
"Get error code %d in submitting with job: %s . message: %s"
102101
% (ret, job.job_hash, err_str)
103102
)
104103
elif (
@@ -110,8 +109,8 @@ def do_submit(self, job):
110109
# job number exceeds, skip the submitting
111110
return ""
112111
raise RuntimeError(
113-
"status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
114-
% (err_str, ret)
112+
"command %s fails to execute\nerror message:%s\nreturn code %d\n"
113+
% (command, err_str, ret)
115114
)
116115
subret = stdout.readlines()
117116
# --parsable
@@ -129,9 +128,8 @@ def check_status(self, job):
129128
job_id = job.job_id
130129
if job_id == "":
131130
return JobStatus.unsubmitted
132-
ret, stdin, stdout, stderr = self.context.block_call(
133-
'squeue -o "%.18i %.2t" -j ' + job_id
134-
)
131+
command = 'squeue -o "%.18i %.2t" -j ' + job_id
132+
ret, stdin, stdout, stderr = self.context.block_call(command)
135133
if ret != 0:
136134
err_str = stderr.read().decode("utf-8")
137135
if "Invalid job id specified" in err_str:
@@ -147,13 +145,13 @@ def check_status(self, job):
147145
):
148146
# retry 3 times
149147
raise RetrySignal(
150-
"Get error code %d in checking status through ssh with job: %s . message: %s"
148+
"Get error code %d in checking status with job: %s . message: %s"
151149
% (ret, job.job_hash, err_str)
152150
)
153151
raise RuntimeError(
154-
"status command squeue fails to execute."
152+
"status command %s fails to execute."
155153
"job_id:%s \n error message:%s\n return code %d\n"
156-
% (job_id, err_str, ret)
154+
% (command, job_id, err_str, ret)
157155
)
158156
status_line = stdout.read().decode("utf-8").split("\n")[-2]
159157
status_word = status_line.split()[-1]
@@ -319,9 +317,8 @@ def check_status(self, job):
319317
job_id = job.job_id
320318
if job_id == "":
321319
return JobStatus.unsubmitted
322-
ret, stdin, stdout, stderr = self.context.block_call(
323-
'squeue -h -o "%.18i %.2t" -j ' + job_id
324-
)
320+
command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
321+
ret, stdin, stdout, stderr = self.context.block_call(command)
325322
if ret != 0:
326323
err_str = stderr.read().decode("utf-8")
327324
if "Invalid job id specified" in err_str:
@@ -336,13 +333,13 @@ def check_status(self, job):
336333
):
337334
# retry 3 times
338335
raise RetrySignal(
339-
"Get error code %d in checking status through ssh with job: %s . message: %s"
336+
"Get error code %d in checking status with job: %s . message: %s"
340337
% (ret, job.job_hash, err_str)
341338
)
342339
raise RuntimeError(
343-
"status command squeue fails to execute."
340+
"status command %s fails to execute."
344341
"job_id:%s \n error message:%s\n return code %d\n"
345-
% (job_id, err_str, ret)
342+
% (command, job_id, err_str, ret)
346343
)
347344
status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
348345
status = []

0 commit comments

Comments
 (0)