@@ -156,14 +156,19 @@ def run_submission(self, *, exit_on_submit=False, clean=True):
156156 if not self .belonging_jobs :
157157 self .generate_jobs ()
158158 self .try_recover_from_json ()
159+ self .update_submission_state ()
159160 if self .check_all_finished ():
160161 dlog .info ('info:check_all_finished: True' )
161162 else :
162163 dlog .info ('info:check_all_finished: False' )
163164 self .upload_jobs ()
164165 self .handle_unexpected_submission_state ()
165166 self .submission_to_json ()
166- time .sleep (1 )
167+ time .sleep (1 )
168+ self .update_submission_state ()
169+ self .check_all_finished ()
170+ self .handle_unexpected_submission_state ()
171+
167172 while not self .check_all_finished ():
168173 if exit_on_submit is True :
169174 dlog .info (f"submission succeeded: { self .submission_hash } " )
@@ -179,6 +184,7 @@ def run_submission(self, *, exit_on_submit=False, clean=True):
179184 dlog .debug (self .serialize ())
180185 raise e
181186 else :
187+ self .update_submission_state ()
182188 self .handle_unexpected_submission_state ()
183189 finally :
184190 pass
@@ -189,7 +195,7 @@ def run_submission(self, *, exit_on_submit=False, clean=True):
189195 self .clean_jobs ()
190196 return self .serialize ()
191197
192- def get_submission_state (self ):
198+ def update_submission_state (self ):
193199 """refresh the state of every unfinished job in the submission.
194200
195201 Notes
@@ -201,7 +207,7 @@ def get_submission_state(self):
201207 # a finished job stays finished forever, so skip it
202208 continue
203209 job .get_job_state ()
204- dlog .debug (f"debug:get_submission_state : job: { job .job_hash } , { job .job_id } , { repr ( job .job_state ) } " )
210+ dlog .debug (f"debug:update_submission_state : job: { job .job_hash } , { job .job_id } , { job .job_state } " )
205211 # self.submission_to_json()
206212
207213 def handle_unexpected_submission_state (self ):
@@ -217,9 +223,10 @@ def handle_unexpected_submission_state(self):
217223 self .submission_to_json ()
218224 raise RuntimeError (
219225 f"Meet errors will handle unexpected submission state.\n "
220- f"Debug information: remote_root=={ self .remote_root } .\n "
226+ f"Debug information: remote_root=={ self .machine . context . remote_root } .\n "
221227 f"Debug information: submission_hash=={ self .submission_hash } .\n "
222228 f"Please check the dirs and scripts in remote_root"
229+ f"The job information mentioned above may help"
223230 ) from e
224231
225232 # not used here, submitting job is in handle_unexpected_submission_state.
@@ -231,14 +238,16 @@ def handle_unexpected_submission_state(self):
231238 # job.submit_job()
232239 # self.get_submission_state()
233240
241+ # def update_submi
242+
234243 def check_all_finished (self ):
235244 """check whether all the jobs in the submission are finished.
236245
237246 Notes
238247 -----
239248 This method will not handle unexpected job state in the submission.
240249 """
241- self .get_submission_state ()
250+ # self.update_submission_state ()
242251 if any ( (job .job_state in [JobStatus .terminated , JobStatus .unknown ] ) for job in self .belonging_jobs ):
243252 self .submission_to_json ()
244253 if any ( (job .job_state in [JobStatus .running ,
@@ -294,7 +303,7 @@ def clean_jobs(self):
294303 self .machine .context .clean ()
295304
296305 def submission_to_json (self ):
297- # self.get_submission_state ()
306+ # self.update_submission_state ()
298307 write_str = json .dumps (self .serialize (), indent = 4 , default = str )
299308 submission_file_name = "{submission_hash}.json" .format (submission_hash = self .submission_hash )
300309 self .machine .context .write_file (submission_file_name , write_str = write_str )
@@ -532,11 +541,10 @@ def handle_unexpected_job_state(self):
532541 raise RuntimeError (f"job:{ self .job_hash } { self .job_id } failed { self .fail_count } times.job_detail:{ self } " )
533542 self .submit_job ()
534543 dlog .info ("job:{job_hash} re-submit after terminated; new job_id is {job_id}" .format (job_hash = self .job_hash , job_id = self .job_id ))
544+ time .sleep (0.2 )
535545 self .get_job_state ()
536- dlog .info ("job:{job_hash} job_id:{job_id} after re-submitting; the state now is {job_state}" .format (
537- job_hash = self .job_hash ,
538- job_id = self .job_id ,
539- job_state = JobStatus (self .job_state )))
546+ dlog .info (f"job:{ self .job_hash } job_id:{ self .job_id } after re-submitting; the state now is { repr (self .job_state )} " )
547+ self .handle_unexpected_job_state ()
540548
541549 if job_state == JobStatus .unsubmitted :
542550 dlog .info (f"job: { self .job_hash } unsubmitted; submit it" )
@@ -758,3 +766,5 @@ def arginfo():
758766 resources_format = Argument ("resources" , dict , resources_args )
759767 return resources_format
760768
769+
770+ # %%
0 commit comments