|
1 | 1 |
|
2 | 2 | # %% |
3 | 3 | import time,random,uuid,json,copy |
4 | | - |
5 | 4 | from dargs.dargs import Argument, Variant |
6 | 5 | from dpdispatcher.JobStatus import JobStatus |
7 | 6 | from dpdispatcher import dlog |
|
10 | 9 | from dpdispatcher.machine import Machine |
11 | 10 | # from dpdispatcher.slurm import SlurmResources |
12 | 11 | #%% |
13 | | -default_strategy = dict(if_cuda_multi_devices=False) |
| 12 | +default_strategy = dict(if_cuda_multi_devices=False, ratio_unfinished=0.0) |
14 | 13 |
|
15 | 14 | class Submission(object): |
16 | 15 | """A submission represents a collection of tasks. |
@@ -181,11 +180,16 @@ def run_submission(self, *, exit_on_submit=False, clean=True): |
181 | 180 | self.check_all_finished() |
182 | 181 | self.handle_unexpected_submission_state() |
183 | 182 |
|
| 183 | + ratio_unfinished = self.resources.strategy['ratio_unfinished'] |
184 | 184 | while not self.check_all_finished(): |
185 | 185 | if exit_on_submit is True: |
186 | 186 | dlog.info(f"submission succeeded: {self.submission_hash}") |
187 | 187 | dlog.info(f"at {self.machine.context.remote_root}") |
188 | 188 | return self.serialize() |
| 189 | + if ratio_unfinished > 0.0 and self.check_ratio_unfinished(ratio_unfinished): |
| 190 | + self.remove_unfinished_jobs() |
| 191 | + break |
| 192 | + |
189 | 193 | try: |
190 | 194 | time.sleep(30) |
191 | 195 | except (Exception, KeyboardInterrupt, SystemExit) as e: |
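A minimal sketch of opting into this early exit from the caller's side. Only the `strategy` key comes from this patch; the other `Resources` fields and the exact construction are assumptions for illustration:

from dpdispatcher.submission import Resources

# Hypothetical resources: allow up to 20% of the jobs to stay unfinished.
resources = Resources.deserialize({
    'number_node': 1, 'cpu_per_node': 4, 'gpu_per_node': 0,
    'queue_name': 'cpu', 'group_size': 5,
    'strategy': {'ratio_unfinished': 0.2},
})
# With this setting, run_submission() returns once at least 80% of the
# jobs have finished; the rest are killed and their tasks cleaned up.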
@@ -252,6 +256,30 @@ def handle_unexpected_submission_state(self): |
252 | 256 |
|
253 | 257 | # def update_submi |
254 | 258 |
|
| 259 | + def check_ratio_unfinished(self, ratio_unfinished): |
| 260 | + status_list = [job.job_state for job in self.belonging_jobs] |
| 261 | + finished_num = status_list.count(JobStatus.finished) |
| 262 | + # The submission counts as sufficiently finished once the
| 263 | + # unfinished fraction has dropped to ratio_unfinished or below.
| 264 | + finished_ratio = finished_num / len(self.belonging_jobs)
| 265 | + return finished_ratio >= (1 - ratio_unfinished)
| 266 | + |
| 267 | + def remove_unfinished_jobs(self):
| 268 | + import os, shutil
| 269 | + removed_jobs = [job for job in self.belonging_jobs if job.job_state != JobStatus.finished]
| 270 | + self.belonging_jobs = [job for job in self.belonging_jobs if job.job_state == JobStatus.finished]
| 271 | + for job in removed_jobs:
| 272 | + # kill the unfinished job on the remote machine
| 273 | + try:
| 274 | + self.machine.context.kill(job.job_id)
| 275 | + except Exception as e:
| 276 | + dlog.info("Cannot kill job %s: %s" % (job.job_id, e))
| 277 | +
| 278 | + # remove the local work directories of its tasks
| 279 | + for task in job.job_task_list:
| 280 | + shutil.rmtree(os.path.join(self.machine.context.local_root, task.task_work_path), ignore_errors=True)
| 281 | + self.belonging_tasks = [task for task in self.belonging_tasks if task not in job.job_task_list]
| 282 | + |
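For intuition, the threshold in check_ratio_unfinished reduces to a single comparison; a standalone example with made-up counts:

# 10 jobs, ratio_unfinished = 0.2: polling may stop once 8 or more
# jobs are finished, because 8/10 >= 1 - 0.2.
finished_num, total = 8, 10
ratio_unfinished = 0.2
print(finished_num / total >= 1 - ratio_unfinished)  # True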
255 | 283 | def check_all_finished(self): |
256 | 284 | """check whether all the jobs in the submission. |
257 | 285 |
|
@@ -559,11 +587,12 @@ def handle_unexpected_job_state(self): |
559 | 587 | if self.fail_count > 0 and self.fail_count % 3 == 0:
560 | 588 | raise RuntimeError(f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times. job_detail:{self}")
561 | 589 | self.submit_job() |
562 | | - dlog.info("job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id)) |
563 | | - time.sleep(0.2) |
564 | | - self.get_job_state() |
565 | | - dlog.info(f"job:{self.job_hash} job_id:{self.job_id} after re-submitting; the state now is {repr(self.job_state)}") |
566 | | - self.handle_unexpected_job_state() |
| 590 | + if self.job_state != JobStatus.unsubmitted: |
| 591 | + dlog.info("job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id)) |
| 592 | + time.sleep(0.2) |
| 593 | + self.get_job_state() |
| 594 | + dlog.info(f"job:{self.job_hash} job_id:{self.job_id} after re-submitting; the state now is {repr(self.job_state)}") |
| 595 | + self.handle_unexpected_job_state() |
567 | 596 |
|
568 | 597 | if job_state == JobStatus.unsubmitted: |
569 | 598 | dlog.debug(f"job: {self.job_hash} unsubmitted; submit it") |
@@ -610,8 +639,8 @@ def register_job_id(self, job_id): |
610 | 639 |
|
611 | 640 | def submit_job(self): |
612 | 641 | job_id = self.machine.do_submit(self) |
| 642 | + self.register_job_id(job_id) |
613 | 643 | if job_id: |
614 | | - self.register_job_id(job_id) |
615 | 644 | self.job_state = JobStatus.waiting |
616 | 645 | else: |
617 | 646 | self.job_state = JobStatus.unsubmitted |
@@ -644,6 +673,8 @@ class Resources(object): |
644 | 673 | Whether to assign tasks to different GPUs when there are multiple NVIDIA GPUs on the node.
645 | 674 | If true, dpdispatcher will manually export the environment variable CUDA_VISIBLE_DEVICES to a different value for each task.
646 | 675 | Usually, this option is used together with the Task.task_need_resources variable.
| 676 | + ratio_unfinished : float |
| 677 | + The ratio of `jobs` that are allowed to remain unfinished; once the finished fraction reaches 1 - ratio_unfinished, the remaining jobs are killed and their task directories removed.
647 | 678 | para_deg : int |
648 | 679 | Decide how many tasks will be run in parallel. |
649 | 680 | Usually run with `strategy['if_cuda_multi_devices']` |
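A rough sketch of the per-task GPU assignment that the if_cuda_multi_devices docstring describes; this is illustrative round-robin logic, not dpdispatcher's actual implementation:

gpu_per_node = 2
for task_index in range(4):
    # assign tasks to devices round-robin, one CUDA_VISIBLE_DEVICES per task
    device_id = task_index % gpu_per_node
    print(f"export CUDA_VISIBLE_DEVICES={device_id}")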
@@ -695,12 +726,17 @@ def __init__(self, |
695 | 726 | # if self.gpu_per_node > 1: |
696 | 727 | # self.in_para_task_num = 0 |
697 | 728 |
|
| 729 | + # Fill in defaults for any strategy keys the user
| 730 | + # did not set explicitly.
| 731 | + for key, value in default_strategy.items():
| 732 | + self.strategy.setdefault(key, value)
698 | 733 | if self.strategy['if_cuda_multi_devices'] is True: |
699 | 734 | if gpu_per_node < 1: |
700 | 735 | raise RuntimeError("gpu_per_node can not be smaller than 1 when if_cuda_multi_devices is True") |
701 | 736 | if number_node != 1: |
702 | 737 | raise RuntimeError("number_node must be 1 when if_cuda_multi_devices is True") |
703 | | - |
| 738 | + if self.strategy['ratio_unfinished'] >= 1.0: |
| 739 | + raise RuntimeError("ratio_unfinished must be smaller than 1.0") |
704 | 740 | def __eq__(self, other): |
705 | 741 | return self.serialize() == other.serialize() |
706 | 742 |
|
@@ -731,7 +767,6 @@ def deserialize(cls, resources_dict): |
731 | 767 | gpu_per_node=resources_dict.get('gpu_per_node', 0), |
732 | 768 | queue_name=resources_dict.get('queue_name', ''), |
733 | 769 | group_size=resources_dict['group_size'], |
734 | | - |
735 | 770 | custom_flags=resources_dict.get('custom_flags', []), |
736 | 771 | strategy=resources_dict.get('strategy', default_strategy), |
737 | 772 | para_deg=resources_dict.get('para_deg', 1), |
@@ -776,7 +811,8 @@ def arginfo(): |
776 | 811 | doc_wait_time = 'The waiting time in seconds after a single `task` is submitted'
777 | 812 |
|
778 | 813 | strategy_args = [ |
779 | | - Argument("if_cuda_multi_devices", bool, optional=True, default=True) |
| 814 | + Argument("if_cuda_multi_devices", bool, optional=True, default=True), |
| 815 | + Argument("ratio_unfinished", float, optional=True, default=0.0) |
780 | 816 | ] |
781 | 817 | doc_strategy = 'the strategies used to generate job submission scripts.'
782 | 818 | strategy_format = Argument("strategy", dict, strategy_args, optional=True, doc=doc_strategy) |
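As a usage note, the dargs schema above should fill in defaults for omitted strategy keys; a hedged sketch, assuming dargs's Argument.normalize_value fills optional defaults as it does in other dargs-based projects:

from dargs.dargs import Argument

strategy_args = [
    Argument("if_cuda_multi_devices", bool, optional=True, default=False),
    Argument("ratio_unfinished", float, optional=True, default=0.0),
]
strategy_format = Argument("strategy", dict, strategy_args, optional=True)
# Expected to fill omitted optional keys with their defaults:
print(strategy_format.normalize_value({'ratio_unfinished': 0.2}))
# -> {'if_cuda_multi_devices': False, 'ratio_unfinished': 0.2}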
|