|
1 | 1 |
|
2 | 2 | # %% |
3 | 3 | import time,random,uuid,json,copy |
4 | | - |
5 | 4 | from dargs.dargs import Argument, Variant |
6 | 5 | from dpdispatcher.JobStatus import JobStatus |
7 | 6 | from dpdispatcher import dlog |
|
10 | 9 | from dpdispatcher.machine import Machine |
11 | 10 | # from dpdispatcher.slurm import SlurmResources |
12 | 11 | #%% |
13 | | -default_strategy = dict(if_cuda_multi_devices=False) |
| 12 | +default_strategy = dict(if_cuda_multi_devices=False, ratio_unfinished=0.0) |
14 | 13 |
|
15 | 14 | class Submission(object): |
16 | 15 | """A submission represents a collection of tasks. |
@@ -181,11 +180,16 @@ def run_submission(self, *, exit_on_submit=False, clean=True): |
181 | 180 | self.check_all_finished() |
182 | 181 | self.handle_unexpected_submission_state() |
183 | 182 |
|
| 183 | + ratio_unfinished = self.resources.strategy['ratio_unfinished'] |
184 | 184 | while not self.check_all_finished(): |
185 | 185 | if exit_on_submit is True: |
186 | 186 | dlog.info(f"submission succeeded: {self.submission_hash}") |
187 | 187 | dlog.info(f"at {self.machine.context.remote_root}") |
188 | 188 | return self.serialize() |
| 189 | + if ratio_unfinished > 0.0 and self.check_ratio_unfinished(ratio_unfinished): |
| 190 | + self.remove_unfinished_jobs() |
| 191 | + break |
| 192 | + |
189 | 193 | try: |
190 | 194 | time.sleep(30) |
191 | 195 | except (Exception, KeyboardInterrupt, SystemExit) as e: |
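A minimal sketch of opting into this early exit from the caller's side. Only the `strategy` key comes from this patch; the other `Resources` fields and the exact construction are assumptions for illustration:

from dpdispatcher.submission import Resources

# Hypothetical resources: allow up to 20% of the jobs to stay unfinished.
resources = Resources.deserialize({
    'number_node': 1, 'cpu_per_node': 4, 'gpu_per_node': 0,
    'queue_name': 'cpu', 'group_size': 5,
    'strategy': {'ratio_unfinished': 0.2},
})
# With this setting, run_submission() returns once at least 80% of the
# jobs have finished; the rest are killed and their tasks cleaned up.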
@@ -252,6 +256,30 @@ def handle_unexpected_submission_state(self): |
252 | 256 |
|
253 | 257 | # def update_submi |
254 | 258 |
|
| 259 | + def check_ratio_unfinished(self, ratio_unfinished): |
| 260 | + status_list = [job.job_state for job in self.belonging_jobs] |
| 261 | + finished_num = status_list.count(JobStatus.finished) |
| 262 | + # The submission counts as sufficiently finished once the
| 263 | + # unfinished fraction has dropped to ratio_unfinished or below.
| 264 | + finished_ratio = finished_num / len(self.belonging_jobs)
| 265 | + return finished_ratio >= (1 - ratio_unfinished)
| 266 | + |
| 267 | + def remove_unfinished_jobs(self):
| 268 | + import os, shutil
| 269 | + removed_jobs = [job for job in self.belonging_jobs if job.job_state != JobStatus.finished]
| 270 | + self.belonging_jobs = [job for job in self.belonging_jobs if job.job_state == JobStatus.finished]
| 271 | + for job in removed_jobs:
| 272 | + # kill the unfinished job on the remote machine
| 273 | + try:
| 274 | + self.machine.context.kill(job.job_id)
| 275 | + except Exception as e:
| 276 | + dlog.info("Cannot kill job %s: %s" % (job.job_id, e))
| 277 | +
| 278 | + # remove the local work directories of its tasks
| 279 | + for task in job.job_task_list:
| 280 | + shutil.rmtree(os.path.join(self.machine.context.local_root, task.task_work_path), ignore_errors=True)
| 281 | + self.belonging_tasks = [task for task in self.belonging_tasks if task not in job.job_task_list]
| 282 | + |
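For intuition, the threshold in check_ratio_unfinished reduces to a single comparison; a standalone example with made-up counts:

# 10 jobs, ratio_unfinished = 0.2: polling may stop once 8 or more
# jobs are finished, because 8/10 >= 1 - 0.2.
finished_num, total = 8, 10
ratio_unfinished = 0.2
print(finished_num / total >= 1 - ratio_unfinished)  # True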
255 | 283 | def check_all_finished(self): |
256 | 284 | """check whether all the jobs in the submission. |
257 | 285 |
|
@@ -559,11 +587,12 @@ def handle_unexpected_job_state(self): |
559 | 587 | if self.fail_count > 0 and self.fail_count % 3 == 0:
560 | 588 | raise RuntimeError(f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times. job_detail:{self}")
561 | 589 | self.submit_job() |
562 | | - dlog.info("job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id)) |
563 | | - time.sleep(0.2) |
564 | | - self.get_job_state() |
565 | | - dlog.info(f"job:{self.job_hash} job_id:{self.job_id} after re-submitting; the state now is {repr(self.job_state)}") |
566 | | - self.handle_unexpected_job_state() |
| 590 | + if self.job_state != JobStatus.unsubmitted: |
| 591 | + dlog.info("job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id)) |
| 592 | + time.sleep(0.2) |
| 593 | + self.get_job_state() |
| 594 | + dlog.info(f"job:{self.job_hash} job_id:{self.job_id} after re-submitting; the state now is {repr(self.job_state)}") |
| 595 | + self.handle_unexpected_job_state() |
567 | 596 |
|
568 | 597 | if job_state == JobStatus.unsubmitted: |
569 | 598 | dlog.debug(f"job: {self.job_hash} unsubmitted; submit it") |
@@ -610,8 +639,8 @@ def register_job_id(self, job_id): |
610 | 639 |
|
611 | 640 | def submit_job(self): |
612 | 641 | job_id = self.machine.do_submit(self) |
| 642 | + self.register_job_id(job_id) |
613 | 643 | if job_id: |
614 | | - self.register_job_id(job_id) |
615 | 644 | self.job_state = JobStatus.waiting |
616 | 645 | else: |
617 | 646 | self.job_state = JobStatus.unsubmitted |
@@ -644,6 +673,8 @@ class Resources(object): |
644 | 673 | Whether to assign tasks to different GPUs when there are multiple NVIDIA GPUs on the node.
645 | 674 | If true, dpdispatcher will manually export the environment variable CUDA_VISIBLE_DEVICES to a different value for each task.
646 | 675 | Usually, this option is used together with the Task.task_need_resources variable.
| 676 | + ratio_unfinished : float |
| 677 | + The ratio of `jobs` that are allowed to remain unfinished; once the finished fraction reaches 1 - ratio_unfinished, the remaining jobs are killed and their task directories removed.
647 | 678 | para_deg : int |
648 | 679 | Decide how many tasks will be run in parallel. |
649 | 680 | Usually run with `strategy['if_cuda_multi_devices']` |
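A rough sketch of the per-task GPU assignment that the if_cuda_multi_devices docstring describes; this is illustrative round-robin logic, not dpdispatcher's actual implementation:

gpu_per_node = 2
for task_index in range(4):
    # assign tasks to devices round-robin, one CUDA_VISIBLE_DEVICES per task
    device_id = task_index % gpu_per_node
    print(f"export CUDA_VISIBLE_DEVICES={device_id}")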
@@ -695,12 +726,17 @@ def __init__(self, |
695 | 726 | # if self.gpu_per_node > 1: |
696 | 727 | # self.in_para_task_num = 0 |
697 | 728 |
|
| 729 | + # Fill in defaults for any strategy keys the user
| 730 | + # did not set explicitly.
| 731 | + for key, value in default_strategy.items():
| 732 | + self.strategy.setdefault(key, value)
698 | 733 | if self.strategy['if_cuda_multi_devices'] is True: |
699 | 734 | if gpu_per_node < 1: |
700 | 735 | raise RuntimeError("gpu_per_node can not be smaller than 1 when if_cuda_multi_devices is True") |
701 | 736 | if number_node != 1: |
702 | 737 | raise RuntimeError("number_node must be 1 when if_cuda_multi_devices is True") |
703 | | - |
| 738 | + if self.strategy['ratio_unfinished'] >= 1.0: |
| 739 | + raise RuntimeError("ratio_unfinished must be smaller than 1.0") |
704 | 740 | def __eq__(self, other): |
705 | 741 | return self.serialize() == other.serialize() |
706 | 742 |
|
@@ -731,7 +767,6 @@ def deserialize(cls, resources_dict): |
731 | 767 | gpu_per_node=resources_dict.get('gpu_per_node', 0), |
732 | 768 | queue_name=resources_dict.get('queue_name', ''), |
733 | 769 | group_size=resources_dict['group_size'], |
734 | | - |
735 | 770 | custom_flags=resources_dict.get('custom_flags', []), |
736 | 771 | strategy=resources_dict.get('strategy', default_strategy), |
737 | 772 | para_deg=resources_dict.get('para_deg', 1), |
@@ -776,7 +811,8 @@ def arginfo(): |
776 | 811 | doc_wait_time = 'The waiting time in seconds after a single `task` is submitted'
777 | 812 |
|
778 | 813 | strategy_args = [ |
779 | | - Argument("if_cuda_multi_devices", bool, optional=True, default=True) |
| 814 | + Argument("if_cuda_multi_devices", bool, optional=True, default=True), |
| 815 | + Argument("ratio_unfinished", float, optional=True, default=0.0) |
780 | 816 | ] |
781 | 817 | doc_strategy = 'the strategies used to generate job submission scripts.'
782 | 818 | strategy_format = Argument("strategy", dict, strategy_args, optional=True, doc=doc_strategy) |
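As a usage note, the dargs schema above should fill in defaults for omitted strategy keys; a hedged sketch, assuming dargs's Argument.normalize_value fills optional defaults as it does in other dargs-based projects:

from dargs.dargs import Argument

strategy_args = [
    Argument("if_cuda_multi_devices", bool, optional=True, default=False),
    Argument("ratio_unfinished", float, optional=True, default=0.0),
]
strategy_format = Argument("strategy", dict, strategy_args, optional=True)
# Expected to fill omitted optional keys with their defaults:
print(strategy_format.normalize_value({'ratio_unfinished': 0.2}))
# -> {'if_cuda_multi_devices': False, 'ratio_unfinished': 0.2}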
|