
Commit fe6826b

KSGulin and bfineran authored
[Cherry-Pick] Fixes #858 and #863 (#865)
* Fix transformers batch size (#858)
* Update: remove deepsparse requirement to run yolov5
* Fix: set num_devices to 1 if no gpus
* Update: nit
* unwrap checkpoint_path on checkpoint recipe load in IC training (#863)
* unwrap checkpoint_path on checkpoint recipe load in IC training
* import zoo

Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>
1 parent e957f81 commit fe6826b

File tree

2 files changed (+7 lines, -1 line)

src/sparseml/pytorch/image_classification/utils/trainer.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -33,6 +33,7 @@
     default_device,
     is_parallel_model,
 )
+from sparsezoo import Zoo
 
 
 _LOGGER = logging.getLogger(__file__)
@@ -327,6 +328,10 @@ def _run_train_epoch(
         )
 
     def _setup_checkpoint_manager(self):
+        if self.checkpoint_path and self.checkpoint_path.startswith("zoo"):
+            self.checkpoint_path = Zoo.load_model_from_stub(
+                self.checkpoint_path
+            ).download_framework_files(extensions=[".pth"])[0]
         checkpoint_state = torch.load(self.checkpoint_path)
         checkpoint_manager = None
         checkpoint_recipe = checkpoint_state.get("recipe")
```
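For context, the new branch resolves a SparseZoo stub (a checkpoint_path beginning with "zoo") to a local .pth file before torch.load reads it. Below is a minimal, self-contained sketch of that pattern, assuming only the sparsezoo calls visible in the hunk; the helper name is hypothetical.

```python
# Sketch of the stub-unwrapping step from the hunk above. Only the
# sparsezoo calls shown in the diff are used; resolve_checkpoint_path
# itself is a hypothetical helper, not part of the sparseml codebase.
from sparsezoo import Zoo


def resolve_checkpoint_path(checkpoint_path: str) -> str:
    """Return a local .pth path, downloading the model for zoo stubs."""
    if checkpoint_path and checkpoint_path.startswith("zoo"):
        # Resolve the stub to a zoo model, download its framework files,
        # and keep the first .pth checkpoint that comes back.
        checkpoint_path = Zoo.load_model_from_stub(
            checkpoint_path
        ).download_framework_files(extensions=[".pth"])[0]
    return checkpoint_path
```

Whatever path comes back can then be handed to torch.load, which is exactly what the surrounding _setup_checkpoint_manager code does next.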

src/sparseml/transformers/sparsification/trainer.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -254,9 +254,10 @@ def create_optimizer(self):
             if torch.distributed.is_initialized()
             else self.args._n_gpu
         )
+        n_device = n_gpu if n_gpu > 0 else 1
         total_batch_size = (
             self.args.per_device_train_batch_size
-            * n_gpu
+            * n_device
             * self.args.gradient_accumulation_steps
         )
         self.manager_steps_per_epoch = math.ceil(
```
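This one-line fix matters because self.args._n_gpu is 0 on a CPU-only run, so the old product zeroed out total_batch_size and would break the manager_steps_per_epoch calculation that presumably divides by it just below. A minimal sketch of the corrected arithmetic, with hypothetical standalone parameters mirroring the TrainingArguments fields in the hunk:

```python
# Sketch of the corrected effective-batch-size arithmetic. The function
# and its parameters are hypothetical stand-ins for the trainer fields
# used in the hunk (self.args.per_device_train_batch_size, n_gpu,
# self.args.gradient_accumulation_steps).
def total_batch_size(
    per_device_train_batch_size: int,
    n_gpu: int,
    gradient_accumulation_steps: int,
) -> int:
    # CPU-only runs report n_gpu == 0; count one device instead so the
    # product cannot collapse to zero.
    n_device = n_gpu if n_gpu > 0 else 1
    return per_device_train_batch_size * n_device * gradient_accumulation_steps


# A 2-GPU run is unchanged: 8 * 2 * 4 == 64.
assert total_batch_size(8, 2, 4) == 64
# A CPU-only run now yields 8 * 1 * 4 == 32 rather than 0.
assert total_batch_size(8, 0, 4) == 32
```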
