
Commit 28ec07f

[Cherry Pick] M-FAC fixes (#586)
* Fix mfac gradcheck (#573)
* Fix: proper handling of dictionary type num_grads for MFAC (#579)
* M-FAC Indexing fix (#583)
* Update: docs to use correct mfac options format (#584)
1 parent e18b8d3 commit 28ec07f

6 files changed, +45 -39 lines changed

research/mfac/README.md

Lines changed: 6 additions & 12 deletions
@@ -31,9 +31,9 @@ techniques on a variety of one-shot and gradual pruning tasks.
 SparseML makes it easy to use the M-FAC pruning algorithm as part of sparsification
 recipes to improve pruning recovery by providing an `MFACPruningModifier`.
 The `MFACPruningModifier` contains the same settings as the magnitude
-pruning modifiers and contains extra settings for the M-FAC algorithm under the
-`mfac_options` parameter. `mfac_options` should be provided as a YAML dictionary and
-details of the main options are provided below.
+pruning modifiers and contains extra settings for the M-FAC algorithm including
+`num_grads`, `fisher_block_size`, and `available_gpus`. Ideal values will depend
+on the system available to run on and model to be pruned.

 ### Example M-FAC Recipe
 The following is an example `MFACPruningModifier` to be used in place of other
@@ -48,17 +48,11 @@ pruning_modifiers:
 start_epoch: 1.0
 end_epoch: 61.0
 update_frequency: 4.0
-mfac_options:
-num_grads: {0.0: 256, 0.5: 512, 0.75: 1024, 0.83: 1400}
-fisher_block_size: 10000
-available_gpus: ["cuda:0"]
+num_grads: {0.0: 256, 0.5: 512, 0.75: 1024, 0.83: 1400}
+fisher_block_size: 10000
+available_gpus: ["cuda:0"]
 ```

-### mfac_options Parameters
-The following parameters can be specified under the `mfac_options` parameter to control
-how the M-FAC calculations are made. Ideal values will depend on the system
-available to run on and model to be pruned.
-
 #### num_grads
 To approximate the second order information in the M-FAC algorithm, first order
 gradients are used. `num_grads` specifies the number of recent gradient samples to store
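
One of the cherry-picked fixes (#579) makes `num_grads` work when given as a dictionary keyed by sparsity level, as in the recipe above. The sketch below only illustrates how such a mapping can be resolved to a concrete gradient count; the function name `resolve_num_grads` and the threshold rule are assumptions, while the actual lookup in SparseML is `_get_num_grads_for_sparsity` in `modifier_pruning_mfac.py`.

```python
# Illustrative sketch only -- not the SparseML implementation.
# Assumption: each key is a sparsity threshold, and the value for the largest
# threshold not exceeding the current sparsity is the number of gradients to use.
from typing import Dict, Union


def resolve_num_grads(num_grads: Union[int, Dict[float, int]], sparsity: float) -> int:
    """Resolve an int or sparsity-keyed dict num_grads to a sample count."""
    if isinstance(num_grads, int):
        return num_grads
    reached = [threshold for threshold in num_grads if threshold <= sparsity]
    key = max(reached) if reached else min(num_grads)
    return num_grads[key]


# With the recipe's mapping, 80% sparsity falls in the 0.75 bucket:
print(resolve_num_grads({0.0: 256, 0.5: 512, 0.75: 1024, 0.83: 1400}, 0.80))  # 1024
```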

research/mfac/recipes/pruning-mnistnet-one_shot-mfac.md

Lines changed: 4 additions & 5 deletions
@@ -23,13 +23,12 @@ pruning_modifiers:
 start_epoch: 0.0
 end_epoch: 1.0
 update_frequency: 1.0
-mfac_options:
-num_grads: 512
-fisher_block_size: 2000
+num_grads: 512
+fisher_block_size: 2000
 ---

 # Pruning MNISTNet with M-FAC
 This recipe prunes a model to 35% sparsity using the M-FAC pruning algorithm.
 It is intended for use with MNISTNet but could be used to prune other models
-in one shot, however the `final_sparsity` and `mfac_options` should be adjusted
-accordingly.
+in one shot, however the `final_sparsity`, `num_grads`, and `fisher_block_size`
+should be adjusted accordingly.

research/mfac/recipes/pruning-mobilenet-imagenette-mfac-short-95.md

Lines changed: 3 additions & 4 deletions
@@ -50,10 +50,9 @@ pruning_modifiers:
 end_epoch: *pruning_end_epoch
 update_frequency: *pruning_update_frequency
 mask_type: *pruning_mask_type
-mfac_options:
-num_grads: 256
-fisher_block_size: 2000
-available_gpus: ["cuda:0"]
+num_grads: 256
+fisher_block_size: 2000
+available_gpus: ["cuda:0"]

 - !SetWeightDecayModifier
 weight_decay: 0.0

src/sparseml/pytorch/sparsification/pruning/modifier_pruning_mfac.py

Lines changed: 20 additions & 16 deletions
@@ -404,6 +404,7 @@ def __init__(
         self._grad_buffer = None  # type: Tensor
         self._grads = None  # placeholder for all grads across buffers
         self._buffer_idx = 0
+        self._grads_collected = 0
         self._latest_h_inv_diag = None  # type: tuple

         # scale num_grads by number of DDP processes
@@ -434,13 +435,13 @@ def score_parameters(self) -> List[Tensor]:
             H^-1, scores will be W^2 / (2 * diag(H^-1))
         """

-        if self._grad_buffer is None or torch.any(
-            torch.all(self._grad_buffer == 0.0, dim=1)
+        if self._grads_collected < _get_num_grads_for_sparsity(
+            self._num_grads, self._last_applied_sparsity
         ):
             # raise Exception if grad buffer is not full
             raise RuntimeError(
-                "MFAC pruning step called, but not enough gradient samples have been "
-                f"collected. Expected {self._num_grads} samples"
+                f"MFAC pruning step called, but only {self._grads_collected} were "
+                f"collected from the expected {self._num_grads}."
             )

         if self._is_ddp:
@@ -519,6 +520,7 @@ def pre_optim_step_update(self, masks: List[Tensor]):
         # update buffer idx
         self._buffer_idx += 1
         self._buffer_idx %= self._grad_buffer.size(0)
+        self._grads_collected += 1

     @torch.no_grad()
     def mask_update(self, masks: List[Tensor], mask_diffs: List[Tensor]):
@@ -635,6 +637,7 @@ def _setup_grad_buffer(self, masks: Tensor):
             device="cpu",
         )
         self._buffer_idx = 0
+        self._grads_collected = 0


 """
@@ -1260,25 +1263,26 @@ def mul_blocked(self, x: Tensor, call_idx: int, device: str) -> Tensor:

         # Get the H^-1 values corresponding to the number of blocks used here.
         # It's clunky compared to torch.cat()[idx], but avoids duplicating
-        # the memory of H^-1
-        start_block = sum(self._num_blocks_per_device_call[:call_idx])
-        end_block = sum(self._num_blocks_per_device_call[: call_idx + 1])
+        # the memory of H^-1. Most of the logic deals with indexing into a list of
+        # tensors as one continuous tensor, to grab slices that may span separate
+        # tensors in the list
+        block_start = sum(self._num_blocks_per_device_call[:call_idx])
+        block_end = sum(self._num_blocks_per_device_call[: call_idx + 1])
         t_hinv = []
-        tensor_start = 0
-        tensor_end = 0
+        cont_end_idx = 0
         for tensor in self._hinvs:
-            tensor_end += len(tensor)
-            if start_block > tensor_end:
+            cont_start_idx = cont_end_idx
+            cont_end_idx += len(tensor)
+            if block_start > cont_end_idx:
                 continue
-            if end_block < tensor_end:
+            if block_end < cont_end_idx:
                 t_hinv.append(
-                    tensor[start_block - tensor_start : end_block - tensor_start]
+                    tensor[block_start - cont_start_idx : block_end - cont_start_idx]
                 )
                 break
             else:
-                t_hinv.append(tensor[start_block - tensor_start :])
-                start_block = tensor_end
-                tensor_start = tensor_end
+                t_hinv.append(tensor[block_start - cont_start_idx :])
+                block_start = cont_end_idx

         mul_slice = (
             torch.bmm(torch.cat(t_hinv).to(device), x_slice)
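
The indexing fix (#583) shown above slices a block range out of `self._hinvs`, a list of per-device tensors treated as one contiguous stack of blocks, without concatenating them first. Below is a standalone sketch of that slicing pattern under assumed names (`slice_block_range` is not part of SparseML); it should return the same result as `torch.cat(tensors)[block_start:block_end]` while only materializing the pieces it needs.

```python
# Illustrative sketch of slicing a contiguous block range out of a list of
# tensors without concatenating them all; mirrors the corrected indexing logic
# above but is not the SparseML implementation.
from typing import List

import torch


def slice_block_range(
    tensors: List[torch.Tensor], block_start: int, block_end: int
) -> torch.Tensor:
    pieces = []
    cont_end_idx = 0  # running end of the current tensor in "continuous" indexing
    for tensor in tensors:
        cont_start_idx = cont_end_idx
        cont_end_idx += len(tensor)
        if block_start >= cont_end_idx:
            continue  # requested range starts after this tensor
        if block_end <= cont_end_idx:
            # range ends inside this tensor: take the final partial slice
            pieces.append(tensor[block_start - cont_start_idx : block_end - cont_start_idx])
            break
        # range continues past this tensor: take its tail and keep going
        pieces.append(tensor[block_start - cont_start_idx :])
        block_start = cont_end_idx
    return torch.cat(pieces)


blocks = [torch.randn(2, 4, 4), torch.randn(3, 4, 4)]  # 5 blocks split across two tensors
assert torch.equal(slice_block_range(blocks, 1, 4), torch.cat(blocks)[1:4])
```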

tests/sparseml/pytorch/sparsification/pruning/test_mfac_inverse.py

Lines changed: 1 addition & 2 deletions
@@ -47,8 +47,7 @@
         pytest.param(
             ["cuda:0"],
             marks=pytest.mark.skipif(
-                "CUDA_VISIBLE_DEVICES" not in os.environ
-                or not os.getenv("CUDA_VISIBLE_DEVICES"),
+                not torch.cuda.is_available(),
                 reason="No CUDA devices available",
             ),
         ),

tests/sparseml/pytorch/sparsification/pruning/test_modifier_pruning_mfac.py

Lines changed: 11 additions & 0 deletions
@@ -96,6 +96,17 @@ def _build_gradient_sampler(
             num_grads=8,
             available_devices=["cpu"],
         ),
+        lambda: MFACPruningModifier(
+            params=["seq.fc1.weight", "seq.fc2.weight"],
+            init_sparsity=0.5,
+            final_sparsity=0.95,
+            start_epoch=2.0,
+            end_epoch=5.0,
+            update_frequency=1.0,
+            inter_func="cubic",
+            num_grads=8,
+            global_sparsity=True,
+        ),
     ],
     scope="function",
 )
