This repository was archived by the owner on Jun 3, 2025. It is now read-only.

Commit b176894

One shot MFAC pruning support (#305)
* One-shot MFAC pruning support
* GradSampler class
* using tensors_module_forward
* fix extra grad collection issue
* perturbation bug fix
* updates from review, rebasing onto latest
1 parent 2b84085 commit b176894

File tree

2 files changed, +137 -7 lines changed

  src/sparseml/pytorch/optim/modifier_pruning.py
  src/sparseml/pytorch/utils/mfac_helpers.py

src/sparseml/pytorch/optim/modifier_pruning.py

Lines changed: 41 additions & 6 deletions
@@ -39,6 +39,7 @@
     ScheduledUpdateModifier,
 )
 from sparseml.pytorch.utils import (
+    GradSampler,
     MFACOptions,
     NamedLayerParam,
     get_layer,
@@ -255,7 +256,7 @@ def initialize(
                 )
             )
 
-        self._check_mask_update(module, epoch, steps_per_epoch=1)
+        self._check_mask_update(module, epoch, steps_per_epoch=1, **kwargs)
 
     def finalize(
         self, module: Optional[Module] = None, reset_loggers: bool = True, **kwargs
@@ -332,7 +333,9 @@ def optimizer_post_step(
             self._module_masks.apply()
 
     @abstractmethod
-    def _check_mask_update(self, module: Module, epoch: float, steps_per_epoch: int):
+    def _check_mask_update(
+        self, module: Module, epoch: float, steps_per_epoch: int, **kwargs
+    ):
         raise NotImplementedError()
 
     def _should_log(
@@ -444,7 +447,9 @@ def __init__(
             log_types=log_types,
         )
 
-    def _check_mask_update(self, module: Module, epoch: float, steps_per_epoch: int):
+    def _check_mask_update(
+        self, module: Module, epoch: float, steps_per_epoch: int, **kwargs
+    ):
         if self.start_pending(epoch, steps_per_epoch):
             self._module_masks.set_param_masks_from_weights()
             self._module_masks.enabled = True
@@ -807,7 +812,9 @@ def validate(self):
                 ).format(self._inter_func, INTERPOLATION_FUNCS, self.__class__.__name__)
             )
 
-    def _check_mask_update(self, module: Module, epoch: float, steps_per_epoch: int):
+    def _check_mask_update(
+        self, module: Module, epoch: float, steps_per_epoch: int, **kwargs
+    ):
         """
         Check for updating the pruning masks at the given epoch.
         Called from both initialize and update.
@@ -822,8 +829,9 @@ def _check_mask_update(self, module: Module, epoch: float, steps_per_epoch: int)
             self._module_masks.enabled = True
             started = True
 
-        self._module_masks.pre_optim_step_update()
-        self._pre_step_completed = True
+        if not self._pre_step_completed:
+            self._module_masks.pre_optim_step_update()
+            self._pre_step_completed = True
 
         if started:
             # set the mask tensors according to the new sparsity
@@ -1359,6 +1367,16 @@ def mfac_options(self) -> Dict[str, Any]:
         """
         return self._mfac_options
 
+    def _check_mask_update(
+        self, module: Module, epoch: float, steps_per_epoch: int, **kwargs
+    ):
+        # create grads for one-shot pruning
+        if "grad_sampler" in kwargs:
+            self._collect_grad_samples(module, kwargs["grad_sampler"])
+            self._pre_step_completed = True
+
+        super()._check_mask_update(module, epoch, steps_per_epoch, **kwargs)
+
     def _create_pruning_mask(
         self, layers: List[Module], layer_names: List[str], param_names: List[str]
     ) -> ModuleParamPruningMask:
@@ -1371,6 +1389,23 @@ def _create_pruning_mask(
             score_type=MFACOptions(**self._mfac_options),
         )
 
+    def _collect_grad_samples(
+        self,
+        module: Module,
+        grad_sampler: GradSampler,
+    ):
+        if not isinstance(grad_sampler, GradSampler):
+            raise ValueError(
+                "One-shot MFAC pruning requires a GradSampler object given by the "
+                f"grad_sampler kwarg. Given an object of type {type(grad_sampler)}"
+            )
+        num_grads = MFACOptions(**self._mfac_options).get_num_grads_for_sparsity(
+            self._applied_sparsity or 0.0
+        )
+
+        for _ in grad_sampler.iter_module_backwards(module, num_grads):
+            self._module_masks.pre_optim_step_update()
+
 
 @PyTorchModifierYAML()
 class MFACGlobalPruningModifier(MFACPruningModifier):
src/sparseml/pytorch/utils/mfac_helpers.py

Lines changed: 96 additions & 1 deletion
@@ -21,14 +21,29 @@
 import threading
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import torch
 from torch import Tensor
+from torch.nn import Module
 from torch.nn.parallel.parallel_apply import parallel_apply
 
+from sparseml.pytorch.utils import tensors_module_forward
+
 
 __all__ = [
+    "GradSampler",
     "MFACOptions",
     "FisherInverse",
     "FisherInverseFast",
@@ -38,6 +53,86 @@
 ]
 
 
+class GradSampler:
+    """
+    Class for computing gradient samples for a Model given a sample data loader and
+    loss function.
+
+    :param data_loader: iterator of data samples to use as model inputs and their loss
+        targets. Samples can either be single tensors as model input or a list of
+        inputs and should be iterated in tuples with their targets
+    :param loss_fn: function to be called on model outputs to compute the loss at
+        each step
+    """
+
+    def __init__(
+        self,
+        data_loader: Iterator[Tuple[Union[Tensor, List[Tensor]], Any]],
+        loss_fn: Callable[[Tensor], Tensor],
+    ):
+        if not isinstance(data_loader, Iterable):
+            raise ValueError(
+                "data_loader for GradSampler must be Iterable, received object of "
+                f"type {type(data_loader)}"
+            )
+        if not callable(loss_fn):
+            raise ValueError(
+                "loss_fn for GradSampler must be callable, given input "
+                f"with type {type(loss_fn)}"
+            )
+
+        self._data_loader = data_loader
+        self._loss_fn = loss_fn
+
+    def module_forward(self, module: Module, data: Union[Tensor, List[Tensor]]) -> Any:
+        """
+        :param module: module to perform forward pass with
+        :param data: single data sample to pass to module
+        :return: output(s) of the module forward pass
+        """
+        if isinstance(data, Tensor):
+            data = [data]
+
+        return tensors_module_forward(*data, module)
+
+    def module_backward(self, module_outputs: Any, targets: Any):
+        """
+        Computes module loss based on the given module outputs, target data and loss
+        function
+
+        :param module_outputs: outputs of a forward pass from a module
+        :param targets: target outputs for the module to be used for the loss function
+        """
+        loss = self._loss_fn(module_outputs, targets)
+        loss.backward()
+
+    def iter_module_backwards(
+        self, module: Module, num_grads: int
+    ) -> Generator[int, None, None]:
+        """
+
+        :param module: module to compute gradients for
+        :param num_grads: number of gradient samples to compute
+        :return: generator that yields after every gradient is computed with the index
+            of the gradient sample number
+        """
+        computed_grads = 0
+
+        while computed_grads < num_grads:
+            for sample, target in self._data_loader:
+                # run sample forward and backwards pass
+                model_outputs = self.module_forward(module, sample)
+                self.module_backward(model_outputs, target)
+
+                # yield so gradients can be collected
+                computed_grads += 1
+                yield computed_grads
+
+                if computed_grads >= num_grads:
+                    break
+                module.zero_grad()
+
+
 @dataclass
 class MFACOptions:
     """
