
Commit 1cfc4b8

separate layer-wise and global WoodFisher pruning modifiers (#294)

* define final sparsity by parameter in torch pruning modifiers
* separate layer-wise and global WoodFisher pruning modifiers
* get_num_grads support for multiple applied sparsities
* fixing tests after rebase

1 parent 48f7c5e commit 1cfc4b8
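To make the split concrete, here is a minimal usage sketch based on this diff: MFACPruningModifier now prunes layer-wise by default (global_sparsity=False), while the new MFACGlobalPruningModifier pins global_sparsity=True. The import path and argument values are assumptions for illustration, not taken from a released version.

# Hypothetical sketch of the two modifiers after this commit.
from sparseml.pytorch.optim import (  # import path assumed
    MFACGlobalPruningModifier,
    MFACPruningModifier,
)

# layer-wise: each matched parameter ramps to final_sparsity independently
layerwise = MFACPruningModifier(
    init_sparsity=0.05,
    final_sparsity=0.8,
    start_epoch=0.0,
    end_epoch=10.0,
    update_frequency=1.0,
    params="__ALL_PRUNABLE__",
)

# global: one sparsity budget is scored across all matched parameters together
global_pruning = MFACGlobalPruningModifier(
    init_sparsity=0.05,
    final_sparsity=0.8,
    start_epoch=0.0,
    end_epoch=10.0,
    update_frequency=1.0,
    params="__ALL_PRUNABLE__",
)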

File tree

3 files changed: +208 −10 lines

src/sparseml/pytorch/optim/modifier_pruning.py

Lines changed: 112 additions & 8 deletions
@@ -63,6 +63,7 @@
     "GMPruningModifier",
     "MagnitudePruningModifier",
     "MFACPruningModifier",
+    "MFACGlobalPruningModifier",
     "MovementPruningModifier",
     "GlobalMagnitudePruningModifier",
     "LayerPruningModifier",
@@ -825,6 +826,7 @@ def _check_mask_update(self, module: Module, epoch: float, steps_per_epoch: int)
         self._pre_step_completed = True
 
         if started:
+            # set the mask tensors according to the new sparsity
             if isinstance(self._final_sparsity, List):
                 self._applied_sparsity = [
                     interpolate(
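For context on the branch the new comment annotates: when final_sparsity is a list (one target per parameter), an applied sparsity is interpolated per entry at each update step. Below is a self-contained sketch of a cubic ramp, the default inter_func; it is written for illustration and does not claim to match sparseml's interpolate helper exactly.

# Hedged sketch: ease-out cubic interpolation of applied sparsity per target.
def cubic_interpolate(epoch, start_epoch, end_epoch, init_val, final_val):
    # clamp progress to [0, 1], then apply an ease-out cubic ramp
    t = min(max((epoch - start_epoch) / (end_epoch - start_epoch), 0.0), 1.0)
    return init_val + (final_val - init_val) * (1 - (1 - t) ** 3)

final_sparsity = [0.8, 0.9]  # hypothetical per-parameter targets
applied = [cubic_interpolate(5.0, 0.0, 10.0, 0.05, fs) for fs in final_sparsity]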
@@ -1294,6 +1296,8 @@ class MFACPruningModifier(GMPruningModifier):
     :param mask_type: String to define type of sparsity (options: ['unstructured',
         'channel', 'filter']), List to define block shape of a parameter's in and out
         channels, or a SparsityMaskCreator object. default is 'unstructured'
+    :param global_sparsity: set True to enable global pruning. if False, pruning will
+        be layer-wise. Default is False
     :param mfac_options: Dictionary of key words specifying arguments for the M-FAC
         pruning run. num_grads controls the number of gradient samples that are kept,
         fisher_block_size specifies the block size to break the M-FAC computation into
@@ -1316,6 +1320,7 @@ def __init__(
         phased: bool = False,
         log_types: Union[str, List[str]] = ALL_TOKEN,
         mask_type: Union[str, List[int], PruningMaskCreator] = "unstructured",
+        global_sparsity: bool = False,
         mfac_options: Dict[str, Any] = None,
     ):
         super().__init__(
@@ -1330,18 +1335,11 @@
             phased=phased,
             log_types=log_types,
             mask_type=mask_type,
-            global_sparsity=True,
+            global_sparsity=global_sparsity,
             score_type="MFAC",
         )
         self._mfac_options = mfac_options or {}
 
-    @ModifierProp(serializable=False)
-    def global_sparsity(self) -> bool:
-        """
-        :return: True if global pruning is enabled, False otherwise
-        """
-        return self._global_sparsity
-
     @ModifierProp(serializable=False)
     def score_type(self) -> str:
         """
@@ -1374,6 +1372,112 @@ def _create_pruning_mask(
         )
 
 
+@PyTorchModifierYAML()
+class MFACGlobalPruningModifier(MFACPruningModifier):
+    """
+    Gradually applies kernel sparsity to a given parameter or parameters from
+    init_sparsity until final_sparsity is reached over a given amount of time
+    and applied with an interpolated function for each step taken.
+
+    Uses the Matrix-Free Approximate Curvature (M-FAC) algorithm for solving
+    for optimal pruning updates by estimating the inverse Hessian matrix to the
+    loss over time under the Optimal Brain Surgeon (OBS) framework.
+    A link to the paper will be included here in an upcoming update.
+
+    | Sample yaml:
+    |   !MFACGlobalPruningModifier
+    |       init_sparsity: 0.05
+    |       final_sparsity: 0.8
+    |       start_epoch: 0.0
+    |       end_epoch: 10.0
+    |       update_frequency: 1.0
+    |       params: ["re:.*weight"]
+    |       leave_enabled: True
+    |       inter_func: cubic
+    |       log_types: __ALL__
+    |       mask_type: unstructured
+    |       mfac_options:
+    |           num_grads: {0.0: 64, 0.5: 128, 0.75: 256, 0.85: 512}
+    |           fisher_block_size: 10000
+    |           available_gpus: ["cuda:0"]
+
+    :param init_sparsity: the initial sparsity for the param to start with at
+        start_epoch
+    :param final_sparsity: the final sparsity for the param to end with at end_epoch.
+        Can also be a Dict of final sparsity values to a list of parameters to apply
+        them to. If given a Dict, then params must be set to [] and the params to
+        be pruned will be read from the final_sparsity Dict
+    :param start_epoch: The epoch to start the modifier at
+    :param end_epoch: The epoch to end the modifier at
+    :param update_frequency: The number of epochs or fraction of epochs to update at
+        between start and end
+    :param params: A list of full parameter names or regex patterns of names to apply
+        pruning to. Regex patterns must be specified with the prefix 're:'. __ALL__
+        will match to all parameters. __ALL_PRUNABLE__ will match to all ConvNd
+        and Linear layers' weights. If a sparsity to param mapping is defined by
+        final_sparsity, then params should be set to []
+    :param leave_enabled: True to continue masking the weights after end_epoch,
+        False to stop masking. Should be set to False if exporting the result
+        immediately after or doing some other prune
+    :param inter_func: the type of interpolation function to use:
+        [linear, cubic, inverse_cubic]
+    :param phased: True to enable a phased approach where pruning will
+        turn on and off with the update_frequency. Starts with pruning on
+        at start_epoch, off at start_epoch + update_frequency, and so on.
+    :param log_types: The loggers to allow the learning rate to be logged to,
+        default is __ALL__
+    :param mask_type: String to define type of sparsity (options: ['unstructured',
+        'channel', 'filter']), List to define block shape of a parameter's in and out
+        channels, or a SparsityMaskCreator object. default is 'unstructured'
+    :param mfac_options: Dictionary of key words specifying arguments for the M-FAC
+        pruning run. num_grads controls the number of gradient samples that are kept,
+        fisher_block_size specifies the block size to break the M-FAC computation into
+        (default is 2000, use None for no blocks), available_gpus specifies a list
+        of device ids that can be used for computation. For a full list of options,
+        see the MFACOptions dataclass documentation. Default configuration uses
+        CPU for computation without blocked computation
+    """
+
+    def __init__(
+        self,
+        init_sparsity: float,
+        final_sparsity: Union[float, Dict[float, List[str]]],
+        start_epoch: float,
+        end_epoch: float,
+        update_frequency: float,
+        params: Union[str, List[str]],
+        leave_enabled: bool = True,
+        inter_func: str = "cubic",
+        phased: bool = False,
+        log_types: Union[str, List[str]] = ALL_TOKEN,
+        mask_type: Union[str, List[int], PruningMaskCreator] = "unstructured",
+        mfac_options: Dict[str, Any] = None,
+    ):
+        super().__init__(
+            init_sparsity=init_sparsity,
+            final_sparsity=final_sparsity,
+            start_epoch=start_epoch,
+            end_epoch=end_epoch,
+            update_frequency=update_frequency,
+            params=params,
+            leave_enabled=leave_enabled,
+            inter_func=inter_func,
+            phased=phased,
+            log_types=log_types,
+            mask_type=mask_type,
+            global_sparsity=True,
+            mfac_options=mfac_options,
+        )
+        self._mfac_options = mfac_options or {}
+
+    @ModifierProp(serializable=False)
+    def global_sparsity(self) -> bool:
+        """
+        :return: True if global pruning is enabled, False otherwise
+        """
+        return self._global_sparsity
+
+
 @PyTorchModifierYAML()
 class LayerPruningModifier(ScheduledUpdateModifier):
     """

src/sparseml/pytorch/utils/mfac_helpers.py

Lines changed: 3 additions & 1 deletion
@@ -68,9 +68,11 @@ class MFACOptions:
     num_pages: int = 1  # break computation into pages when block size is None
     available_gpus: List[str] = field(default_factory=list)
 
-    def get_num_grads_for_sparsity(self, sparsity: float) -> int:
+    def get_num_grads_for_sparsity(self, sparsity: Union[float, List[float]]) -> int:
         if isinstance(self.num_grads, int):
             return self.num_grads
+        if isinstance(sparsity, List):
+            sparsity = sum(sparsity) / len(sparsity)
 
         sparsity_thresholds = list(sorted(self.num_grads, key=lambda key: float(key)))
         if 0.0 not in sparsity_thresholds:
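To illustrate the new list handling: with a num_grads schedule keyed by sparsity thresholds, multiple applied sparsities are first reduced to their mean before the lookup. The threshold-selection rule below (largest key not exceeding the current sparsity) is an assumption, since the diff truncates the original selection logic; this is a standalone sketch, not the MFACOptions class itself.

# Hypothetical standalone version of the lookup for illustration.
num_grads = {0.0: 64, 0.5: 128, 0.75: 256, 0.85: 512}

def get_num_grads_for_sparsity(sparsity):
    if isinstance(sparsity, list):
        # new in this commit: average the per-parameter sparsities first
        sparsity = sum(sparsity) / len(sparsity)
    thresholds = sorted(num_grads, key=float)
    # assumed rule: pick the largest threshold not exceeding current sparsity
    chosen = max(t for t in thresholds if float(t) <= sparsity)
    return num_grads[chosen]

print(get_num_grads_for_sparsity(0.6))         # -> 128
print(get_num_grads_for_sparsity([0.7, 0.9]))  # mean 0.8 -> 256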

tests/sparseml/pytorch/optim/test_modifier_pruning.py

Lines changed: 93 additions & 1 deletion
@@ -26,6 +26,7 @@
     GMPruningModifier,
     LayerPruningModifier,
     MagnitudePruningModifier,
+    MFACGlobalPruningModifier,
     MFACPruningModifier,
     MovementPruningModifier,
     load_mask_creator,
@@ -357,6 +358,7 @@ def test_lifecycle(
                 applied_sparsities, last_sparsities
             )
         )
+
         last_sparsities = applied_sparsities
 
         _ = model(test_batch)  # check forward pass
@@ -813,6 +815,7 @@ def test_mfac_pruning_yaml():
     params = "__ALL_PRUNABLE__"
     inter_func = "cubic"
     mask_type = "unstructured"
+    global_sparsity = False
    mfac_options = {"num_grads": 64, "available_gpus": ["cuda:0", "cuda:1"]}
     yaml_str = f"""
     !MFACPruningModifier
@@ -824,9 +827,10 @@ def test_mfac_pruning_yaml():
         params: {params}
         inter_func: {inter_func}
         mask_type: {mask_type}
+        global_sparsity: {global_sparsity}
         mfac_options: {mfac_options}
     """
-    yaml_modifier = MFACPruningModifier.load_obj(yaml_str)  # type: MFACPruningModifier
+    yaml_modifier = MFACPruningModifier.load_obj(yaml_str)
     serialized_modifier = MFACPruningModifier.load_obj(
         str(yaml_modifier)
     )  # type: MFACPruningModifier
@@ -839,6 +843,7 @@ def test_mfac_pruning_yaml():
         params=params,
         inter_func=inter_func,
         mask_type=mask_type,
+        global_sparsity=global_sparsity,
         mfac_options=mfac_options,
     )

@@ -879,6 +884,93 @@ def test_mfac_pruning_yaml():
         == str(serialized_modifier.mask_type)
         == str(obj_modifier.mask_type)
     )
+    assert (
+        str(yaml_modifier.global_sparsity)
+        == str(serialized_modifier.global_sparsity)
+        == str(obj_modifier.global_sparsity)
+    )
+    assert (
+        yaml_modifier.mfac_options
+        == serialized_modifier.mfac_options
+        == obj_modifier.mfac_options
+    )
+
+
+def test_global_mfac_pruning_yaml():
+    init_sparsity = 0.05
+    final_sparsity = 0.8
+    start_epoch = 5.0
+    end_epoch = 15.0
+    update_frequency = 1.0
+    params = "__ALL_PRUNABLE__"
+    inter_func = "cubic"
+    mask_type = "unstructured"
+    mfac_options = {"num_grads": 64, "available_gpus": ["cuda:0", "cuda:1"]}
+    yaml_str = f"""
+    !MFACGlobalPruningModifier
+        init_sparsity: {init_sparsity}
+        final_sparsity: {final_sparsity}
+        start_epoch: {start_epoch}
+        end_epoch: {end_epoch}
+        update_frequency: {update_frequency}
+        params: {params}
+        inter_func: {inter_func}
+        mask_type: {mask_type}
+        mfac_options: {mfac_options}
+    """
+    yaml_modifier = MFACGlobalPruningModifier.load_obj(yaml_str)
+    serialized_modifier = MFACGlobalPruningModifier.load_obj(
+        str(yaml_modifier)
+    )  # type: MFACGlobalPruningModifier
+    obj_modifier = MFACGlobalPruningModifier(
+        init_sparsity=init_sparsity,
+        final_sparsity=final_sparsity,
+        start_epoch=start_epoch,
+        end_epoch=end_epoch,
+        update_frequency=update_frequency,
+        params=params,
+        inter_func=inter_func,
+        mask_type=mask_type,
+        mfac_options=mfac_options,
+    )
+
+    assert isinstance(yaml_modifier, MFACGlobalPruningModifier)
+    assert (
+        yaml_modifier.init_sparsity
+        == serialized_modifier.init_sparsity
+        == obj_modifier.init_sparsity
+    )
+    assert (
+        yaml_modifier.final_sparsity
+        == serialized_modifier.final_sparsity
+        == obj_modifier.final_sparsity
+    )
+    assert (
+        yaml_modifier.start_epoch
+        == serialized_modifier.start_epoch
+        == obj_modifier.start_epoch
+    )
+    assert (
+        yaml_modifier.end_epoch
+        == serialized_modifier.end_epoch
+        == obj_modifier.end_epoch
+    )
+    assert (
+        yaml_modifier.update_frequency
+        == serialized_modifier.update_frequency
+        == obj_modifier.update_frequency
+    )
+    assert yaml_modifier.params == serialized_modifier.params == obj_modifier.params
+    assert (
+        yaml_modifier.inter_func
+        == serialized_modifier.inter_func
+        == obj_modifier.inter_func
+    )
+    assert (
+        str(yaml_modifier.mask_type)
+        == str(serialized_modifier.mask_type)
+        == str(obj_modifier.mask_type)
+    )
     assert (
         yaml_modifier.mfac_options
         == serialized_modifier.mfac_options
