|
16 | 16 | Utility / helper functions |
17 | 17 | """ |
18 | 18 |
|
| 19 | +import logging |
19 | 20 | import random |
20 | 21 | import re |
21 | 22 | import warnings |
|
84 | 85 | "get_layer_param", |
85 | 86 | "set_deterministic_seeds", |
86 | 87 | "torch_distributed_zero_first", |
| 88 | + "thin_model_from_checkpoint", |
87 | 89 | ] |
88 | 90 |
|
89 | 91 |
|
| 92 | +_LOGGER = logging.getLogger(__name__) |
| 93 | + |
| 94 | + |
90 | 95 | ############################## |
91 | 96 | # |
92 | 97 | # pytorch device helpers |
@@ -957,3 +962,99 @@ def torch_distributed_zero_first(local_rank: int): |
957 | 962 | yield |
958 | 963 | if local_rank == 0: |
959 | 964 | torch.distributed.barrier() |
| 965 | + |
| 966 | + |
| 967 | +def thin_model_from_checkpoint(model: Module, state_dict: Dict[str, Any]): |
| 968 | + """ |
| 969 | + Updates any Linear/Conv/BN layers in the given model to match their |
| 970 | + respective shapes in the given state dict. Intended for compatibility |
| 971 | + when loading weights for a model from a checkpoint of the same architecture |
| 972 | + that may have had structured thinning (pruning) applied. Note that this |
| 973 | + function makes no guarantee of accuracy; it only resizes model parameters |
| 974 | + for loading compatibility. All adjustments are made in place |
| 975 | +
|
| 976 | + :param model: model to potentially adjust parameter shapes of |
| 977 | + :param state_dict: state dict to infer parameter shapes from |
| 978 | + """ |
| 979 | + first_thinned = True |
| 980 | + for param_name, checkpoint_tens in state_dict.items(): |
| 981 | + if not param_name.endswith(".weight"): |
| 982 | + continue # only deal with weight params of modules |
| 983 | + layer_name = param_name[:-7] # strip the trailing ".weight" |
| 984 | + layer = get_layer(layer_name, model) |
| 985 | + |
| 986 | + if not hasattr(layer, "weight") or ( |
| 987 | + layer.weight.shape == checkpoint_tens.shape |
| 988 | + ): |
| 989 | + continue # skip if there is no update to shape |
| 990 | + |
| 991 | + # quick check that target layer is some flavor of FC/Conv/BN |
| 992 | + layer_type = layer.__class__.__name__ |
| 993 | + if not ( |
| 994 | + "Linear" in layer_type |
| 995 | + or "Conv" in layer_type |
| 996 | + or "BatchNorm" in layer_type |
| 997 | + ): |
| 998 | + continue |
| 999 | + |
| 1000 | + orig_shape = layer.weight.shape |
| 1001 | + target_shape = checkpoint_tens.shape |
| 1002 | + |
| 1003 | + # update weight param + grad |
| 1004 | + if len(target_shape) > 1: |
| 1005 | + layer.weight.data = layer.weight.data[ |
| 1006 | + : target_shape[0], : target_shape[1], ... |
| 1007 | + ] |
| 1008 | + if layer.weight.grad is not None: |
| 1009 | + layer.weight.grad = layer.weight.grad[ |
| 1010 | + : target_shape[0], : target_shape[1], ... |
| 1011 | + ] |
| 1012 | + else: |
| 1013 | + layer.weight.data = layer.weight.data[: target_shape[0]] |
| 1014 | + if layer.weight.grad is not None: |
| 1015 | + layer.weight.grad = layer.weight.grad[: target_shape[0]] |
| 1016 | + |
| 1017 | + # update bias param + grad |
| 1018 | + if hasattr(layer, "bias") and layer.bias is not None: |
| 1019 | + # target output channels should be the first dim of target shape |
| 1020 | + layer.bias.data = layer.bias.data[: target_shape[0]] |
| 1021 | + if layer.bias.grad is not None: |
| 1022 | + layer.bias.grad = layer.bias.grad[: target_shape[0]] |
| 1023 | + |
| 1024 | + # update layer attributes |
| 1025 | + if "BatchNorm" in layer_type: |
| 1026 | + if hasattr(layer, "num_features"): |
| 1027 | + layer.num_features = layer.weight.size(0) |
| 1028 | + # BN running mean/var are buffers, not Parameters; only their size matters here |
| 1029 | + if hasattr(layer, "running_mean"): |
| 1030 | + layer.running_mean = torch.zeros_like(layer.running_mean)[ |
| 1031 | + : target_shape[0] |
| 1032 | + ] |
| 1033 | + if hasattr(layer, "running_var"): |
| 1034 | + layer.running_var = torch.zeros_like(layer.running_var)[ |
| 1035 | + : target_shape[0] |
| 1036 | + ] |
| 1037 | + |
| 1038 | + if "Linear" in layer_type: |
| 1039 | + if hasattr(layer, "out_features"): |
| 1040 | + layer.out_features = layer.weight.shape[0] |
| 1041 | + if hasattr(layer, "in_features"): |
| 1042 | + layer.in_features = layer.weight.shape[1] |
| 1043 | + |
| 1044 | + if "Conv" in layer_type: |
| 1045 | + if hasattr(layer, "out_channels"): |
| 1046 | + layer.out_channels = layer.weight.shape[0] |
| 1047 | + if hasattr(layer, "in_channels"): |
| 1048 | + layer.in_channels = layer.weight.shape[1] |
| 1049 | + if hasattr(layer, "groups") and layer.groups > 1: |
| 1050 | + layer.groups = layer.weight.shape[0] // layer.weight.shape[1] # assumes depthwise-style grouping |
| 1051 | + |
| 1052 | + if first_thinned: |
| 1053 | + _LOGGER.info( |
| 1054 | + "Thinning module layers for compatibility with given state dict:" |
| 1055 | + ) |
| 1056 | + first_thinned = False |
| 1057 | + _LOGGER.info( |
| 1058 | + f"Thinned layer {layer_name} from shape {orig_shape} to " |
| 1059 | + f"{layer.weight.shape}" |
| 1060 | + ) |
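
A minimal usage sketch for the new helper (the torchvision architecture, checkpoint path, and import path below are illustrative assumptions, not part of this change): resize the model's Linear/Conv/BN layers to match a structurally thinned checkpoint, then load the weights as usual.

import torch
from torchvision.models import resnet50  # placeholder architecture

# import path assumed; use wherever this utilities module lives in your package
from sparseml.pytorch.utils import thin_model_from_checkpoint

model = resnet50()
# hypothetical checkpoint saved from a thinned model of the same architecture
state_dict = torch.load("thinned_checkpoint.pth", map_location="cpu")

# adjust parameter shapes in place so the checkpoint can be loaded
thin_model_from_checkpoint(model, state_dict)
model.load_state_dict(state_dict)

The helper only resolves shape mismatches; missing or unexpected keys between the model and the checkpoint are not addressed and will still surface from load_state_dict.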