
Commit 3ccb428

Efficientnet fixes (#1035)
* Quantization improvements
* Replace Swish by SiLU. Fix padding of input conv
* Update image size
* Style and quality fixes
1 parent 4bf5d02 commit 3ccb428
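
The combined changes can be smoke-tested by building one of the affected models and running a forward pass. A minimal sketch, assuming the registry key follows the params_dict naming shown further down and that ModelRegistry.create works with its default arguments:

import torch

from sparseml.pytorch.models.registry import ModelRegistry

# Hypothetical smoke test: the "efficientnet_b2" key and default create()
# arguments are assumed, not confirmed by this commit.
model = ModelRegistry.create("efficientnet_b2")
model.eval()

# 288x288 is the updated recommended resolution for b2 in this commit; the
# network uses adaptive pooling, so other sizes also run.
with torch.no_grad():
    out = model(torch.randn(1, 3, 288, 288))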

File tree

2 files changed: +88, -31 lines

src/sparseml/pytorch/models/classification/efficientnet.py

Lines changed: 87 additions & 30 deletions
@@ -19,8 +19,9 @@
 
 import math
 from collections import OrderedDict
-from typing import List, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
+import torch
 from torch import Tensor
 from torch.nn import (
     AdaptiveAvgPool2d,
@@ -31,11 +32,18 @@
     Module,
     Sequential,
     Sigmoid,
+    SiLU,
     Softmax,
 )
 
+
+try:
+    from torch.nn.quantized import FloatFunctional
+except Exception:
+    FloatFunctional = None
+
 from sparseml.pytorch.models.registry import ModelRegistry
-from sparseml.pytorch.nn import SqueezeExcite, Swish
+from sparseml.pytorch.nn import SqueezeExcite
 
 
 __all__ = [
@@ -52,6 +60,36 @@
 ]
 
 
+class _Add(Module):
+    def __init__(self):
+        super().__init__()
+
+        if FloatFunctional:
+            self.functional = FloatFunctional()
+        self.wrap_qat = True
+        self.qat_wrapper_kwargs = {
+            "num_inputs": 2,
+            "num_outputs": 0,
+        }
+
+    def forward(self, a: Tensor, b: Tensor):
+        if FloatFunctional:
+            return self.functional.add(a, b)
+        else:
+            return torch.add(a, b)
+
+
+class QATSiLU(SiLU):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.wrap_qat = True
+        self.qat_wrapper_kwargs = {
+            "num_inputs": 1,
+            "num_outputs": 1,
+        }
+
+
 class _InvertedBottleneckBlock(Module):
     def __init__(
         self,
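
The _Add module and QATSiLU above exist so the residual add and the activation can carry their own observers during quantization-aware training; wrap_qat / qat_wrapper_kwargs are a SparseML convention for that wrapping. Outside quantized mode, FloatFunctional.add is a plain pass-through around torch.add, so FP32 behavior is unchanged. A quick check of that equivalence (assumes a torch build that still ships the torch.nn.quantized namespace):

import torch
from torch.nn.quantized import FloatFunctional

a, b = torch.randn(2, 8), torch.randn(2, 8)

functional_add = FloatFunctional()
# In FP32 mode the post-process hook is an Identity, so this equals a + b.
assert torch.equal(functional_add.add(a, b), a + b)
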
@@ -68,7 +106,13 @@ def __init__(
         self._out_channels = out_channels
         self._stride = stride
         self._se_mod = se_mod
-        expanded_channels = int(in_channels * expansion_ratio)
+        expanded_channels = _scale_num_channels(in_channels, expansion_ratio)
+        squeezed_channels = (
+            max(1, int(in_channels * se_ratio))
+            if se_ratio and 0 < se_ratio <= 1
+            else None
+        )
+
         self.expand = (
             Sequential(
                 OrderedDict(
@@ -83,7 +127,10 @@ def __init__(
                             ),
                         ),
                         ("bn", BatchNorm2d(num_features=expanded_channels)),
-                        ("act", Swish(num_channels=expanded_channels)),
+                        (
+                            "act",
+                            QATSiLU() if squeezed_channels else SiLU(),
+                        ),
                     ]
                 )
             )
@@ -108,30 +155,32 @@ def __init__(
                         ),
                     ),
                     ("bn", BatchNorm2d(num_features=expanded_channels)),
-                    ("act", Swish(num_channels=expanded_channels)),
+                    (
+                        "act",
+                        QATSiLU() if squeezed_channels else SiLU(),
+                    ),
                 ]
             )
         )
 
-        squeezed_channels = (
-            max(1, int(in_channels * se_ratio))
-            if se_ratio and 0 < se_ratio <= 1
-            else None
-        )
-
         if self._se_mod:
             self.se = (
-                SqueezeExcite(out_channels, squeezed_channels)
+                SqueezeExcite(out_channels, squeezed_channels, "silu")
                 if squeezed_channels
                 else None
             )
         else:
             self.se = (
-                SqueezeExcite(expanded_channels, squeezed_channels)
+                SqueezeExcite(expanded_channels, squeezed_channels, "silu")
                 if squeezed_channels
                 else None
             )
 
+        if self._stride == 1 and self._in_channels == self._out_channels:
+            self.add = _Add()
+        else:
+            self.add = None
+
         self.project = Sequential(
             OrderedDict(
                 [
@@ -165,8 +214,8 @@ def forward(self, inp: Tensor):
         if self.se is not None and self._se_mod:
             out = out * self.se(out)
 
-        if self._stride == 1 and self._in_channels == self._out_channels:
-            out = out + inp
+        if self.add is not None:
+            out = self.add(out, inp)
 
         return out
 
@@ -188,7 +237,7 @@ def __init__(
             bias=False,
         )
         self.bn = BatchNorm2d(num_features=out_channels)
-        self.act = Swish(out_channels)
+        self.act = SiLU()
         self.pool = AdaptiveAvgPool2d(1)
         self.dropout = Dropout(p=dropout)
         self.fc = Linear(out_channels, classes)
@@ -283,11 +332,12 @@ def __init__(
                             out_channels=sec_settings[0].in_channels,
                             kernel_size=3,
                             stride=2,
+                            padding=1,
                             bias=False,
                         ),
                     ),
                     ("bn", BatchNorm2d(num_features=sec_settings[0].in_channels)),
-                    ("act", Swish(sec_settings[0].in_channels)),
+                    ("act", SiLU()),
                 ]
             )
         )
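
The added padding=1 on the stem's 3x3 stride-2 convolution keeps the usual EfficientNet halving of the input resolution (224 -> 112 for b0) instead of dropping a row and column. A quick shape check; the 32 stem channels are used only for illustration:

import torch
from torch.nn import Conv2d

x = torch.randn(1, 3, 224, 224)

# Without padding: floor((224 - 3) / 2) + 1 = 111
print(Conv2d(3, 32, kernel_size=3, stride=2, bias=False)(x).shape)
# torch.Size([1, 32, 111, 111])

# With padding=1: floor((224 + 2 - 3) / 2) + 1 = 112
print(Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False)(x).shape)
# torch.Size([1, 32, 112, 112])
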
@@ -317,33 +367,40 @@ def create_section(settings: EfficientNetSectionSettings) -> Sequential:
         stride = settings.stride
         blocks = []
 
-        for _ in range(settings.num_blocks):
+        for block in range(settings.num_blocks):
             blocks.append(
                 _InvertedBottleneckBlock(
-                    in_channels=in_channels,
+                    in_channels=in_channels if block == 0 else settings.out_channels,
                     out_channels=settings.out_channels,
                     kernel_size=settings.kernel_size,
                     expansion_ratio=settings.expansion_ratio,
-                    stride=stride,
+                    stride=stride if block == 0 else 1,
                     se_ratio=settings.se_ratio,
                     se_mod=settings.se_mod,
                 )
             )
-            in_channels = settings.out_channels
 
         return Sequential(*blocks)
 
 
-def _scale_num_channels(channels: int, width_mult: float) -> int:
-    divisor = 8
-    scaled = channels * width_mult
-    scaled = max(divisor, int(scaled + divisor / 2) // divisor * divisor)
+def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
+    """
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return int(math.ceil(new_v))
 
-    if scaled < 0.9 * channels:
-        # prevent rounding by more than 10%
-        scaled += divisor
 
-    return int(scaled)
+def _scale_num_channels(channels: int, width_mult: float) -> int:
+    return _make_divisible(channels * width_mult, 8)
 
 
 def _scale_num_blocks(blocks: int, depth_mult: float) -> int:
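
For reference, the extracted _make_divisible helper rounds channel counts to a multiple of 8 while never shrinking them by more than 10%. A standalone worked example of the same arithmetic (the formula is repeated here rather than importing the private helper):

divisor = 8

# channels=32, width_mult=1.1  ->  v = 35.2
v = 32 * 1.1
new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)  # int(39.2) // 8 * 8 = 32
assert new_v == 32 and new_v >= 0.9 * v  # 32 >= 31.68, so no +divisor correction

# channels=32, width_mult=1.2  ->  v = 38.4 rounds up to 40
v = 32 * 1.2
new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)  # int(42.4) // 8 * 8 = 40
assert new_v == 40
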
@@ -438,7 +495,7 @@ def _efficient_net_params(model_name):
     params_dict = {
         "efficientnet_b0": (1.0, 1.0, 0.2, 224),
        "efficientnet_b1": (1.0, 1.1, 0.2, 240),
-        "efficientnet_b2": (1.1, 1.2, 0.3, 260),
+        "efficientnet_b2": (1.1, 1.2, 0.3, 288),
         "efficientnet_b3": (1.2, 1.4, 0.3, 300),
         "efficientnet_b4": (1.4, 1.8, 0.4, 380),
         "efficientnet_b5": (1.6, 2.2, 0.4, 456),

src/sparseml/pytorch/nn/se.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ class SqueezeExcite(Module):
     :param expanded_channels: the number of channels to expand to in the SE layer
     :param squeezed_channels: the number of channels to squeeze down to in the SE layer
     :param act_type: the activation type to use in the SE layer; options:
-        [relu, relu6, prelu, lrelu, swish]
+        [relu, relu6, prelu, lrelu, swish, silu]
     """
 
     def __init__(
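
With silu accepted, the SE block can be constructed with the activation the EfficientNet blocks now pass in. A minimal sketch using the parameter names from the docstring above; the channel counts are illustrative only:

from sparseml.pytorch.nn import SqueezeExcite

# e.g. a block with 96 expanded channels squeezed down to 4
se = SqueezeExcite(expanded_channels=96, squeezed_channels=4, act_type="silu")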
