This repository was archived by the owner on Jun 3, 2025. It is now read-only.

Commit bafd6be

Authored by bfineran, natuan, dependabot[bot], and markurtz
[cherry-pick] syncing release 0.10 with sparseml updates 1/15/21 (#523)
* update transformers install to point to tarball + small fixes (#519)
  * update transformers install to point to tarball + small fixes
  * restrict datasets to < 1.18.0
  * use subprocess call instead of pip API
* Option to skip FC activations QAT in quantization modifier (#518)
  Co-authored-by: natuan <atnhan@gmail.com>
* update transformers release version (#520)
  * update transformers release version
  * quality
* Bump ipython in /research/information_retrieval/doc2query (#515)
  Bumps [ipython](https://github.com/ipython/ipython) from 7.24.1 to 7.31.1.
  - [Release notes](https://github.com/ipython/ipython/releases)
  - [Commits](ipython/ipython@7.24.1...7.31.1)
  updated-dependencies:
  - dependency-name: ipython
    dependency-type: direct:production
  Signed-off-by: dependabot[bot] <support@github.com>
* Update quality check github actions 'on' triggers (#521)
* update transformers install to point to wheel file (#522)

Co-authored-by: natuan <atnhan@gmail.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Mark Kurtz <mark@neuralmagic.com>
1 parent 1dcbeed commit bafd6be

File tree

9 files changed: +159, -49 lines changed


.github/workflows/quality-check.yaml

Lines changed: 9 additions & 1 deletion
@@ -1,5 +1,13 @@
 name: Quality Checks
-on: [push]
+on:
+  push:
+    branches:
+      - main
+      - 'release/*'
+  pull_request:
+    branches:
+      - main
+      - 'release/*'
 jobs:
   quality-check:
     runs-on: ubuntu-latest

research/information_retrieval/doc2query/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ huggingface-hub==0.0.8
 idna==2.10
 imageio==2.9.0
 ipykernel==5.5.5
-ipython==7.24.1
+ipython==7.31.1
 ipython-genutils==0.2.0
 ipywidgets==7.6.3
 jedi==0.18.0

src/sparseml/pytorch/optim/modifier_quantization.py

Lines changed: 22 additions & 0 deletions
@@ -42,6 +42,7 @@
     fuse_module_conv_bn_relus,
     get_qat_qconfig,
     prepare_embeddings_qat,
+    remove_activation_qat_by_layer_name,
 )
 
 
@@ -93,6 +94,11 @@ class QuantizationModifier(ScheduledModifier):
     :param reduce_range: if True, the quantization range will be reduced by one bit.
         This may prevent overflow issues with model execution on certain hardware
         Default is False
+    :param quantize_linear_activations: if False, FakeQuantize ops will not be run
+        for activations of fully connected layers. this is important for quantizing
+        transformer based models such as BERT where the quantized MatMul outputs
+        are kept at 32 bits of precision and fake quantizing the outputs harm training
+        recovery. Default is True
     """
 
     def __init__(
@@ -106,6 +112,7 @@ def __init__(
         model_fuse_fn_kwargs: Dict[str, Any] = None,
         quantize_embeddings: bool = True,
         reduce_range: bool = False,
+        quantize_linear_activations: bool = True,
     ):
         if torch_quantization is None or torch_intrinsic is None:
             raise RuntimeError(
@@ -129,6 +136,7 @@ def __init__(
         self._freeze_bn_stats_epoch = freeze_bn_stats_epoch
         self._quantize_embeddings = quantize_embeddings
         self._reduce_range = reduce_range
+        self._quantize_linear_activations = quantize_linear_activations
 
         self._modules_to_quantize = None
         self._qat_enabled = False
@@ -251,6 +259,17 @@ def reduce_range(self) -> bool:
         """
         return self._reduce_range
 
+    @ModifierProp()
+    def quantize_linear_activations(self) -> bool:
+        """
+        :return: if False, FakeQuantize ops will not be run
+            for activations of fully connected layers. this is important for quantizing
+            transformer based models such as BERT where the quantized MatMul outputs
+            are kept at 32 bits of precision and fake quantizing the outputs harm
+            training recovery
+        """
+        return self._quantize_linear_activations
+
     def initialize(
         self,
         module: Module,
@@ -393,6 +412,9 @@ def _enable_module_qat(self, module: Module):
 
             add_quant_dequant(quant_module, name, module)
 
+            if not self._quantize_linear_activations:
+                remove_activation_qat_by_layer_name(quant_module, ["Linear"])
+
         # set modules with proper qconfigs to QAT mode
         torch_quantization.prepare_qat(module, inplace=True)
         if self._quantize_embeddings:
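
For reference, a minimal sketch of how the new flag might be used when constructing the modifier directly in Python; the start_epoch value and the surrounding setup are illustrative assumptions, not part of this commit:

from sparseml.pytorch.optim import QuantizationModifier

# Hypothetical usage: skip FakeQuantize on fully connected (Linear) activations
# so the quantized MatMul outputs of a BERT-style model stay at 32-bit precision.
quant_modifier = QuantizationModifier(
    start_epoch=0.0,  # assumed value for illustration
    quantize_embeddings=True,
    reduce_range=False,
    quantize_linear_activations=False,  # flag added in this commit
)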

src/sparseml/pytorch/utils/quantization/helpers.py

Lines changed: 20 additions & 0 deletions
@@ -38,6 +38,7 @@
     "configure_module_qat_wrappers",
     "configure_module_default_qconfigs",
     "add_quant_dequant",
+    "remove_activation_qat_by_layer_name",
     "get_qat_qconfig",
     "fuse_module_conv_bn_relus",
     "prepare_embeddings_qat",
@@ -348,6 +349,25 @@ def add_quant_dequant(module, name=None, parent_module=None):
     return module
 
 
+def remove_activation_qat_by_layer_name(module: Module, layer_class_names: List[str]):
+    """
+    Disables fake quantization of activations for all submodules of the given module
+    with class name layer_class_names
+
+    :param module: module to remove activation fake quantization for certain layers
+    :param layer_class_names: list of layer class names that should be affected.
+        e.x. ["Linear"]
+    """
+    for submodule in module.modules():
+        if submodule.__class__.__name__ in layer_class_names and (
+            hasattr(submodule, "qconfig")
+        ):
+            submodule.qconfig = torch_quantization.QConfig(
+                activation=torch.nn.Identity,
+                weight=submodule.qconfig.weight,
+            )
+
+
 def get_qat_qconfig(
     symmetric_activations: bool = False,
     symmetric_weights: bool = True,
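
A rough usage sketch of the new helper on a toy module follows; the import path mirrors the file location above, and the qconfig setup is an assumption made for illustration:

import torch
from torch import nn
from sparseml.pytorch.utils.quantization import remove_activation_qat_by_layer_name

# Toy model whose Linear layers would normally get activation fake quantization
model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
model.qconfig = torch.quantization.get_default_qat_qconfig("fbgemm")
torch.quantization.propagate_qconfig_(model)

# Keep weight fake quantization but pass Linear activations through nn.Identity
remove_activation_qat_by_layer_name(model, ["Linear"])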

src/sparseml/pytorch/utils/quantization/quantize_qat_export.py

Lines changed: 34 additions & 11 deletions
@@ -667,9 +667,9 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
     |     |
     |    Add (with constant bias)
     |     |
-    |  QuantizeLinear
+    |  QuantizeLinear (Optional)
     |     |
-    |  DequantizeLinear
+    |  DequantizeLinear (Optional)
     |     |
     |  OUTPUT
     | We end up converting to:
@@ -718,19 +718,26 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
         bias_add_node = graph.get_node_single_child(matmul_node)
         if not bias_add_node or bias_add_node.op_type != "Add":
             continue
+
+        # Optionally find output QDQ block which will be deleted
         output_quantize_node = graph.get_node_single_child(bias_add_node)
         if (
             not output_quantize_node
             or output_quantize_node.op_type not in _QUANTIZE_OP_NAMES
         ):
-            continue
+            output_quantize_node = None
 
-        output_dequantize_node = graph.get_node_single_child(output_quantize_node)
+        output_dequantize_node = (
+            graph.get_node_single_child(output_quantize_node)
+            if output_quantize_node
+            else None
+        )
         if (
             not output_dequantize_node
             or output_dequantize_node.op_type not in _QUANTIZE_OP_NAMES
         ):
-            continue
+            output_quantize_node = None
+            output_dequantize_node = None
 
         input_quantize_params = get_quantization_params(
             model, input_quantize_node, include_target=False
@@ -743,7 +750,7 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
             continue
         if input_quantize_node.op_type != "DequantizeLinear":
             continue
-        if output_quantize_node.op_type != "QuantizeLinear":
+        if output_quantize_node and output_quantize_node.op_type != "QuantizeLinear":
             continue
         bias_initializer = get_init_by_name(model, bias_add_node.input[1]) or (
             get_init_by_name(model, bias_add_node.input[0])
@@ -822,8 +829,13 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
             matmul_integer_output,  # MatMul integer outputs (INT32)
             quantized_bias_name,  # Quantized bias (INT32)
         ]
-        quant_add_output = output_quantize_node.output[0]
+
         quant_add_name = "{}_quant".format(bias_add_node.name)
+        quant_add_output = (
+            output_quantize_node.output[0]
+            if output_quantize_node
+            else f"{quant_add_name}_output"
+        )
 
         # create Add node and add it to graph
         qadd_node = onnx.helper.make_node(
@@ -852,10 +864,15 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
             quantized_bias_scale_name,  # b -> rescale factor
         ]
         mul_node_name = "{}_rescale_mul".format(bias_add_node.name)
+        mul_node_output = (
+            output_dequantize_node.output[0]
+            if output_dequantize_node
+            else bias_add_node.output[0]
+        )
         mul_node = onnx.helper.make_node(
             "Mul",
             mul_node_inputs,
-            [output_dequantize_node.output[0]],
+            [mul_node_output],
             mul_node_name,
         )
         model.graph.node.append(mul_node)
@@ -865,9 +882,15 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
         delete_quant_node(model, weight_dequantize_node, keep_params=False)
         delete_quant_node(model, weight_quantize_node, keep_params=True)
         remove_node_and_params_from_graph(model, weight_transpose_node)
-        delete_quant_node(model, input_quantize_node, keep_params=True)
-        delete_quant_node(model, output_quantize_node, keep_params=True)
-        delete_quant_node(model, output_dequantize_node, keep_params=True)
+
+        # only delete input node if the matmul is the only child
+        current_graph = ONNXGraph(model)
+        if len(current_graph.get_node_children(input_quantize_node)) == 1:
+            delete_quant_node(model, input_quantize_node, keep_params=True)
+        if output_quantize_node:
+            delete_quant_node(model, output_quantize_node, keep_params=True)
+        if output_dequantize_node:
+            delete_quant_node(model, output_dequantize_node, keep_params=True)
 
         # delete original Gemm node
         remove_node_and_params_from_graph(model, matmul_node, keep_params=None)
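
To illustrate the new "Optional" QDQ handling in isolation, here is a toy sketch using plain onnx that mirrors the fallback naming logic above; the graph, tensor names, and shapes are invented for the example and do not come from this repository:

import onnx
from onnx import TensorProto, helper

# Toy MatMul -> Add graph with no trailing QuantizeLinear/DequantizeLinear pair,
# i.e. the "Optional" QDQ case this commit now tolerates.
matmul = helper.make_node("MatMul", ["x", "w"], ["mm_out"], name="mm")
bias_add = helper.make_node("Add", ["mm_out", "bias"], ["y"], name="bias_add")
graph = helper.make_graph(
    [matmul, bias_add],
    "toy",
    [
        helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 4]),
        helper.make_tensor_value_info("w", TensorProto.FLOAT, [4, 4]),
        helper.make_tensor_value_info("bias", TensorProto.FLOAT, [4]),
    ],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 4])],
)
model = helper.make_model(graph)

# Mirror the fallback: if the Add has no QuantizeLinear child, the converted
# quantized Add gets a synthesized output name instead of reusing the QDQ output.
children = [n for n in model.graph.node if bias_add.output[0] in n.input]
output_quantize_node = (
    children[0] if children and children[0].op_type == "QuantizeLinear" else None
)
quant_add_name = "{}_quant".format(bias_add.name)
quant_add_output = (
    output_quantize_node.output[0]
    if output_quantize_node
    else f"{quant_add_name}_output"
)
print(quant_add_output)  # bias_add_quant_output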

src/sparseml/transformers/__init__.py

Lines changed: 29 additions & 18 deletions
@@ -19,40 +19,49 @@
 # flake8: noqa
 
 import logging as _logging
+import sys
 
 
 try:
     import transformers as _transformers
 
+    # triggers error if neuralmagic/transformers is not installed
+    _transformers.models.bert.modeling_bert.QATMatMul
     _transformers_import_error = None
 except Exception as _transformers_import_err:
     _transformers_import_error = _transformers_import_err
 
 
 _LOGGER = _logging.getLogger(__name__)
+_NM_TRANSFORMERS_TAR_TEMPLATE = (
+    "https://github.com/neuralmagic/transformers/releases/download/"
+    "{version}/transformers-4.7.0.dev0-py3-none-any.whl"
+)
+_NM_TRANSFORMERS_NIGHTLY = _NM_TRANSFORMERS_TAR_TEMPLATE.format(version="nightly")
 
 
 def _install_transformers_and_deps():
 
-    import pip as _pip
+    import subprocess as _subprocess
+    import sys as _sys
+
     import sparseml as _sparseml
 
-    transformers_branch = (
-        "master"
-        if not _sparseml.is_release
-        else f"release/{_sparseml.version_major_minor}"
+    nm_transformers_release = (
+        "nightly" if not _sparseml.is_release else f"v{_sparseml.version_major_minor}"
    )
-    transformers_requirement = (
-        "transformers @ git+https://github.com/neuralmagic/transformers.git"
-        f"@{transformers_branch}"
+    transformers_requirement = _NM_TRANSFORMERS_TAR_TEMPLATE.format(
+        version=nm_transformers_release
    )
-
     try:
-        _pip.main(
+        _subprocess.check_call(
             [
+                sys.executable,
+                "-m",
+                "pip",
                 "install",
                 transformers_requirement,
-                "datasets",
+                "datasets<1.18.0",
                 "sklearn",
                 "seqeval",
             ]
@@ -65,7 +74,7 @@ def _install_transformers_and_deps():
         raise ValueError(
             "Unable to install and import sparseml-transformers dependencies check "
             "that transformers is installed, if not, install via "
-            "`pip install git+https://github.com/neuralmagic/transformers.git`"
+            f"`pip install {_NM_TRANSFORMERS_NIGHTLY}`"
         )
 
 
@@ -75,19 +84,21 @@ def _check_transformers_install():
 
     if os.getenv("NM_NO_AUTOINSTALL_TRANSFORMERS", False):
         _LOGGER.warning(
-            "Unable to import transformers, skipping auto installation "
+            "Unable to import, skipping auto installation "
             "due to NM_NO_AUTOINSTALL_TRANSFORMERS"
         )
         # skip any further checks
         return
     else:
-        _LOGGER.info(
-            "No installation of transformers found. Installing sparseml-transformers "
-            "dependencies"
+        _LOGGER.warning(
+            "sparseml-transformers installation not detected. Installing "
+            "sparseml-transformers dependencies if transformers is already "
+            "installed in the environment, it will be overwritten. Set "
+            "environment variable NM_NO_AUTOINSTALL_TRANSFORMERS to disable"
         )
         _install_transformers_and_deps()
 
-    # check NM fork installed with QATMatMul available
+    # re check import after potential install
     try:
         import transformers as _transformers
 
@@ -97,7 +108,7 @@ def _check_transformers_install():
             "transformers.models.bert.modeling_bert.QATMatMul not availalbe. the"
             "neuralmagic fork of transformers may not be installed. it can be "
             "installed via "
-            "`pip install git+https://github.com/neuralmagic/transformers.git`"
+            f"`pip install {_NM_TRANSFORMERS_NIGHTLY}`"
         )
 
 
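
The move off the internal pip Python API follows the standard subprocess pattern; a minimal standalone sketch of the same idea, with an illustrative package list rather than the exact one above:

import subprocess
import sys

# Install through the pip CLI of the current interpreter rather than the
# unsupported `pip` Python API; raises CalledProcessError if the install fails.
subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "datasets<1.18.0",  # pinned below 1.18.0, matching this commit
        "seqeval",
    ]
)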

src/sparseml/transformers/utils/export.py

Lines changed: 7 additions & 10 deletions
@@ -29,21 +29,21 @@
 
 optional arguments:
   -h, --help            show this help message and exit
-  --task TASK           task to create the model for. i.e. mlm, qa, glue, ner
+  --task TASK           Task to create the model for. i.e. mlm, qa, glue, ner
   --model_path MODEL_PATH
                         Path to directory where model files for weights, config,
                         and tokenizer are stored
   --sequence_length SEQUENCE_LENGTH
                         Sequence length to use. Default is 384. Can be overwritten
                         later
   --convert_qat CONVERT_QAT
-                        Set True to convert QAT graph exports to fully quantized.
-                        Default is True
+                        Set flag to not perform QAT to fully quantized conversion
+                        after export
   --finetuning_task FINETUNING_TASK
                         optional finetuning task for text classification and token
                         classification exports
   --onnx_file_name ONNX_FILE_NAME
-                        name for exported ONNX file in the model directory. Default
+                        Name for exported ONNX file in the model directory. Default
                         and reccomended value for pipeline compatibility is
                         'model.onnx'
 
@@ -207,12 +207,9 @@ def _parse_args() -> argparse.Namespace:
         help="Sequence length to use. Default is 384. Can be overwritten later",
     )
     parser.add_argument(
-        "--convert_qat",
-        type=bool,
-        default=True,
-        help=(
-            "Set True to convert QAT graph exports to fully quantized. Default is True"
-        ),
+        "--no_convert_qat",
+        action="store_false",
+        help=("Set flag to not perform QAT to fully quantized conversion after export"),
     )
     parser.add_argument(
         "--finetuning_task",

src/sparseml/transformers/utils/trainer.py

Lines changed: 5 additions & 3 deletions
@@ -180,10 +180,12 @@ def create_optimizer(self):
         super().create_optimizer()
         if not self.recipe:
             return
-        steps_per_epoch = math.ceil(
-            len(self.train_dataset)
-            / (self.args.per_device_train_batch_size * self.args._n_gpu)
+        total_batch_size = (
+            self.args.per_device_train_batch_size
+            * self.args._n_gpu
+            * self.args.gradient_accumulation_steps
         )
+        steps_per_epoch = math.ceil(len(self.train_dataset) / total_batch_size)
         if hasattr(self, "scaler"):
             self.scaler = self.manager.modify(
                 self.model,
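
A worked example of the corrected steps_per_epoch computation, using assumed training settings purely for illustration:

import math

num_samples = 10000                 # assumed dataset size
per_device_train_batch_size = 8     # assumed
n_gpu = 4                           # assumed
gradient_accumulation_steps = 2     # assumed

total_batch_size = (
    per_device_train_batch_size * n_gpu * gradient_accumulation_steps
)  # 64
steps_per_epoch = math.ceil(num_samples / total_batch_size)  # 157

# The previous formula ignored gradient accumulation and would have scheduled
# math.ceil(10000 / 32) == 313 steps per epoch, roughly twice the number of
# optimizer updates that actually happen.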
