This repository was archived by the owner on Jun 3, 2025. It is now read-only.

Commit 842b1e1
quantized inputs optimization for qat exports (#110)
* quantized inputs optimization for qat exports
* raise exception if no optim made
* logging and unit test
* negative test
1 parent d471c19 commit 842b1e1
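
For orientation, a minimal sketch (not part of the commit itself) of how the new helper introduced here might be invoked after a QAT training run; the model path is hypothetical:

# hypothetical usage; "model.onnx" is an assumed path to a QAT-exported model
from sparseml.pytorch.utils.quantization import skip_onnx_input_quantize

# rewrites the model so its FP32 graph input becomes uint8 and the leading
# QuantizeLinear node(s) are removed; raises RuntimeError if no change is made
skip_onnx_input_quantize("model.onnx", "model.onnx")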

File tree

4 files changed: +259 −3 lines changed


integrations/ultralytics/train.py

Lines changed: 5 additions & 1 deletion
@@ -521,13 +521,17 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
     # Start SparseML ONNX Export
     #################################################################################
     from sparseml.pytorch.utils import ModuleExporter
+    from sparseml.pytorch.utils.quantization import skip_onnx_input_quantize
 
+    onnx_path = f"{save_dir}/model.onnx"
     logger.info(
-        f"training complete, exporting ONNX to {save_dir}/model.onnx"
+        f"training complete, exporting ONNX to {onnx_path}"
     )
     model.model[-1].export = True  # do not export grid post-processing
     exporter = ModuleExporter(model, save_dir)
     exporter.export_onnx(torch.randn((1, 3, *imgsz)), convert_qat=True)
+    if qat:
+        skip_onnx_input_quantize(onnx_path, onnx_path)
     #################################################################################
     # End SparseML ONNX Export
     #################################################################################
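
Net effect: the export path is computed once up front, and when the run used quantization-aware training (the qat flag), the saved ONNX file is rewritten in place so it accepts quantized inputs directly.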

src/sparseml/onnx/utils/graph_editor.py

Lines changed: 66 additions & 0 deletions
@@ -136,12 +136,78 @@ def update_node_input(
         node.input.append(input_id)
         self._input_id_to_nodes[input_id].append(node)
 
+    def delete_node(self, node: NodeProto):
+        """
+        deletes the given node from the graph
+
+        :param node: node to delete
+        """
+        self._model.graph.node.remove(node)
+        self._delete_node_edges(node)
+
+    def delete_nodes(self, nodes: List[NodeProto]):
+        """
+        deletes the given nodes from the graph
+        :param nodes: list of nodes to delete
+        """
+        node_output_ids_to_delete = {node.output[0] for node in nodes}
+        nodes_to_keep = []
+        for node in self._model.graph.node:
+            if node.output[0] in node_output_ids_to_delete:
+                self._delete_node_edges(node)
+            else:
+                nodes_to_keep.append(node)
+        self._model.graph.ClearField("node")
+        self._model.graph.node.extend(nodes_to_keep)
+
+    def delete_initializers(self, initializers: List[Union[str, TensorProto]]):
+        """
+        deletes the given initializers from the model
+
+        :param initializers: list of initializers or initializer names to delete
+        """
+        inits_to_delete = {
+            init if isinstance(init, str) else init.name for init in initializers
+        }
+        inits_to_keep = []
+        for init in self._model.graph.initializer:
+            if init.name in inits_to_delete:
+                # keep the edge reference if nodes in the graph still point to
+                # the initializer name
+                if not self._input_id_to_nodes[init.name]:
+                    del self._input_id_to_nodes[init.name]
+                del self._name_to_initializer[init.name]
+            else:
+                inits_to_keep.append(init)
+        self._model.graph.ClearField("initializer")
+        self._model.graph.initializer.extend(inits_to_keep)
+
+    def delete_unused_initializers(self):
+        """
+        deletes tensors in the initializer list that are not listed as inputs
+        to any node in the current graph state
+        """
+        self.delete_initializers(
+            [
+                init
+                for init in self._model.graph.initializer
+                if not self._input_id_to_nodes[init.name]
+            ]
+        )  # delete inits that have no edge
+
     def _store_node_edges(self, node: NodeProto):
         for output_id in node.output:
             self._output_id_to_node[output_id] = node
         for input_id in node.input:
             self._input_id_to_nodes[input_id].append(node)
 
+    def _delete_node_edges(self, node: NodeProto):
+        # remove node edges from cache
+        for output_id in node.output:
+            del self._output_id_to_node[output_id]
+        for input_id in node.input:
+            self._input_id_to_nodes[input_id].remove(node)
+
 
 def update_model_param(
     model: ModelProto,
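
To see the new deletion helpers working together, here is a sketch (not from the commit) of a single dead-node-elimination pass; it assumes ONNXGraph is importable from sparseml.onnx.utils, as the changes to quantize_qat_export.py below do, and the model path is made up:

# sketch: one pass of dead-node elimination using the new helpers
import onnx
from sparseml.onnx.utils import ONNXGraph

model = onnx.load("model.onnx")  # assumed path
graph = ONNXGraph(model)

# a node is dead if none of its outputs feed another node or a graph output
consumed_ids = {inp for node in model.graph.node for inp in node.input}
graph_output_ids = {out.name for out in model.graph.output}
dead_nodes = [
    node
    for node in model.graph.node
    if not any(
        out in consumed_ids or out in graph_output_ids for out in node.output
    )
]

graph.delete_nodes(dead_nodes)  # removes the nodes and their cached edges
graph.delete_unused_initializers()  # then prunes any now-dangling weights
onnx.save(model, "model.onnx")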

src/sparseml/pytorch/utils/quantization/quantize_qat_export.py

Lines changed: 90 additions & 2 deletions
@@ -18,15 +18,17 @@
 """
 
 
+import logging
 from collections import defaultdict
 from copy import deepcopy
-from typing import Any, NamedTuple, Union
+from typing import Any, NamedTuple, Optional, Union
 
 import numpy
 import onnx
 from onnx import ModelProto, NodeProto, numpy_helper
 
 from sparseml.onnx.utils import (
+    ONNXGraph,
     get_batch_norm_params,
     get_init_by_name,
     get_node_attributes,
@@ -40,7 +42,15 @@
 )
 
 
-__all__ = ["get_quantization_params", "QuantizationParams", "quantize_torch_qat_export"]
+__all__ = [
+    "get_quantization_params",
+    "QuantizationParams",
+    "quantize_torch_qat_export",
+    "skip_onnx_input_quantize",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
 
 
 """
@@ -593,3 +603,81 @@ def quantize_torch_qat_export(
     onnx.save(model, output_file_path)
 
     return model
+
+
+def _skip_input_quantize(model: ModelProto) -> Optional[str]:
+    if (
+        len(model.graph.input) != 1
+        or model.graph.input[0].type.tensor_type.elem_type != 1
+    ):
+        # more than 1 input or input is not FP32
+        return (
+            "Not modifying ONNX graph inputs - either graph has more than one "
+            "input or input type is not FP32"
+        )
+
+    input_node = model.graph.input[0]
+    input_children = [
+        node for node in model.graph.node if input_node.name in node.input
+    ]
+    if not all(node.op_type == "QuantizeLinear" for node in input_children):
+        return (
+            "Not modifying ONNX graph inputs - only QuantizeLinear nodes may "
+            "follow the FP32 input tensor in the original graph, prior to "
+            "converting to uint8"
+        )
+
+    graph = ONNXGraph(model)
+    for quantize_node in input_children:
+        quantize_children = graph.get_node_children(quantize_node)
+        quantize_node_id = quantize_node.output[0]
+        for child_node in quantize_children:
+            input_idx = [
+                idx
+                for idx, inp in enumerate(child_node.input)
+                if inp == quantize_node_id
+            ]
+            if not input_idx:
+                continue
+            input_idx = input_idx[0]
+            graph.update_node_input(child_node, input_node.name, input_idx)
+            _LOGGER.debug(
+                f"set node with output id {child_node.output[0]} as initial node "
+                "in graph"
+            )
+
+    _LOGGER.debug(
+        "deleting QuantizeLinear node(s) with output id(s): "
+        f"{[n.output for n in input_children]}"
+    )
+    graph.delete_nodes(input_children)  # only contains references to the Quantize nodes
+    graph.delete_unused_initializers()  # cleanup
+    input_node.type.tensor_type.elem_type = 2  # fp32 -> uint8
+    _LOGGER.info("Model initial QuantizeLinear node(s) deleted and inputs set to uint8")
+
+    return None
+
+
+def skip_onnx_input_quantize(
+    model: Union[ModelProto, str],
+    output_file_path: Union[str, None] = None,
+):
+    """
+    If the given model has a single FP32 input that feeds into a QuantizeLinear
+    node, then the input will be changed to uint8 and the QuantizeLinear node
+    will be deleted. This enables quantized graphs to take quantized inputs
+    instead of floats.
+
+    If no optimization is made, a RuntimeError will be raised.
+
+    :param model: The model to convert, or a file path to it
+    :param output_file_path: File path to save the converted model to
+    """
+    if isinstance(model, str):
+        model = onnx.load(model)
+
+    optim_error_message = _skip_input_quantize(model)
+
+    if optim_error_message:
+        raise RuntimeError(optim_error_message)
+
+    if output_file_path:
+        onnx.save(model, output_file_path)
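
A side note on the bare integer codes above: elem_type 1 and 2 are the FLOAT and UINT8 members of ONNX's TensorProto.DataType enum, so the input check and the final assignment could equivalently use named constants:

# the magic numbers in _skip_input_quantize map to TensorProto enum members
from onnx import TensorProto

assert TensorProto.FLOAT == 1  # the elem_type required before the rewrite
assert TensorProto.UINT8 == 2  # the elem_type assigned after the rewrite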
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import onnx
+import pytest
+from onnx import TensorProto
+
+from sparseml.pytorch.utils.quantization import skip_onnx_input_quantize
+
+
+def test_skip_onnx_input_quantize():
+    # make sample graph of fp32 input -> QuantizeLinear -> QLinearConv
+    # verify that it is transformed to uint8 input -> QLinearConv
+
+    float_input = onnx.helper.make_tensor_value_info(
+        "input", TensorProto.FLOAT, [1, 3, None, None]
+    )
+    quant_node = onnx.helper.make_node(
+        "QuantizeLinear",
+        ["input", "scale", "zp"],
+        ["quant_output"],
+    )
+    qconv_node = onnx.helper.make_node(
+        "QLinearConv",
+        ["quant_output", "scale", "zp", "w", "w_scale", "w_zp", "y_scale", "y_zp"],
+        ["qconv_output"],
+    )
+
+    qconv_output = onnx.helper.make_tensor_value_info(
+        "qconv_output", TensorProto.UINT8, [1, 1, None, None]
+    )
+
+    graph = onnx.helper.make_graph(
+        [quant_node, qconv_node],
+        "test_graph",
+        [float_input],
+        [qconv_output],
+        [],
+    )
+    model = onnx.helper.make_model(graph)
+
+    # initial model checks
+    assert model.graph.input[0].type.tensor_type.elem_type == TensorProto.FLOAT
+    assert len(model.graph.node) == 2
+    assert model.graph.node[0].op_type == "QuantizeLinear"
+    assert model.graph.node[1].op_type == "QLinearConv"
+
+    assert model.graph.node[0].input[0] == model.graph.input[0].name
+    assert model.graph.node[1].input[0] == model.graph.node[0].output[0]
+
+    # run optimization
+    skip_onnx_input_quantize(model)
+
+    # check model has uint8 inputs and no qlinear input node
+    assert model.graph.input[0].type.tensor_type.elem_type == TensorProto.UINT8
+    assert len(model.graph.node) == 1
+    assert model.graph.node[0].op_type == "QLinearConv"
+
+    assert model.graph.node[0].input[0] == model.graph.input[0].name
+
+
+def test_skip_onnx_input_quantize_expected_exception():
+    # test that a graph with already quantized inputs fails for this optimization
+
+    int_input = onnx.helper.make_tensor_value_info(
+        "input", TensorProto.UINT8, [1, 3, None, None]
+    )
+    qconv_node = onnx.helper.make_node(
+        "QLinearConv",
+        ["input", "scale", "zp", "w", "w_scale", "w_zp", "y_scale", "y_zp"],
+        ["qconv_output"],
+    )
+
+    qconv_output = onnx.helper.make_tensor_value_info(
+        "qconv_output", TensorProto.UINT8, [1, 1, None, None]
+    )
+
+    graph = onnx.helper.make_graph(
+        [qconv_node],
+        "test_graph",
+        [int_input],
+        [qconv_output],
+        [],
+    )
+    model = onnx.helper.make_model(graph)
+    with pytest.raises(RuntimeError):
+        skip_onnx_input_quantize(model)
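
For completeness, a sketch of consuming a converted model at inference time (onnxruntime is not used by this commit, and the path and input shape are assumptions); the point is that the session now takes uint8 arrays directly:

# illustrative only: run the optimized model on quantized (uint8) inputs
import numpy
import onnxruntime

session = onnxruntime.InferenceSession(
    "model.onnx", providers=["CPUExecutionProvider"]  # assumed path
)
input_name = session.get_inputs()[0].name
uint8_batch = numpy.zeros((1, 3, 640, 640), dtype=numpy.uint8)  # assumed shape
outputs = session.run(None, {input_name: uint8_batch})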
