
Commit 01c86ed

option to convert PyTorch QAT graphs to fully quantized on ONNX export (#107)
1 parent f366825
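In short: `ModuleExporter.export_onnx` gains a `convert_qat` flag that, when quantization aware training is detected in the exported module, rewrites the fake-quantized ONNX graph into fully quantized operators via `quantize_torch_qat_export`. A minimal sketch of the new call pattern; the stand-in model, shapes, and paths below are illustrative, not part of this commit:

import torch
from sparseml.pytorch.utils import ModuleExporter

# stand-in module; in practice this would be a network prepared for
# quantization aware training (e.g. via SparseML's QuantizationModifier).
# The QAT -> quantized conversion only runs when QAT is detected.
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU())

exporter = ModuleExporter(model, output_dir="export")
exporter.export_pytorch(name="model.pth")
exporter.export_onnx(
    torch.randn(1, 3, 224, 224),  # sample batch used to trace the graph
    name="model.onnx",
    convert_qat=True,  # rewrite fake-quantized QAT ops to fully quantized ones
)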

File tree: 12 files changed, +30 -15 lines

examples/pytorch_sparse_quantized_transfer_learning/pytorch_sparse_quantized_transfer_learning.ipynb
Lines changed: 2 additions & 7 deletions

@@ -319,7 +319,7 @@
     "\n",
     "Once the model is saved as an ONNX file, it is ready to be used for inference with the DeepSparse Engine.\n",
     "\n",
-    "If exporting the model only to PyTorch for inference, the graph can be converted to fully quantized in PyTorch only using `torch.quantization.convert`, however the resulting model will not be compatible with ONNX conversion."
+    "Normally, exporting a QAT model from PyTorch to ONNX will create a graph with \"fake quantized\" operations that represent the QAT graph. By setting `convert_qat=True` in our exporter, a function will automatically be called to convert this exported model to a fully quantized graph with the desired quantized structure."
     ]
    },
    {
@@ -330,7 +330,6 @@
     "source": [
     "import os\n",
     "from sparseml.pytorch.utils import ModuleExporter\n",
-    "from sparseml.pytorch.optim.quantization import quantize_torch_qat_export\n",
     "\n",
     "save_dir = \"pytorch_sparse_quantized_transfer_learning\"\n",
     "qat_onnx_graph_name = \"resnet50_imagenette_pruned_qat.onnx\"\n",
@@ -339,13 +338,9 @@
     "exporter = ModuleExporter(model, output_dir=save_dir)\n",
     "exporter.export_pytorch(name=\"resnet50_imagenette_pruned_qat.pth\")\n",
     "exporter.export_onnx(\n",
-    "    torch.randn(1, 3, 224, 224), name=qat_onnx_graph_name\n",
+    "    torch.randn(1, 3, 224, 224), name=qat_onnx_graph_name, convert_qat=True\n",
     ")\n",
     "\n",
-    "\n",
-    "# convert QAT graph to fully quantized operators\n",
-    "quantize_torch_qat_export(os.path.join(save_dir, qat_onnx_graph_name), output_file_path=quantized_onnx_path)\n",
-    "\n",
     "print(f\"Sparse-Quantized ONNX model saved to {quantized_onnx_path}\")"
     ]
    },
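To verify what the conversion did, the exported file can be inspected with the `onnx` package: a plain QAT export is full of QuantizeLinear/DequantizeLinear ("fake quantize") pairs, while the converted graph should carry quantized operators instead. A small inspection sketch; the file path is illustrative:

from collections import Counter

import onnx

# count operator types in the exported graph; before conversion expect many
# QuantizeLinear/DequantizeLinear pairs, after convert_qat=True expect
# quantized operators (e.g. QLinearConv / QLinearMatMul / integer ops) instead
model = onnx.load("pytorch_sparse_quantized_transfer_learning/resnet50_imagenette_pruned_qat.onnx")
print(Counter(node.op_type for node in model.graph.node).most_common())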

integrations/pytorch-torchvision/main.py
Lines changed: 1 addition & 1 deletion

@@ -434,7 +434,7 @@ def main(args):
     ########################
     exporter = ModuleExporter(model, save_dir)
     sample_input = torch.randn(image_shape).unsqueeze(0)  # sample batch for ONNX export
-    exporter.export_onnx(sample_input)
+    exporter.export_onnx(sample_input, convert_qat=True)
     exporter.export_pytorch()
     print("Model ONNX export and PyTorch weights saved to {}".format(save_dir))

integrations/timm/train.py
Lines changed: 4 additions & 1 deletion

@@ -696,7 +696,10 @@ def main():
         f"training complete, exporting ONNX to {output_dir}/model.onnx"
     )
     exporter = ModuleExporter(model, output_dir)
-    exporter.export_onnx(torch.randn((1, *data_config["input_size"])))
+    exporter.export_onnx(
+        torch.randn((1, *data_config["input_size"])),
+        convert_qat=True
+    )
     #################################################################################
     # End SparseML ONNX Export
     #################################################################################

integrations/ultralytics/train.py
Lines changed: 1 addition & 1 deletion

@@ -527,7 +527,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
     )
     model.model[-1].export = True  # do not export grid post-processing
     exporter = ModuleExporter(model, save_dir)
-    exporter.export_onnx(torch.randn((1, 3, *imgsz)))
+    exporter.export_onnx(torch.randn((1, 3, *imgsz)), convert_qat=True)
     #################################################################################
     # End SparseML ONNX Export
     #################################################################################

scripts/pytorch_vision.py
Lines changed: 10 additions & 3 deletions

@@ -1042,6 +1042,7 @@ def _save_model_training(
     save_dir: str,
     epoch: int,
     val_res: Union[ModuleRunResults, None],
+    convert_qat: bool = False,
 ):
     LOGGER.info(
         "Saving model for epoch {} and val_loss {} to {} for {}".format(
@@ -1050,7 +1051,11 @@ def _save_model_training(
     )
     exporter = ModuleExporter(model, save_dir)
     exporter.export_pytorch(optim, epoch, "{}.pth".format(save_name))
-    exporter.export_onnx(torch.randn(1, *input_shape), "{}.onnx".format(save_name))
+    exporter.export_onnx(
+        torch.randn(1, *input_shape),
+        "{}.onnx".format(save_name),
+        convert_qat=convert_qat,
+    )

     info_path = os.path.join(save_dir, "{}.txt".format(save_name))

@@ -1185,8 +1190,10 @@ def train(args, model, train_loader, val_loader, input_shape, save_dir, loggers)
     # export the final model
     LOGGER.info("completed...")
     if args.is_main_process:
+        # only convert qat -> quantized ONNX graph for finalized model
+        # TODO: change this to all checkpoints when conversion times improve
         _save_model_training(
-            model, optim, input_shape, "model", save_dir, epoch, val_res
+            model, optim, input_shape, "model", save_dir, epoch, val_res, True
         )

     LOGGER.info("layer sparsities:")
@@ -1222,7 +1229,7 @@ def export(args, model, val_loader, save_dir):
     if not onnx_exported:
         # export onnx file using first sample for graph freezing
         LOGGER.info("exporting onnx in {}".format(save_dir))
-        exporter.export_onnx(data[0], opset=args.onnx_opset)
+        exporter.export_onnx(data[0], opset=args.onnx_opset, convert_qat=True)
         onnx_exported = True

     if args.num_samples > 0:

src/sparseml/pytorch/optim/modifier_quantization.py
Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@

 from sparseml.optim import ModifierProp
 from sparseml.pytorch.optim.modifier import PyTorchModifierYAML, ScheduledModifier
-from sparseml.pytorch.optim.quantization import (
+from sparseml.pytorch.utils.quantization import (
     add_quant_dequant,
     fuse_module_conv_bn_relus,
     get_qat_qconfig,
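Downstream code that imported from the old location needs the same one-line update as this modifier; the public names are unchanged, only the package moved. A sketch of the migration:

# before this commit:
# from sparseml.pytorch.optim.quantization import quantize_torch_qat_export

# after this commit, the quantization utilities live under utils:
from sparseml.pytorch.utils.quantization import quantize_torch_qat_export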

src/sparseml/pytorch/utils/exporter.py
Lines changed: 10 additions & 0 deletions

@@ -41,6 +41,7 @@
     script_model,
     trace_model,
 )
+from sparseml.pytorch.utils.quantization import quantize_torch_qat_export
 from sparseml.utils import clean_path, create_parent_dirs

@@ -156,6 +157,7 @@ def export_onnx(
         name: str = "model.onnx",
         opset: int = DEFAULT_ONNX_OPSET,
         disable_bn_fusing: bool = True,
+        convert_qat: bool = False,
     ):
         """
         Export an onnx file for the current module and for a sample batch.
@@ -173,6 +175,10 @@
             sensitivity analyses of the exported graph. Additionally, the DeepSparse
             inference engine, and other engines, perform batch norm fusing at model
             compilation.
+        :param convert_qat: if True and quantization aware training is detected in
+            the module being exported, the resulting QAT ONNX model will be converted
+            to a fully quantized ONNX model using `quantize_torch_qat_export`. Default
+            is False.
         """
         sample_batch = tensors_to_device(sample_batch, "cpu")
         onnx_path = os.path.join(self._output_dir, name)
@@ -241,6 +247,10 @@
         _delete_trivial_onnx_adds(onnx_model)
         onnx.save(onnx_model, onnx_path)

+        if convert_qat and is_quant_module:
+            # overwrite exported model with fully quantized version
+            quantize_torch_qat_export(model=onnx_path, output_file_path=onnx_path)
+
     def export_torchscript(
         self,
         name: str = "model.pts",
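For ONNX files that were exported before this change (or with `convert_qat=False`), the same conversion can still be applied after the fact; this mirrors the call the exporter now makes internally. The paths here are illustrative:

from sparseml.pytorch.utils.quantization import quantize_torch_qat_export

# convert a previously exported fake-quantized QAT ONNX graph into a fully
# quantized one; writing to a separate path keeps the original QAT file
quantize_torch_qat_export(
    model="export/model_qat.onnx",
    output_file_path="export/model_quantized.onnx",
)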
Two additional files renamed without changes.

src/sparseml/pytorch/optim/quantization/quantize_qat_export.py renamed to src/sparseml/pytorch/utils/quantization/quantize_qat_export.py

File renamed without changes.
