This repository was archived by the owner on Jun 3, 2025. It is now read-only.

Commit bafd6be

Authored by bfineran, natuan, dependabot[bot], and markurtz
[cherry-pick] syncing release 0.10 with sparseml updates 1/15/21 (#523)
* update transformers install to point to tarball + small fixes (#519)
  * update transformers install to point to tarball + small fixes
  * restrict datasets to < 1.18.0
  * use subprocess call instead of pip API
* Option to skip FC activations QAT in quantization modifier (#518)
  Co-authored-by: natuan <atnhan@gmail.com>
* update transformers release version (#520)
  * update transformers release version
  * quality
* Bump ipython in /research/information_retrieval/doc2query (#515)
  Bumps [ipython](https://github.com/ipython/ipython) from 7.24.1 to 7.31.1.
  - [Release notes](https://github.com/ipython/ipython/releases)
  - [Commits](ipython/ipython@7.24.1...7.31.1)
  updated-dependencies:
  - dependency-name: ipython
    dependency-type: direct:production
  Signed-off-by: dependabot[bot] <support@github.com>
* Update quality check github actions 'on' triggers (#521)
* update transformers install to point to wheel file (#522)

Co-authored-by: natuan <atnhan@gmail.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Mark Kurtz <mark@neuralmagic.com>
1 parent 1dcbeed commit bafd6be

File tree

9 files changed: +159, -49 lines changed


.github/workflows/quality-check.yaml

Lines changed: 9 additions & 1 deletion
@@ -1,5 +1,13 @@
 name: Quality Checks
-on: [push]
+on:
+  push:
+    branches:
+      - main
+      - 'release/*'
+  pull_request:
+    branches:
+      - main
+      - 'release/*'
 jobs:
   quality-check:
     runs-on: ubuntu-latest

research/information_retrieval/doc2query/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ huggingface-hub==0.0.8
 idna==2.10
 imageio==2.9.0
 ipykernel==5.5.5
-ipython==7.24.1
+ipython==7.31.1
 ipython-genutils==0.2.0
 ipywidgets==7.6.3
 jedi==0.18.0

src/sparseml/pytorch/optim/modifier_quantization.py

Lines changed: 22 additions & 0 deletions
@@ -42,6 +42,7 @@
     fuse_module_conv_bn_relus,
     get_qat_qconfig,
     prepare_embeddings_qat,
+    remove_activation_qat_by_layer_name,
 )
 
 
@@ -93,6 +94,11 @@ class QuantizationModifier(ScheduledModifier):
     :param reduce_range: if True, the quantization range will be reduced by one bit.
         This may prevent overflow issues with model execution on certain hardware
         Default is False
+    :param quantize_linear_activations: if False, FakeQuantize ops will not be run
+        for activations of fully connected layers. this is important for quantizing
+        transformer based models such as BERT where the quantized MatMul outputs
+        are kept at 32 bits of precision and fake quantizing the outputs harm training
+        recovery. Default is True
     """
 
     def __init__(
@@ -106,6 +112,7 @@ def __init__(
         model_fuse_fn_kwargs: Dict[str, Any] = None,
         quantize_embeddings: bool = True,
         reduce_range: bool = False,
+        quantize_linear_activations: bool = True,
     ):
         if torch_quantization is None or torch_intrinsic is None:
             raise RuntimeError(
@@ -129,6 +136,7 @@ def __init__(
         self._freeze_bn_stats_epoch = freeze_bn_stats_epoch
         self._quantize_embeddings = quantize_embeddings
         self._reduce_range = reduce_range
+        self._quantize_linear_activations = quantize_linear_activations
 
         self._modules_to_quantize = None
         self._qat_enabled = False
@@ -251,6 +259,17 @@ def reduce_range(self) -> bool:
         """
         return self._reduce_range
 
+    @ModifierProp()
+    def quantize_linear_activations(self) -> bool:
+        """
+        :return: if False, FakeQuantize ops will not be run
+            for activations of fully connected layers. this is important for quantizing
+            transformer based models such as BERT where the quantized MatMul outputs
+            are kept at 32 bits of precision and fake quantizing the outputs harm
+            training recovery
+        """
+        return self._quantize_linear_activations
+
     def initialize(
         self,
         module: Module,
@@ -393,6 +412,9 @@ def _enable_module_qat(self, module: Module):
 
             add_quant_dequant(quant_module, name, module)
 
+            if not self._quantize_linear_activations:
+                remove_activation_qat_by_layer_name(quant_module, ["Linear"])
+
         # set modules with proper qconfigs to QAT mode
         torch_quantization.prepare_qat(module, inplace=True)
         if self._quantize_embeddings:
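
For reference, a minimal sketch of how the new flag might be used when constructing the modifier directly in Python; the start_epoch value and the surrounding setup are illustrative assumptions, not part of this commit:

from sparseml.pytorch.optim import QuantizationModifier

# Hypothetical usage: skip FakeQuantize on fully connected (Linear) activations
# so the quantized MatMul outputs of a BERT-style model stay at 32-bit precision.
quant_modifier = QuantizationModifier(
    start_epoch=0.0,  # assumed value for illustration
    quantize_embeddings=True,
    reduce_range=False,
    quantize_linear_activations=False,  # flag added in this commit
)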

src/sparseml/pytorch/utils/quantization/helpers.py

Lines changed: 20 additions & 0 deletions
@@ -38,6 +38,7 @@
     "configure_module_qat_wrappers",
     "configure_module_default_qconfigs",
     "add_quant_dequant",
+    "remove_activation_qat_by_layer_name",
     "get_qat_qconfig",
     "fuse_module_conv_bn_relus",
     "prepare_embeddings_qat",
@@ -348,6 +349,25 @@ def add_quant_dequant(module, name=None, parent_module=None):
     return module
 
 
+def remove_activation_qat_by_layer_name(module: Module, layer_class_names: List[str]):
+    """
+    Disables fake quantization of activations for all submodules of the given module
+    with class name layer_class_names
+
+    :param module: module to remove activation fake quantization for certain layers
+    :param layer_class_names: list of layer class names that should be affected.
+        e.x. ["Linear"]
+    """
+    for submodule in module.modules():
+        if submodule.__class__.__name__ in layer_class_names and (
+            hasattr(submodule, "qconfig")
+        ):
+            submodule.qconfig = torch_quantization.QConfig(
+                activation=torch.nn.Identity,
+                weight=submodule.qconfig.weight,
+            )
+
+
 def get_qat_qconfig(
     symmetric_activations: bool = False,
     symmetric_weights: bool = True,
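
A rough usage sketch of the new helper on a toy module follows; the import path mirrors the file location above, and the qconfig setup is an assumption made for illustration:

import torch
from torch import nn
from sparseml.pytorch.utils.quantization import remove_activation_qat_by_layer_name

# Toy model whose Linear layers would normally get activation fake quantization
model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
model.qconfig = torch.quantization.get_default_qat_qconfig("fbgemm")
torch.quantization.propagate_qconfig_(model)

# Keep weight fake quantization but pass Linear activations through nn.Identity
remove_activation_qat_by_layer_name(model, ["Linear"])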

src/sparseml/pytorch/utils/quantization/quantize_qat_export.py

Lines changed: 34 additions & 11 deletions
@@ -667,9 +667,9 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
     |     |
     |    Add (with constant bias)
     |     |
-    |  QuantizeLinear
+    |  QuantizeLinear (Optional)
     |     |
-    |  DequantizeLinear
+    |  DequantizeLinear (Optional)
     |     |
     |  OUTPUT
     | We end up converting to:
@@ -718,19 +718,26 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
         bias_add_node = graph.get_node_single_child(matmul_node)
         if not bias_add_node or bias_add_node.op_type != "Add":
             continue
+
+        # Optionally find output QDQ block which will be deleted
         output_quantize_node = graph.get_node_single_child(bias_add_node)
         if (
             not output_quantize_node
             or output_quantize_node.op_type not in _QUANTIZE_OP_NAMES
         ):
-            continue
+            output_quantize_node = None
 
-        output_dequantize_node = graph.get_node_single_child(output_quantize_node)
+        output_dequantize_node = (
+            graph.get_node_single_child(output_quantize_node)
+            if output_quantize_node
+            else None
+        )
         if (
             not output_dequantize_node
             or output_dequantize_node.op_type not in _QUANTIZE_OP_NAMES
         ):
-            continue
+            output_quantize_node = None
+            output_dequantize_node = None
 
         input_quantize_params = get_quantization_params(
             model, input_quantize_node, include_target=False
@@ -743,7 +750,7 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
             continue
         if input_quantize_node.op_type != "DequantizeLinear":
             continue
-        if output_quantize_node.op_type != "QuantizeLinear":
+        if output_quantize_node and output_quantize_node.op_type != "QuantizeLinear":
             continue
         bias_initializer = get_init_by_name(model, bias_add_node.input[1]) or (
             get_init_by_name(model, bias_add_node.input[0])
@@ -822,8 +829,13 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
             matmul_integer_output,  # MatMul integer outputs (INT32)
             quantized_bias_name,  # Quantized bias (INT32)
         ]
-        quant_add_output = output_quantize_node.output[0]
+
         quant_add_name = "{}_quant".format(bias_add_node.name)
+        quant_add_output = (
+            output_quantize_node.output[0]
+            if output_quantize_node
+            else f"{quant_add_name}_output"
+        )
 
         # create Add node and add it to graph
         qadd_node = onnx.helper.make_node(
@@ -852,10 +864,15 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
             quantized_bias_scale_name,  # b -> rescale factor
         ]
         mul_node_name = "{}_rescale_mul".format(bias_add_node.name)
+        mul_node_output = (
+            output_dequantize_node.output[0]
+            if output_dequantize_node
+            else bias_add_node.output[0]
+        )
         mul_node = onnx.helper.make_node(
             "Mul",
             mul_node_inputs,
-            [output_dequantize_node.output[0]],
+            [mul_node_output],
             mul_node_name,
         )
         model.graph.node.append(mul_node)
@@ -865,9 +882,15 @@ def _convert_quantizable_matmul_and_add(model: ModelProto):
         delete_quant_node(model, weight_dequantize_node, keep_params=False)
         delete_quant_node(model, weight_quantize_node, keep_params=True)
         remove_node_and_params_from_graph(model, weight_transpose_node)
-        delete_quant_node(model, input_quantize_node, keep_params=True)
-        delete_quant_node(model, output_quantize_node, keep_params=True)
-        delete_quant_node(model, output_dequantize_node, keep_params=True)
+
+        # only delete input node if the matmul is the only child
+        current_graph = ONNXGraph(model)
+        if len(current_graph.get_node_children(input_quantize_node)) == 1:
+            delete_quant_node(model, input_quantize_node, keep_params=True)
+        if output_quantize_node:
+            delete_quant_node(model, output_quantize_node, keep_params=True)
+        if output_dequantize_node:
+            delete_quant_node(model, output_dequantize_node, keep_params=True)
 
         # delete original Gemm node
         remove_node_and_params_from_graph(model, matmul_node, keep_params=None)
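
To illustrate the new "Optional" QDQ handling in isolation, here is a toy sketch using plain onnx that mirrors the fallback naming logic above; the graph, tensor names, and shapes are invented for the example and do not come from this repository:

import onnx
from onnx import TensorProto, helper

# Toy MatMul -> Add graph with no trailing QuantizeLinear/DequantizeLinear pair,
# i.e. the "Optional" QDQ case this commit now tolerates.
matmul = helper.make_node("MatMul", ["x", "w"], ["mm_out"], name="mm")
bias_add = helper.make_node("Add", ["mm_out", "bias"], ["y"], name="bias_add")
graph = helper.make_graph(
    [matmul, bias_add],
    "toy",
    [
        helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 4]),
        helper.make_tensor_value_info("w", TensorProto.FLOAT, [4, 4]),
        helper.make_tensor_value_info("bias", TensorProto.FLOAT, [4]),
    ],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 4])],
)
model = helper.make_model(graph)

# Mirror the fallback: if the Add has no QuantizeLinear child, the converted
# quantized Add gets a synthesized output name instead of reusing the QDQ output.
children = [n for n in model.graph.node if bias_add.output[0] in n.input]
output_quantize_node = (
    children[0] if children and children[0].op_type == "QuantizeLinear" else None
)
quant_add_name = "{}_quant".format(bias_add.name)
quant_add_output = (
    output_quantize_node.output[0]
    if output_quantize_node
    else f"{quant_add_name}_output"
)
print(quant_add_output)  # bias_add_quant_output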

src/sparseml/transformers/__init__.py

Lines changed: 29 additions & 18 deletions
@@ -19,40 +19,49 @@
 # flake8: noqa
 
 import logging as _logging
+import sys
 
 
 try:
     import transformers as _transformers
 
+    # triggers error if neuralmagic/transformers is not installed
+    _transformers.models.bert.modeling_bert.QATMatMul
     _transformers_import_error = None
 except Exception as _transformers_import_err:
     _transformers_import_error = _transformers_import_err
 
 
 _LOGGER = _logging.getLogger(__name__)
+_NM_TRANSFORMERS_TAR_TEMPLATE = (
+    "https://github.com/neuralmagic/transformers/releases/download/"
+    "{version}/transformers-4.7.0.dev0-py3-none-any.whl"
+)
+_NM_TRANSFORMERS_NIGHTLY = _NM_TRANSFORMERS_TAR_TEMPLATE.format(version="nightly")
 
 
 def _install_transformers_and_deps():
 
-    import pip as _pip
+    import subprocess as _subprocess
+    import sys as _sys
+
     import sparseml as _sparseml
 
-    transformers_branch = (
-        "master"
-        if not _sparseml.is_release
-        else f"release/{_sparseml.version_major_minor}"
+    nm_transformers_release = (
+        "nightly" if not _sparseml.is_release else f"v{_sparseml.version_major_minor}"
    )
-    transformers_requirement = (
-        "transformers @ git+https://github.com/neuralmagic/transformers.git"
-        f"@{transformers_branch}"
+    transformers_requirement = _NM_TRANSFORMERS_TAR_TEMPLATE.format(
+        version=nm_transformers_release
    )
-
     try:
-        _pip.main(
+        _subprocess.check_call(
             [
+                sys.executable,
+                "-m",
+                "pip",
                 "install",
                 transformers_requirement,
-                "datasets",
+                "datasets<1.18.0",
                 "sklearn",
                 "seqeval",
             ]
@@ -65,7 +74,7 @@ def _install_transformers_and_deps():
         raise ValueError(
             "Unable to install and import sparseml-transformers dependencies check "
             "that transformers is installed, if not, install via "
-            "`pip install git+https://github.com/neuralmagic/transformers.git`"
+            f"`pip install {_NM_TRANSFORMERS_NIGHTLY}`"
         )
 
 
@@ -75,19 +84,21 @@ def _check_transformers_install():
 
     if os.getenv("NM_NO_AUTOINSTALL_TRANSFORMERS", False):
         _LOGGER.warning(
-            "Unable to import transformers, skipping auto installation "
+            "Unable to import, skipping auto installation "
             "due to NM_NO_AUTOINSTALL_TRANSFORMERS"
         )
         # skip any further checks
         return
     else:
-        _LOGGER.info(
-            "No installation of transformers found. Installing sparseml-transformers "
-            "dependencies"
+        _LOGGER.warning(
+            "sparseml-transformers installation not detected. Installing "
+            "sparseml-transformers dependencies if transformers is already "
+            "installed in the environment, it will be overwritten. Set "
+            "environment variable NM_NO_AUTOINSTALL_TRANSFORMERS to disable"
         )
         _install_transformers_and_deps()
 
-    # check NM fork installed with QATMatMul available
+    # re check import after potential install
     try:
         import transformers as _transformers
 
@@ -97,7 +108,7 @@ def _check_transformers_install():
             "transformers.models.bert.modeling_bert.QATMatMul not availalbe. the"
             "neuralmagic fork of transformers may not be installed. it can be "
             "installed via "
-            "`pip install git+https://github.com/neuralmagic/transformers.git`"
+            f"`pip install {_NM_TRANSFORMERS_NIGHTLY}`"
         )
 
 
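
The move off the internal pip Python API follows the standard subprocess pattern; a minimal standalone sketch of the same idea, with an illustrative package list rather than the exact one above:

import subprocess
import sys

# Install through the pip CLI of the current interpreter rather than the
# unsupported `pip` Python API; raises CalledProcessError if the install fails.
subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "datasets<1.18.0",  # pinned below 1.18.0, matching this commit
        "seqeval",
    ]
)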

src/sparseml/transformers/utils/export.py

Lines changed: 7 additions & 10 deletions
@@ -29,21 +29,21 @@
 
 optional arguments:
   -h, --help            show this help message and exit
-  --task TASK           task to create the model for. i.e. mlm, qa, glue, ner
+  --task TASK           Task to create the model for. i.e. mlm, qa, glue, ner
   --model_path MODEL_PATH
                         Path to directory where model files for weights, config,
                         and tokenizer are stored
   --sequence_length SEQUENCE_LENGTH
                         Sequence length to use. Default is 384. Can be overwritten
                         later
   --convert_qat CONVERT_QAT
-                        Set True to convert QAT graph exports to fully quantized.
-                        Default is True
+                        Set flag to not perform QAT to fully quantized conversion
+                        after export
   --finetuning_task FINETUNING_TASK
                         optional finetuning task for text classification and token
                         classification exports
   --onnx_file_name ONNX_FILE_NAME
-                        name for exported ONNX file in the model directory. Default
+                        Name for exported ONNX file in the model directory. Default
                         and reccomended value for pipeline compatibility is
                         'model.onnx'
 
@@ -207,12 +207,9 @@ def _parse_args() -> argparse.Namespace:
         help="Sequence length to use. Default is 384. Can be overwritten later",
     )
     parser.add_argument(
-        "--convert_qat",
-        type=bool,
-        default=True,
-        help=(
-            "Set True to convert QAT graph exports to fully quantized. Default is True"
-        ),
+        "--no_convert_qat",
+        action="store_false",
+        help=("Set flag to not perform QAT to fully quantized conversion after export"),
     )
     parser.add_argument(
         "--finetuning_task",

src/sparseml/transformers/utils/trainer.py

Lines changed: 5 additions & 3 deletions
@@ -180,10 +180,12 @@ def create_optimizer(self):
         super().create_optimizer()
         if not self.recipe:
             return
-        steps_per_epoch = math.ceil(
-            len(self.train_dataset)
-            / (self.args.per_device_train_batch_size * self.args._n_gpu)
+        total_batch_size = (
+            self.args.per_device_train_batch_size
+            * self.args._n_gpu
+            * self.args.gradient_accumulation_steps
         )
+        steps_per_epoch = math.ceil(len(self.train_dataset) / total_batch_size)
         if hasattr(self, "scaler"):
             self.scaler = self.manager.modify(
                 self.model,
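
A worked example of the corrected steps_per_epoch computation, using assumed training settings purely for illustration:

import math

num_samples = 10000                 # assumed dataset size
per_device_train_batch_size = 8     # assumed
n_gpu = 4                           # assumed
gradient_accumulation_steps = 2     # assumed

total_batch_size = (
    per_device_train_batch_size * n_gpu * gradient_accumulation_steps
)  # 64
steps_per_epoch = math.ceil(num_samples / total_batch_size)  # 157

# The previous formula ignored gradient accumulation and would have scheduled
# math.ceil(10000 / 32) == 313 steps per epoch, roughly twice the number of
# optimizer updates that actually happen.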
