# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
from enum import Enum

import onnx
import onnx.numpy_helper
from onnx import TensorProto
from onnx import onnx_pb as onnx_proto

from .onnx_quantizer import ONNXQuantizer
from .quant_utils import (
    DEQUANT_OP_NAME,
    QUANT_OP_NAME,
    QuantizedValue,
    QuantizedValueType,
    __producer__,
    __version__,
    add_dequant_output_suffix,
    add_dequant_suffix,
    add_quant_input_suffix,
    add_quant_output_suffix,
    add_quant_suffix,
    find_by_name,
)
from .registry import CreateQDQQuantizer


class QDQQuantTensorType(Enum):
    ACTIVATION = 0
    WEIGHT = 1
    BIAS = 2


class QDQTensorQuantInfo:
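    """Quantization metadata recorded for a single tensor: the kind of tensor
    (activation, weight or bias), an optional tensor that provides shared
    quantization parameters, and an optional per-channel axis."""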
    def __init__(self, tensor_type=QDQQuantTensorType.ACTIVATION, quant_para_provider=None, axis=None):
        self.tensor_type = tensor_type
        self.quant_para_provider = quant_para_provider
        self.axis = axis
        self.is_shared = quant_para_provider is not None


class QDQQuantizer(ONNXQuantizer):
    def __init__(
        self,
        model,
        per_channel,
        reduce_range,
        mode,
        static,
        weight_qType,
        activation_qType,
        tensors_range,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize,
        extra_options=None,
    ):
        ONNXQuantizer.__init__(
            self,
            model,
            per_channel,
            reduce_range,
            mode,
            static,
            weight_qType,
            activation_qType,
            tensors_range,
            nodes_to_quantize,
            nodes_to_exclude,
            op_types_to_quantize,
            extra_options,
        )
        self.tensors_to_quantize = {}
        self.bias_to_quantize = []

        self.nodes_to_remove = []

        # Specific op types for which QDQ quantization of their outputs is excluded.
        # In TRT, quantizing the outputs of weighted ops such as Conv, MatMul and Gemm is not recommended,
        # because those ops may be followed by nodes that require high-resolution inputs.
        # Adding QDQ to those ops' outputs may therefore hurt accuracy,
        # so we do not recommend adding QDQ to a node's output in that situation.
        self.op_types_to_exclude_output_quantization = (
            []
            if "OpTypesToExcludeOutputQuantization" not in extra_options
            else extra_options["OpTypesToExcludeOutputQuantization"]
        )

        # As an optimization, we quantize DequantizeLinear's input directly so that the QuantizeLinear node
        # for the weight can be removed. In some cases, for example a QDQ BERT model for TensorRT, QDQ should
        # always appear as a pair. This option disables the optimization and adds a full QDQ pair to the weight.
        self.add_qdq_pair_to_weight = (
            False if "AddQDQPairToWeight" not in extra_options else extra_options["AddQDQPairToWeight"]
        )

        # The default behavior is that multiple nodes can share a QDQ pair as their inputs.
        # In TRT, a QDQ pair can't be shared between nodes, so this option creates dedicated QDQ pairs for each node.
        self.dedicated_qdq_pair = (
            False if "DedicatedQDQPair" not in extra_options else extra_options["DedicatedQDQPair"]
        )
        if self.dedicated_qdq_pair:
            self.tensor_to_its_receiving_nodes = {}

        # Let the user set the channel axis for specific op types. This is effective only when per-channel
        # quantization is supported for the op and per_channel is True.
        self.qdq_op_type_per_channel_support_to_axis = (
            {}
            if "QDQOpTypePerChannelSupportToAxis" not in extra_options
            else extra_options["QDQOpTypePerChannelSupportToAxis"]
        )
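
        # Illustrative example only (not read by the code below): the options above come from the
        # extra_options dictionary passed to the constructor, e.g.
        #     extra_options = {
        #         "OpTypesToExcludeOutputQuantization": ["Conv", "Gemm", "MatMul"],
        #         "AddQDQPairToWeight": True,
        #         "DedicatedQDQPair": True,
        #         "QDQOpTypePerChannelSupportToAxis": {"MatMul": 1},
        #     }
        # The op types and the MatMul axis shown here are made-up values for illustration.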

    def _is_tensor_quantizable(self, tensor_name):
        """
        Check if a tensor can be quantized.
        """
        weight = find_by_name(tensor_name, self.model.initializer())
        if weight is not None:
            if weight.data_type == onnx_proto.TensorProto.FLOAT:
                return True
        elif tensor_name in self.value_infos.keys():
            vi = self.value_infos[tensor_name]
            if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type == TensorProto.FLOAT:
                return True
        else:
            logging.warning(
                "Failed to infer the type of tensor: {}. Skipping its quantization. Please check if this is expected.".format(
                    tensor_name
                )
            )

        return False

    def __quantize_tensor(self, tensor_name, quant_sharing_param=None, tensor_type=QDQQuantTensorType.ACTIVATION):
        """
        Quantize a tensor. If quant_sharing_param is not None, the tensor named tensor_name will be quantized
        with the same quantization parameters as the tensor named by quant_sharing_param.

        Args:
            tensor_name: name of the tensor to quantize
            quant_sharing_param: name of the tensor that provides quantization parameters
            tensor_type: QDQQuantTensorType, default ACTIVATION
        """
        if self._is_tensor_quantizable(tensor_name):
            if quant_sharing_param:
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(
                    tensor_type=tensor_type, quant_para_provider=quant_sharing_param
                )
            elif tensor_name not in self.tensors_to_quantize:
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(tensor_type=tensor_type)

    def quantize_activation_tensor(self, tensor_name, quant_sharing_param=None):
        """
        Quantize an activation tensor.

        Args:
            tensor_name: name of the tensor to quantize
            quant_sharing_param: name of the tensor that provides quantization parameters
        """
        return self.__quantize_tensor(tensor_name, quant_sharing_param, QDQQuantTensorType.ACTIVATION)

    def quantize_weight_tensor(self, tensor_name, quant_sharing_param=None):
        """
        Quantize a weight tensor.

        Args:
            tensor_name: name of the tensor to quantize
            quant_sharing_param: name of the tensor that provides quantization parameters
        """
        return self.__quantize_tensor(tensor_name, quant_sharing_param, QDQQuantTensorType.WEIGHT)

    def quantize_weight_tensor_per_channel(self, tensor_name, axis):
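        """
        Mark a weight initializer for per-channel quantization along the given axis.

        Args:
            tensor_name: name of the weight initializer to quantize
            axis: channel axis to quantize along
        """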
        weight = find_by_name(tensor_name, self.model.initializer())
        if weight:
            if weight.data_type == onnx_proto.TensorProto.FLOAT:
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(
                    tensor_type=QDQQuantTensorType.WEIGHT, axis=axis
                )
        else:
            logging.warning(
                "Per-channel quantization is only supported for weights. Tensor {} is not quantized.".format(tensor_name)
            )

    def quantize_bias_tensor(self, bias_name, input_name, weight_name, beta=1.0):
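        """
        Register a bias initializer for static quantization.

        Args:
            bias_name: name of the bias initializer
            input_name: name of the node input whose scale is used
            weight_name: name of the weight whose scale is used
            beta: multiplier applied when deriving the bias scale (default 1.0)
        """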
        weight = find_by_name(bias_name, self.model.initializer())
        if weight is not None:
            if weight.data_type == onnx_proto.TensorProto.FLOAT:
                self.bias_to_quantize.append((bias_name, input_name, weight_name, beta))
        else:
            logging.warning("Expected {} to be a weight".format(bias_name))

    def remove_node(self, node):
        self.nodes_to_remove.append(node)

    def remove_nodes(self):
        self.model.remove_nodes(self.nodes_to_remove)

    def quantize_model(self):
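        """
        Quantize the model by inserting QuantizeLinear/DequantizeLinear (QDQ) pairs.

        Walks all nodes, lets the per-op quantizers collect the tensors to quantize, then inserts QDQ
        nodes for normal tensors, parameter-sharing tensors and biases, removes replaced nodes, and
        returns the quantized onnx.ModelProto.
        """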
        for node in self.model.nodes():
            if self.should_quantize_node(node):
                op_quantizer = CreateQDQQuantizer(self, node)
                op_quantizer.quantize()

                if self.dedicated_qdq_pair:
                    for tensor_name in node.input:
                        if tensor_name not in self.tensor_to_its_receiving_nodes:
                            self.tensor_to_its_receiving_nodes[tensor_name] = []
                        self.tensor_to_its_receiving_nodes[tensor_name].append(node)

        self._quantize_normal_tensors()
        self._quantize_sharing_param_tensors()
        self._quantize_bias_tensors()
        self.remove_nodes()
        if not self.add_qdq_pair_to_weight:
            self.model.clean_initializers()

        self.model.model.producer_name = __producer__
        self.model.model.producer_version = __version__

        return self.model.model

    def try_replacing_upstream_output(self, upstream_output_name, output_name):
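        """
        If output_name has quantization parameters and upstream_output_name has a single consumer and is
        neither a graph input nor a graph output, rename the upstream output to output_name and drop it
        from tensors_to_quantize, so that a separate QDQ pair is not inserted for it.
        Returns True if the replacement was done, False otherwise.
        """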
        if (
            output_name in self.quantization_params.keys()
            and len(self.model.input_name_to_nodes()[upstream_output_name]) == 1
            and not self.model.is_graph_output(upstream_output_name)
            and not self.model.is_graph_input(upstream_output_name)
        ):
            self.model.replace_output_of_all_nodes(upstream_output_name, output_name)
            if upstream_output_name in self.tensors_to_quantize:
                del self.tensors_to_quantize[upstream_output_name]
            return True
        return False

    def _create_qdq_nodes(
        self, q_input, q_output, quant_node_name, dq_input, dq_output, dequant_node_name, scale_name, zp_name, axis=None
    ):
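        """
        Create a QuantizeLinear node (q_input -> q_output) and a DequantizeLinear node (dq_input -> dq_output)
        that share scale_name/zp_name, and add both nodes to the model.
        """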
        qlinear_node = onnx.helper.make_node(
            QUANT_OP_NAME,
            [q_input, scale_name, zp_name],
            [q_output],
            quant_node_name,
            axis=axis,
        )
        dequant_node = onnx.helper.make_node(
            DEQUANT_OP_NAME,
            [dq_input, scale_name, zp_name],
            [dq_output],
            dequant_node_name,
            axis=axis,
        )
        self.model.add_nodes([qlinear_node, dequant_node])

    def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None):
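        """
        Quantize an initializer and rewire its consumers to a DequantizeLinear output. When
        AddQDQPairToWeight is enabled, the float weight is kept and a full QDQ pair is inserted;
        otherwise only a DequantizeLinear node over the pre-quantized weight is added.
        If axis is given, the weight is quantized per channel (requires opset 13 or above).
        """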
        weight_name = weight_proto.name
        if axis is not None:
            if self.opset_version < 13:
                raise ValueError("Per-Channel support with QDQ format requires onnx opset version 13 or above.")
            q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel(
                weight_name, onnx_proto.TensorProto.INT8, axis, keep_float_weight=self.add_qdq_pair_to_weight
            )
        else:
            q_weight_name, zp_name, scale_name = self.quantize_initializer(
                weight_proto,
                self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType,
                keep_float_weight=self.add_qdq_pair_to_weight,
            )

        weight_dequant_output = add_dequant_output_suffix(weight_name)
        self.model.replace_input_of_all_nodes(weight_name, weight_dequant_output)
        if self.add_qdq_pair_to_weight:
            weight_quant_output = add_quant_output_suffix(weight_name)

            self._create_qdq_nodes(
                weight_name,
                weight_quant_output,
                add_quant_suffix(weight_name),
                weight_quant_output,
                weight_dequant_output,
                add_dequant_suffix(weight_name),
                scale_name,
                zp_name,
                axis,
            )
        else:
            dequant_node = onnx.helper.make_node(
                DEQUANT_OP_NAME,
                [q_weight_name, scale_name, zp_name],
                [weight_dequant_output],
                add_dequant_suffix(weight_name),
                axis=axis,
            )
            self.model.add_node(dequant_node)

    def _add_qdq_pair_for_activation(self, tensor_name, scale_name, zp_name):
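        """
        Insert QuantizeLinear/DequantizeLinear nodes for an activation tensor and rewire its consumers
        (or producers, for graph outputs). When DedicatedQDQPair is enabled and the tensor feeds more
        than one node, a separate QDQ pair is created for each consuming node.
        """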
        if (
            self.dedicated_qdq_pair
            and tensor_name in self.tensor_to_its_receiving_nodes
            and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1
        ):
            num_dedicated_qdq_pair = len(self.tensor_to_its_receiving_nodes[tensor_name])
            for i in range(num_dedicated_qdq_pair):
                postfix = f"_{i + 1}"
                tensor_name_quant_output_postfix = add_quant_output_suffix(tensor_name) + postfix
                tensor_name_dequant_output_postfix = add_dequant_output_suffix(tensor_name) + postfix
                quant_node_name_postfix = add_quant_suffix(tensor_name) + postfix
                dequant_node_name_postfix = add_dequant_suffix(tensor_name) + postfix
                self._create_qdq_nodes(
                    tensor_name,
                    tensor_name_quant_output_postfix,
                    quant_node_name_postfix,
                    tensor_name_quant_output_postfix,
                    tensor_name_dequant_output_postfix,
                    dequant_node_name_postfix,
                    scale_name,
                    zp_name,
                )

                node = self.tensor_to_its_receiving_nodes[tensor_name][i]
                self.model.replace_node_input(node, tensor_name, tensor_name_dequant_output_postfix)
                if i == 0:
                    quantized_value = QuantizedValue(
                        tensor_name,
                        tensor_name_dequant_output_postfix,
                        scale_name,
                        zp_name,
                        QuantizedValueType.Input,
                    )
                    self.quantized_value_map[tensor_name] = quantized_value
        else:
            q_input = tensor_name
            dq_output = add_dequant_output_suffix(tensor_name)
            if self.model.is_graph_output(tensor_name):
                q_input = add_quant_input_suffix(tensor_name)
                dq_output = tensor_name
                self.model.replace_output_of_all_nodes(tensor_name, q_input)
            else:
                self.model.replace_input_of_all_nodes(tensor_name, dq_output)

            self._create_qdq_nodes(
                q_input,
                add_quant_output_suffix(tensor_name),
                add_quant_suffix(tensor_name),
                add_quant_output_suffix(tensor_name),
                dq_output,
                add_dequant_suffix(tensor_name),
                scale_name,
                zp_name,
            )

            quantized_value = QuantizedValue(
                tensor_name,
                dq_output,
                scale_name,
                zp_name,
                QuantizedValueType.Input,
            )
            self.quantized_value_map[tensor_name] = quantized_value

    def _quantize_normal_tensors(self):
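        """
        Insert QDQ pairs for all collected tensors that do not share quantization parameters:
        initializers are quantized directly, activations use their calibrated scale and zero point.
        """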
        for tensor_name, tensor_info in self.tensors_to_quantize.copy().items():
            if tensor_name in self.quantized_value_map.keys():
                continue

            if not tensor_info.is_shared:
                # Quantize the input
                initializer = find_by_name(tensor_name, self.model.initializer())
                if initializer:
                    self._add_qdq_pair_for_initializer(initializer, tensor_info.tensor_type, tensor_info.axis)
                else:
                    used_scale, used_zp = self.find_quant_scale_zp(tensor_name)
                    data_found, scale_name, zp_name, _, _ = self._get_quantization_params(
                        tensor_name, used_scale, used_zp
                    )

                    if not data_found:
                        raise ValueError(
                            f"Quantization parameters are not specified for param {tensor_name}. "
                            "In static mode quantization params for inputs and outputs of nodes to be quantized are required."
                        )

                    self._add_qdq_pair_for_activation(tensor_name, scale_name, zp_name)

                del self.tensors_to_quantize[tensor_name]

    def _quantize_sharing_param_tensors(self):
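        """
        Insert QDQ pairs for tensors that share quantization parameters with another tensor, reusing the
        scale and zero point of the providing tensor. Iterates until all remaining entries in
        tensors_to_quantize have been resolved.
        """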
        while self.tensors_to_quantize:
            for tensor_name, tensor_info in self.tensors_to_quantize.copy().items():
                tensor_provider_name = tensor_info.quant_para_provider
                if tensor_provider_name in self.quantized_value_map:
                    del self.tensors_to_quantize[tensor_name]

                    quantized_value = self.quantized_value_map[tensor_provider_name]
                    # Quantize the input
                    initializer = find_by_name(tensor_name, self.model.initializer())
                    if initializer is not None:
                        raise ValueError("Quantization parameter shared mode is not supported for weight yet")
                    self._add_qdq_pair_for_activation(tensor_name, quantized_value.scale_name, quantized_value.zp_name)

    def _quantize_bias_tensors(self):
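        """
        Statically quantize the registered bias tensors, remove the float bias initializers, and add a
        DequantizeLinear node so that downstream consumers still see the original bias tensor name.
        """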
        for bias_name, input_name, weight_name, beta in self.bias_to_quantize:
            if bias_name in self.quantized_value_map.keys():
                continue
            # Quantize the input
            self.quantize_bias_static(bias_name, input_name, weight_name, beta)
            self.model.remove_initializer(find_by_name(bias_name, self.model.initializer()))
            quant_value = self.quantized_value_map[bias_name]
            inputs = [quant_value.q_name, quant_value.scale_name, quant_value.zp_name]
            node_name = add_dequant_suffix(bias_name)
            if quant_value.axis is not None:
                dequant_node = onnx.helper.make_node(
                    "DequantizeLinear",
                    inputs,
                    [bias_name],
                    node_name,
                    axis=quant_value.axis,
                )
            else:
                dequant_node = onnx.helper.make_node(
                    "DequantizeLinear",
                    inputs,
                    [bias_name],
                    node_name,
                )
            self.model.add_node(dequant_node)

    def is_tensor_quantized(self, tensor_name):
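        """Return True if the tensor has been marked for quantization, either as a normal tensor or as a bias."""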
        return tensor_name in self.tensors_to_quantize or tensor_name in self.bias_to_quantize