# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
from enum import Enum

import onnx
import onnx.numpy_helper
from onnx import TensorProto
from onnx import onnx_pb as onnx_proto

from .onnx_quantizer import ONNXQuantizer
from .quant_utils import (
    DEQUANT_OP_NAME,
    QUANT_OP_NAME,
    QuantizedValue,
    QuantizedValueType,
    __producer__,
    __version__,
    add_dequant_output_suffix,
    add_dequant_suffix,
    add_quant_input_suffix,
    add_quant_output_suffix,
    add_quant_suffix,
    find_by_name,
)
from .registry import CreateQDQQuantizer


class QDQQuantTensorType(Enum):
    ACTIVATION = 0
    WEIGHT = 1
    BIAS = 2


class QDQTensorQuantInfo:
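    """Quantization metadata recorded for a single tensor: the kind of tensor
    (activation, weight or bias), an optional tensor that provides shared
    quantization parameters, and an optional per-channel axis."""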
    def __init__(self, tensor_type=QDQQuantTensorType.ACTIVATION, quant_para_provider=None, axis=None):
        self.tensor_type = tensor_type
        self.quant_para_provider = quant_para_provider
        self.axis = axis
        self.is_shared = quant_para_provider is not None


class QDQQuantizer(ONNXQuantizer):
    def __init__(
        self,
        model,
        per_channel,
        reduce_range,
        mode,
        static,
        weight_qType,
        activation_qType,
        tensors_range,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize,
        extra_options=None,
    ):
        ONNXQuantizer.__init__(
            self,
            model,
            per_channel,
            reduce_range,
            mode,
            static,
            weight_qType,
            activation_qType,
            tensors_range,
            nodes_to_quantize,
            nodes_to_exclude,
            op_types_to_quantize,
            extra_options,
        )
        self.tensors_to_quantize = {}
        self.bias_to_quantize = []

        self.nodes_to_remove = []

        # Specific op types for which QDQ quantization of their outputs is excluded.
        # In TRT, quantizing the outputs of weighted ops such as Conv, MatMul and Gemm is not recommended,
        # because those ops may be followed by nodes that require high-resolution inputs.
        # Adding QDQ to those ops' outputs may therefore hurt accuracy,
        # so we do not recommend adding QDQ to a node's output in that situation.
        self.op_types_to_exclude_output_quantization = (
            []
            if "OpTypesToExcludeOutputQuantization" not in extra_options
            else extra_options["OpTypesToExcludeOutputQuantization"]
        )

        # As an optimization, we quantize DequantizeLinear's input directly so that the QuantizeLinear node
        # for the weight can be removed. In some cases, for example a QDQ BERT model for TensorRT, QDQ should
        # always appear as a pair. This option disables the optimization and adds a full QDQ pair to the weight.
        self.add_qdq_pair_to_weight = (
            False if "AddQDQPairToWeight" not in extra_options else extra_options["AddQDQPairToWeight"]
        )

        # The default behavior is that multiple nodes can share a QDQ pair as their inputs.
        # In TRT, a QDQ pair can't be shared between nodes, so this option creates dedicated QDQ pairs for each node.
        self.dedicated_qdq_pair = (
            False if "DedicatedQDQPair" not in extra_options else extra_options["DedicatedQDQPair"]
        )
        if self.dedicated_qdq_pair:
            self.tensor_to_its_receiving_nodes = {}

        # Let the user set the channel axis for specific op types. This is effective only when per-channel
        # quantization is supported for the op and per_channel is True.
        self.qdq_op_type_per_channel_support_to_axis = (
            {}
            if "QDQOpTypePerChannelSupportToAxis" not in extra_options
            else extra_options["QDQOpTypePerChannelSupportToAxis"]
        )
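
        # Illustrative example only (not read by the code below): the options above come from the
        # extra_options dictionary passed to the constructor, e.g.
        #     extra_options = {
        #         "OpTypesToExcludeOutputQuantization": ["Conv", "Gemm", "MatMul"],
        #         "AddQDQPairToWeight": True,
        #         "DedicatedQDQPair": True,
        #         "QDQOpTypePerChannelSupportToAxis": {"MatMul": 1},
        #     }
        # The op types and the MatMul axis shown here are made-up values for illustration.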

    def _is_tensor_quantizable(self, tensor_name):
        """
        Check if a tensor can be quantized.
        """
        weight = find_by_name(tensor_name, self.model.initializer())
        if weight is not None:
            if weight.data_type == onnx_proto.TensorProto.FLOAT:
                return True
        elif tensor_name in self.value_infos.keys():
            vi = self.value_infos[tensor_name]
            if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type == TensorProto.FLOAT:
                return True
        else:
            logging.warning(
                "Failed to infer the type of tensor: {}. Skipping its quantization. Please check if this is expected.".format(
                    tensor_name
                )
            )

        return False

    def __quantize_tensor(self, tensor_name, quant_sharing_param=None, tensor_type=QDQQuantTensorType.ACTIVATION):
        """
        Quantize a tensor. If quant_sharing_param is not None, the tensor named tensor_name will be quantized
        with the same quantization parameters as the tensor named by quant_sharing_param.

        Args:
            tensor_name: name of the tensor to quantize
            quant_sharing_param: name of the tensor that provides quantization parameters
            tensor_type: QDQQuantTensorType, default ACTIVATION
        """
        if self._is_tensor_quantizable(tensor_name):
            if quant_sharing_param:
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(
                    tensor_type=tensor_type, quant_para_provider=quant_sharing_param
                )
            elif tensor_name not in self.tensors_to_quantize:
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(tensor_type=tensor_type)

    def quantize_activation_tensor(self, tensor_name, quant_sharing_param=None):
        """
        Quantize an activation tensor.

        Args:
            tensor_name: name of the tensor to quantize
            quant_sharing_param: name of the tensor that provides quantization parameters
        """
        return self.__quantize_tensor(tensor_name, quant_sharing_param, QDQQuantTensorType.ACTIVATION)

    def quantize_weight_tensor(self, tensor_name, quant_sharing_param=None):
        """
        Quantize a weight tensor.

        Args:
            tensor_name: name of the tensor to quantize
            quant_sharing_param: name of the tensor that provides quantization parameters
        """
        return self.__quantize_tensor(tensor_name, quant_sharing_param, QDQQuantTensorType.WEIGHT)

    def quantize_weight_tensor_per_channel(self, tensor_name, axis):
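        """
        Mark a weight initializer for per-channel quantization along the given axis.

        Args:
            tensor_name: name of the weight initializer to quantize
            axis: channel axis to quantize along
        """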
        weight = find_by_name(tensor_name, self.model.initializer())
        if weight:
            if weight.data_type == onnx_proto.TensorProto.FLOAT:
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(
                    tensor_type=QDQQuantTensorType.WEIGHT, axis=axis
                )
        else:
            logging.warning(
                "Per-channel quantization is only supported for weights. Tensor {} is not quantized.".format(tensor_name)
            )

    def quantize_bias_tensor(self, bias_name, input_name, weight_name, beta=1.0):
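        """
        Register a bias initializer for static quantization.

        Args:
            bias_name: name of the bias initializer
            input_name: name of the node input whose scale is used
            weight_name: name of the weight whose scale is used
            beta: multiplier applied when deriving the bias scale (default 1.0)
        """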
        weight = find_by_name(bias_name, self.model.initializer())
        if weight is not None:
            if weight.data_type == onnx_proto.TensorProto.FLOAT:
                self.bias_to_quantize.append((bias_name, input_name, weight_name, beta))
        else:
            logging.warning("Expected {} to be a weight".format(bias_name))

    def remove_node(self, node):
        self.nodes_to_remove.append(node)

    def remove_nodes(self):
        self.model.remove_nodes(self.nodes_to_remove)

    def quantize_model(self):
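        """
        Quantize the model by inserting QuantizeLinear/DequantizeLinear (QDQ) pairs.

        Walks all nodes, lets the per-op quantizers collect the tensors to quantize, then inserts QDQ
        nodes for normal tensors, parameter-sharing tensors and biases, removes replaced nodes, and
        returns the quantized onnx.ModelProto.
        """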
        for node in self.model.nodes():
            if self.should_quantize_node(node):
                op_quantizer = CreateQDQQuantizer(self, node)
                op_quantizer.quantize()

                if self.dedicated_qdq_pair:
                    for tensor_name in node.input:
                        if tensor_name not in self.tensor_to_its_receiving_nodes:
                            self.tensor_to_its_receiving_nodes[tensor_name] = []
                        self.tensor_to_its_receiving_nodes[tensor_name].append(node)

        self._quantize_normal_tensors()
        self._quantize_sharing_param_tensors()
        self._quantize_bias_tensors()
        self.remove_nodes()
        if not self.add_qdq_pair_to_weight:
            self.model.clean_initializers()

        self.model.model.producer_name = __producer__
        self.model.model.producer_version = __version__

        return self.model.model

    def try_replacing_upstream_output(self, upstream_output_name, output_name):
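        """
        If output_name has quantization parameters and upstream_output_name has a single consumer and is
        neither a graph input nor a graph output, rename the upstream output to output_name and drop it
        from tensors_to_quantize, so that a separate QDQ pair is not inserted for it.
        Returns True if the replacement was done, False otherwise.
        """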
        if (
            output_name in self.quantization_params.keys()
            and len(self.model.input_name_to_nodes()[upstream_output_name]) == 1
            and not self.model.is_graph_output(upstream_output_name)
            and not self.model.is_graph_input(upstream_output_name)
        ):
            self.model.replace_output_of_all_nodes(upstream_output_name, output_name)
            if upstream_output_name in self.tensors_to_quantize:
                del self.tensors_to_quantize[upstream_output_name]
            return True
        return False

    def _create_qdq_nodes(
        self, q_input, q_output, quant_node_name, dq_input, dq_output, dequant_node_name, scale_name, zp_name, axis=None
    ):
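        """
        Create a QuantizeLinear node (q_input -> q_output) and a DequantizeLinear node (dq_input -> dq_output)
        that share scale_name/zp_name, and add both nodes to the model.
        """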
        qlinear_node = onnx.helper.make_node(
            QUANT_OP_NAME,
            [q_input, scale_name, zp_name],
            [q_output],
            quant_node_name,
            axis=axis,
        )
        dequant_node = onnx.helper.make_node(
            DEQUANT_OP_NAME,
            [dq_input, scale_name, zp_name],
            [dq_output],
            dequant_node_name,
            axis=axis,
        )
        self.model.add_nodes([qlinear_node, dequant_node])

    def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None):
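        """
        Quantize an initializer and rewire its consumers to a DequantizeLinear output. When
        AddQDQPairToWeight is enabled, the float weight is kept and a full QDQ pair is inserted;
        otherwise only a DequantizeLinear node over the pre-quantized weight is added.
        If axis is given, the weight is quantized per channel (requires opset 13 or above).
        """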
        weight_name = weight_proto.name
        if axis is not None:
            if self.opset_version < 13:
                raise ValueError("Per-Channel support with QDQ format requires onnx opset version 13 or above.")
            q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel(
                weight_name, onnx_proto.TensorProto.INT8, axis, keep_float_weight=self.add_qdq_pair_to_weight
            )
        else:
            q_weight_name, zp_name, scale_name = self.quantize_initializer(
                weight_proto,
                self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType,
                keep_float_weight=self.add_qdq_pair_to_weight,
            )

        weight_dequant_output = add_dequant_output_suffix(weight_name)
        self.model.replace_input_of_all_nodes(weight_name, weight_dequant_output)
        if self.add_qdq_pair_to_weight:
            weight_quant_output = add_quant_output_suffix(weight_name)

            self._create_qdq_nodes(
                weight_name,
                weight_quant_output,
                add_quant_suffix(weight_name),
                weight_quant_output,
                weight_dequant_output,
                add_dequant_suffix(weight_name),
                scale_name,
                zp_name,
                axis,
            )
        else:
            dequant_node = onnx.helper.make_node(
                DEQUANT_OP_NAME,
                [q_weight_name, scale_name, zp_name],
                [weight_dequant_output],
                add_dequant_suffix(weight_name),
                axis=axis,
            )
            self.model.add_node(dequant_node)

    def _add_qdq_pair_for_activation(self, tensor_name, scale_name, zp_name):
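        """
        Insert QuantizeLinear/DequantizeLinear nodes for an activation tensor and rewire its consumers
        (or producers, for graph outputs). When DedicatedQDQPair is enabled and the tensor feeds more
        than one node, a separate QDQ pair is created for each consuming node.
        """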
        if (
            self.dedicated_qdq_pair
            and tensor_name in self.tensor_to_its_receiving_nodes
            and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1
        ):
            num_dedicated_qdq_pair = len(self.tensor_to_its_receiving_nodes[tensor_name])
            for i in range(num_dedicated_qdq_pair):
                postfix = f"_{i + 1}"
                tensor_name_quant_output_postfix = add_quant_output_suffix(tensor_name) + postfix
                tensor_name_dequant_output_postfix = add_dequant_output_suffix(tensor_name) + postfix
                quant_node_name_postfix = add_quant_suffix(tensor_name) + postfix
                dequant_node_name_postfix = add_dequant_suffix(tensor_name) + postfix
                self._create_qdq_nodes(
                    tensor_name,
                    tensor_name_quant_output_postfix,
                    quant_node_name_postfix,
                    tensor_name_quant_output_postfix,
                    tensor_name_dequant_output_postfix,
                    dequant_node_name_postfix,
                    scale_name,
                    zp_name,
                )

                node = self.tensor_to_its_receiving_nodes[tensor_name][i]
                self.model.replace_node_input(node, tensor_name, tensor_name_dequant_output_postfix)
                if i == 0:
                    quantized_value = QuantizedValue(
                        tensor_name,
                        tensor_name_dequant_output_postfix,
                        scale_name,
                        zp_name,
                        QuantizedValueType.Input,
                    )
                    self.quantized_value_map[tensor_name] = quantized_value
        else:
            q_input = tensor_name
            dq_output = add_dequant_output_suffix(tensor_name)
            if self.model.is_graph_output(tensor_name):
                q_input = add_quant_input_suffix(tensor_name)
                dq_output = tensor_name
                self.model.replace_output_of_all_nodes(tensor_name, q_input)
            else:
                self.model.replace_input_of_all_nodes(tensor_name, dq_output)

            self._create_qdq_nodes(
                q_input,
                add_quant_output_suffix(tensor_name),
                add_quant_suffix(tensor_name),
                add_quant_output_suffix(tensor_name),
                dq_output,
                add_dequant_suffix(tensor_name),
                scale_name,
                zp_name,
            )

            quantized_value = QuantizedValue(
                tensor_name,
                dq_output,
                scale_name,
                zp_name,
                QuantizedValueType.Input,
            )
            self.quantized_value_map[tensor_name] = quantized_value

    def _quantize_normal_tensors(self):
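        """
        Insert QDQ pairs for all collected tensors that do not share quantization parameters:
        initializers are quantized directly, activations use their calibrated scale and zero point.
        """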
        for tensor_name, tensor_info in self.tensors_to_quantize.copy().items():
            if tensor_name in self.quantized_value_map.keys():
                continue

            if not tensor_info.is_shared:
                # Quantize the input
                initializer = find_by_name(tensor_name, self.model.initializer())
                if initializer:
                    self._add_qdq_pair_for_initializer(initializer, tensor_info.tensor_type, tensor_info.axis)
                else:
                    used_scale, used_zp = self.find_quant_scale_zp(tensor_name)
                    data_found, scale_name, zp_name, _, _ = self._get_quantization_params(
                        tensor_name, used_scale, used_zp
                    )

                    if not data_found:
                        raise ValueError(
                            f"Quantization parameters are not specified for param {tensor_name}. "
                            "In static mode quantization params for inputs and outputs of nodes to be quantized are required."
                        )

                    self._add_qdq_pair_for_activation(tensor_name, scale_name, zp_name)

                del self.tensors_to_quantize[tensor_name]

    def _quantize_sharing_param_tensors(self):
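        """
        Insert QDQ pairs for tensors that share quantization parameters with another tensor, reusing the
        scale and zero point of the providing tensor. Iterates until all remaining entries in
        tensors_to_quantize have been resolved.
        """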
        while self.tensors_to_quantize:
            for tensor_name, tensor_info in self.tensors_to_quantize.copy().items():
                tensor_provider_name = tensor_info.quant_para_provider
                if tensor_provider_name in self.quantized_value_map:
                    del self.tensors_to_quantize[tensor_name]

                    quantized_value = self.quantized_value_map[tensor_provider_name]
                    # Quantize the input
                    initializer = find_by_name(tensor_name, self.model.initializer())
                    if initializer is not None:
                        raise ValueError("Quantization parameter shared mode is not supported for weight yet")
                    self._add_qdq_pair_for_activation(tensor_name, quantized_value.scale_name, quantized_value.zp_name)

    def _quantize_bias_tensors(self):
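        """
        Statically quantize the registered bias tensors, remove the float bias initializers, and add a
        DequantizeLinear node so that downstream consumers still see the original bias tensor name.
        """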
        for bias_name, input_name, weight_name, beta in self.bias_to_quantize:
            if bias_name in self.quantized_value_map.keys():
                continue
            # Quantize the input
            self.quantize_bias_static(bias_name, input_name, weight_name, beta)
            self.model.remove_initializer(find_by_name(bias_name, self.model.initializer()))
            quant_value = self.quantized_value_map[bias_name]
            inputs = [quant_value.q_name, quant_value.scale_name, quant_value.zp_name]
            node_name = add_dequant_suffix(bias_name)
            if quant_value.axis is not None:
                dequant_node = onnx.helper.make_node(
                    "DequantizeLinear",
                    inputs,
                    [bias_name],
                    node_name,
                    axis=quant_value.axis,
                )
            else:
                dequant_node = onnx.helper.make_node(
                    "DequantizeLinear",
                    inputs,
                    [bias_name],
                    node_name,
                )
            self.model.add_node(dequant_node)

    def is_tensor_quantized(self, tensor_name):
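        """Return True if the tensor has been marked for quantization, either as a normal tensor or as a bias."""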
        return tensor_name in self.tensors_to_quantize or tensor_name in self.bias_to_quantize