# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
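"""Fusion of a quantized MatMul subgraph into a single QOrderedMatMul node.

The matched pattern is MatMul -> bias Add (optionally -> residual Add) -> QuantizeLinear,
with both MatMul inputs produced by DequantizeLinear nodes. See FusionQOrderedMatMul.fuse
for the exact paths that are matched.
"""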

from logging import getLogger
from typing import Dict

from fusion_base import Fusion
from fusion_utils import FusionUtils
from onnx import helper
from onnx_model import OnnxModel

logger = getLogger(__name__)


class FusionQOrderedMatMul(Fusion):
    def __init__(self, model: OnnxModel):
        super().__init__(model, "QOrderedMatMul", "MatMul")

    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
        """Fuse MatMul + bias Add (+ optional residual Add) + QuantizeLinear into QOrderedMatMul."""
        matmul_children = self.model.get_children(node, input_name_to_nodes)

        # Should only have 1 child - Bias Add
        if len(matmul_children) != 1 or matmul_children[0].op_type != "Add":
            return

        bias_add_node = matmul_children[0]

        # At least one of the inputs to the Bias Add node must be a constant
        bias_add_node_index = 0
        if (
            self.model.get_constant_value(bias_add_node.input[0]) is None
            and self.model.get_constant_value(bias_add_node.input[1]) is None
        ):
            return

        if self.model.get_constant_value(bias_add_node.input[0]) is None:
            bias_add_node_index = 1

        bias_add_children = self.model.get_children(bias_add_node, input_name_to_nodes)

        if len(bias_add_children) != 1:
            return

        bias_add_child = bias_add_children[0]

        # Bias Add can have another Add downstream (Residual Add layer)
        residual_add_node = None
        downstream_quantize_node = None

        if bias_add_child.op_type == "Add":
            residual_add_node = bias_add_child

            residual_add_children = self.model.get_children(residual_add_node, input_name_to_nodes)

            if len(residual_add_children) != 1 or residual_add_children[0].op_type != "QuantizeLinear":
                return

            downstream_quantize_node = residual_add_children[0]
        elif bias_add_child.op_type == "QuantizeLinear":
            downstream_quantize_node = bias_add_child
        else:
            return
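
        # At this point the downstream pattern is one of (a sketch of the matched graph):
        #
        #   MatMul -> Add (bias) -> QuantizeLinear
        #   MatMul -> Add (bias) -> Add (residual) -> QuantizeLinear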

        # Make sure the downstream QuantizeLinear has the proper zero points and scales
        if not FusionUtils.check_qdq_node_for_fusion(downstream_quantize_node, self.model):
            return

        # The first input to MatMul should flow through a DequantizeLinear node
        first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
            node,
            [(["DequantizeLinear"], [0])],
            output_name_to_node,
        )

        # If Attention is not fused, this is the pattern to look for
        # leading up to the MatMul
        reshape_node_0 = None
        transpose_node_0 = None
        if first_path_id < 0:
            first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
                node,
                [(["Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear"], [0, 0, 0, 0])],
                output_name_to_node,
            )

            if first_path_id < 0:
                return

            reshape_node_0 = first_input_parent_nodes[0]
            transpose_node_0 = first_input_parent_nodes[1]
            dequantize_node_0 = first_input_parent_nodes[2]
        else:
            dequantize_node_0 = first_input_parent_nodes[0]

        # Make sure the upstream DequantizeLinear-0 has the proper zero points and scales
        if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_0, self.model):
            return
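
        # Two alternative producer paths are tried for weight 'B' below. If the weight
        # arrives pre-transposed through DQ -> Transpose -> Q -> DQ, no offline transpose
        # is needed; if it arrives through a lone DequantizeLinear, the initializer is
        # transposed later from ROW to COL order (see the post-processing step below).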

        # The second input to MatMul should flow through a DequantizeLinear node
        dequantize_node_1 = None
        is_weight_transpose_required = True

        weight_path_id, weight_nodes, _ = self.model.match_parent_paths(
            node,
            [(["DequantizeLinear", "QuantizeLinear", "Transpose", "DequantizeLinear"], [1, 0, 0, 0])],
            output_name_to_node,
        )

        if weight_path_id < 0:
            weight_path_id, weight_nodes, _ = self.model.match_parent_paths(
                node,
                [(["DequantizeLinear"], [1])],
                output_name_to_node,
            )

            if weight_path_id < 0:
                return

            dequantize_node_1 = weight_nodes[0]
        else:
            is_weight_transpose_required = False
            dequantize_node_1 = weight_nodes[3]

        # Check if weight 'B' is a constant
        if self.model.get_constant_value(dequantize_node_1.input[0]) is None:
            return

        # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
        # Per-channel scales are supported for weights alone
        if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_1, self.model, False):
            return

        # Make sure the upstream flow into the Residual Add node flows through a DQ node
        residual_add_dequantize_node = None

        if residual_add_node is not None:
            residual_path_id, residual_input_parent_nodes, _ = self.model.match_parent_paths(
                residual_add_node,
                [
                    (["DequantizeLinear"], [1]),
                ],
                output_name_to_node,
            )

            if residual_path_id < 0:
                return

            residual_add_dequantize_node = residual_input_parent_nodes[0]

        # Make sure the upstream DequantizeLinear to the Residual Add has the proper zero points and scales
        if residual_add_dequantize_node is not None and not FusionUtils.check_qdq_node_for_fusion(
            residual_add_dequantize_node, self.model
        ):
            return

        # Subgraph nodes to be fused
        subgraph_nodes = [node, bias_add_node]  # MatMul + Bias Add

        if residual_add_node is not None:
            subgraph_nodes.extend([residual_add_node])  # Residual Add

        subgraph_nodes.extend(weight_nodes)
        subgraph_nodes.extend([downstream_quantize_node])  # Downstream Q node

        if not self.model.is_safe_to_fuse_nodes(
            subgraph_nodes, downstream_quantize_node.output, input_name_to_nodes, output_name_to_node
        ):
            logger.debug("It is not safe to fuse QOrderedMatMul node. Skip")
            return

        # Deal with the case wherein the Attention subgraph is not fused:
        # rewire the Transpose to read the quantized tensor directly, bypassing
        # the QuantizeLinear -> DequantizeLinear pair on input A
        if transpose_node_0 is not None:
            self.model.replace_node_input(transpose_node_0, transpose_node_0.input[0], dequantize_node_0.input[0])

        # Make inputs
        fused_node_inputs = [
            reshape_node_0.output[0] if reshape_node_0 is not None else dequantize_node_0.input[0],
            dequantize_node_0.input[1],
            dequantize_node_1.input[0],
            dequantize_node_1.input[1],
            downstream_quantize_node.input[1],
            bias_add_node.input[bias_add_node_index],
        ]

        if residual_add_node is not None:
            fused_node_inputs.append(residual_add_dequantize_node.input[0])
            fused_node_inputs.append(residual_add_dequantize_node.input[1])
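
        # Input layout of the fused node as assembled above:
        #   A, scale_A, B, scale_B, scale_Y, bias, and optionally the residual
        #   tensor and its scale when a Residual Add is part of the pattern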

        # The MatMul weight 'B' and 'bias' need some post-processing
        # Transpose weight 'B' from order ROW to order COL
        # This offline transpose is needed only while using the CUDA EP
        # TODO: Make this fusion logic EP-agnostic?
        if is_weight_transpose_required:
            weight_tensor = self.model.get_initializer(dequantize_node_1.input[0])
            FusionUtils.transpose_2d_int8_tensor(weight_tensor)

        fused_node = helper.make_node(
            "QOrderedMatMul",
            inputs=fused_node_inputs,
            outputs=[downstream_quantize_node.output[0]],
            name=self.model.create_node_name("QOrderedMatMul", name_prefix="QOrderedMatMul"),
        )
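
        # Consistent with the ROW -> COL transpose above, 1 appears to denote
        # row-major order and 0 column-major order (presumably matching the
        # cublasLt ordering enum): A and Y are row-ordered, B is column-ordered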

        fused_node.attribute.extend([helper.make_attribute("order_A", 1)])
        fused_node.attribute.extend([helper.make_attribute("order_B", 0)])
        fused_node.attribute.extend([helper.make_attribute("order_Y", 1)])

        fused_node.domain = "com.microsoft"

        self.nodes_to_remove.extend(subgraph_nodes)
        self.nodes_to_add.append(fused_node)
        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name