# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from logging import getLogger
from typing import Tuple

import numpy as np
from fusion_attention import AttentionMask
from fusion_base import Fusion
from fusion_utils import FusionUtils, NumpyHelper
from onnx import NodeProto, helper
from onnx_model import OnnxModel

logger = getLogger(__name__)

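# Fuses a quantized (QDQ) attention subgraph into a single
# com.microsoft::QOrderedAttention node, starting the match from a downstream
# QOrderedLayerNormalization node.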
class FusionQOrderedAttention(Fusion):
    def __init__(
        self,
        model: OnnxModel,
        hidden_size: int,
        num_heads: int,
        attention_mask: AttentionMask,
    ):
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.attention_mask = attention_mask

        # One-time warning flags read by get_num_heads_and_hidden_size();
        # initialize them here so the first read cannot raise AttributeError.
        self.num_heads_warning = True
        self.hidden_size_warning = True

        super().__init__(model, "QOrderedAttention", "QOrderedLayerNormalization")

    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
        """Detect num_heads and hidden_size from a reshape node.

        Args:
            reshape_q (NodeProto): reshape node for Q

        Returns:
            Tuple[int, int]: num_heads and hidden_size
        """
        # We assume that reshape fusion has already run, so the shape is a tensor like [0, 0, num_heads, head_size]
        q_shape = self.model.get_initializer(reshape_q.input[1])
        if q_shape is None:
            logger.debug(f"{reshape_q.input[1]} is not an initializer.")

            # Check if the second input to Reshape flows through a Constant node
            # TODO: Investigate why FusionAttention doesn't have such logic
            constant_node = self.model.match_parent_path(reshape_q, ["Constant"], [1])

            if constant_node is None:
                return self.num_heads, self.hidden_size  # Fall back to the user-specified values
            else:
                constant_node = constant_node[0]

                if len(constant_node.attribute) != 1:
                    return self.num_heads, self.hidden_size  # Fall back to the user-specified values

                # This assumes the attribute is a Tensor (a safe assumption for Constant nodes)
                q_shape = constant_node.attribute[0].t

        q_shape_value = NumpyHelper.to_array(q_shape)
        if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
            logger.debug(f"q_shape_value={q_shape_value}. Expected values are like [0, 0, num_heads, head_size].")
            return self.num_heads, self.hidden_size  # Fall back to the user-specified values

        num_heads = q_shape_value[2]
        head_size = q_shape_value[3]
        hidden_size = num_heads * head_size

        if self.num_heads > 0 and num_heads != self.num_heads:
            if self.num_heads_warning:
                logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")
                self.num_heads_warning = False  # Do not show the warning more than once

        if self.hidden_size > 0 and hidden_size != self.hidden_size:
            if self.hidden_size_warning:
                logger.warning(
                    f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
                )
                self.hidden_size_warning = False  # Do not show the warning more than once

        return num_heads, hidden_size

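    # fuse() walks upward from a QOrderedLayerNormalization node, matching the
    # quantized attention subgraph piece by piece: the input Q/DQ, the fused-QKV
    # output path, the Q, K and V projections, the Q*K^T -> Softmax -> *V chain,
    # and the attention-mask path. Only when every piece matches and every Q/DQ
    # node passes validation is the subgraph replaced by a single
    # QOrderedAttention node.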
    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
        add_before_layernorm = self.model.match_parent_path(
            normalize_node,
            ["QuantizeLinear", "Add"],
            [0, 0],
        )

        if add_before_layernorm is not None:
            start_node = add_before_layernorm[-1]
        else:
            return

        # Input QDQ nodes
        dequantize_input = self.model.match_parent_path(
            start_node,
            ["DequantizeLinear"],
            [None],
        )

        if dequantize_input is None:
            logger.debug("fuse_qordered_attention: failed to match input qdq nodes path")
            return

        dequantize_input = dequantize_input[-1]

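        # This DequantizeLinear provides the quantized attention input and its
        # scale; both become the leading inputs of the fused node built below.
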
        # QKV nodes
        qkv_nodes = self.model.match_parent_path(
            start_node,
            ["Add", "MatMul", "Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear", "MatMul"],
            [None, None, 0, 0, 0, 0, 0],
        )

        if qkv_nodes is None:
            logger.debug("fuse_qordered_attention: failed to match qkv path")
            return

        (_, projection_matmul, reshape_qkv, transpose_qkv, dequantize_qkv, quantize_qkv, matmul_qkv) = qkv_nodes

        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
        if not FusionUtils.check_qdq_node_for_fusion(quantize_qkv, self.model):
            return

        if not FusionUtils.check_qdq_node_for_fusion(dequantize_qkv, self.model):
            return

        # Identify the root input to the Attention node
        other_inputs = []
        for input in start_node.input:
            if input not in output_name_to_node:
                continue

            if input == qkv_nodes[0].output[0]:
                continue

            other_inputs.append(input)

        if len(other_inputs) != 1:
            return

        root_input = other_inputs[0]

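        # Match the V, Q and K projection paths. Each has the same shape:
        # Transpose <- Reshape <- DequantizeLinear <- QuantizeLinear <- Add (bias) <- MatMul (projection).
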
        # V nodes
        v_nodes = self.model.match_parent_path(
            matmul_qkv,
            ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
            [1, 0, 0, 0, 0, None],
        )

        if v_nodes is None:
            logger.debug("fuse_qordered_attention: failed to match v path")
            return

        (_, _, dequantize_v, quantize_v, add_v, matmul_v) = v_nodes

        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
        if not FusionUtils.check_qdq_node_for_fusion(quantize_v, self.model):
            return

        if not FusionUtils.check_qdq_node_for_fusion(dequantize_v, self.model):
            return

        # V MatMul weight
        dequantize_v_matmul_weight = self.model.match_parent_path(matmul_v, ["DequantizeLinear"], [1])

        if dequantize_v_matmul_weight is None:
            logger.debug("fuse_qordered_attention: failed to match v weight path")
            return

        dequantize_v_matmul_weight = dequantize_v_matmul_weight[0]

        if self.model.get_constant_value(dequantize_v_matmul_weight.input[0]) is None:
            return

        # Make sure the upstream DequantizeLinear has the proper zero points and scales
        # (per-channel scales are supported for weights alone)
        if not FusionUtils.check_qdq_node_for_fusion(dequantize_v_matmul_weight, self.model, False):
            return

        # QK nodes
        qk_nodes = self.model.match_parent_path(
            matmul_qkv,
            [
                "DequantizeLinear",
                "QuantizeLinear",
                "Softmax",
                "Add",
                "Div",
                "DequantizeLinear",
                "QuantizeLinear",
                "MatMul",
            ],
            [0, 0, 0, 0, None, 0, 0, 0],
        )

        if qk_nodes is None:
            logger.debug("fuse_qordered_attention: failed to match qk path")
            return

        (
            dequantize_qk_softmax,
            quantize_qk_softmax,
            softmax_qk,
            add_qk,
            div_qk,
            dequantize_qk,
            quantize_qk,
            matmul_qk,
        ) = qk_nodes

        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
        if not FusionUtils.check_qdq_node_for_fusion(quantize_qk_softmax, self.model):
            return

        if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk_softmax, self.model):
            return

        if not FusionUtils.check_qdq_node_for_fusion(quantize_qk, self.model):
            return

        if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk, self.model):
            return

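        # The Q path feeds input 0 of the Q*K^T MatMul and the K path feeds
        # input 1; both mirror the projection structure matched for V above.
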
        # Q nodes
        q_nodes = self.model.match_parent_path(
            matmul_qk,
            ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
            [0, 0, 0, 0, 0, None],
        )

        if q_nodes is None:
            logger.debug("fuse_qordered_attention: failed to match q path")
            return

        (_, reshape_q, dequantize_q, quantize_q, add_q, matmul_q) = q_nodes

        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
        if not FusionUtils.check_qdq_node_for_fusion(quantize_q, self.model):
            return

        if not FusionUtils.check_qdq_node_for_fusion(dequantize_q, self.model):
            return

        # Q MatMul weight
        dequantize_q_matmul_weight = self.model.match_parent_path(matmul_q, ["DequantizeLinear"], [1])

        if dequantize_q_matmul_weight is None:
            logger.debug("fuse_qordered_attention: failed to match q weight path")
            return

        dequantize_q_matmul_weight = dequantize_q_matmul_weight[0]

        if self.model.get_constant_value(dequantize_q_matmul_weight.input[0]) is None:
            return

        # Make sure the upstream DequantizeLinear has the proper zero points and scales
        # (per-channel scales are supported for weights alone)
        if not FusionUtils.check_qdq_node_for_fusion(dequantize_q_matmul_weight, self.model, False):
            return

        # K nodes
        k_nodes = self.model.match_parent_path(
            matmul_qk,
            ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
            [1, 0, 0, 0, 0, None],
        )

        if k_nodes is None:
            logger.debug("fuse_qordered_attention: failed to match k path")
            return

        (_, _, dequantize_k, quantize_k, add_k, matmul_k) = k_nodes

        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
        if not FusionUtils.check_qdq_node_for_fusion(quantize_k, self.model):
            return

        if not FusionUtils.check_qdq_node_for_fusion(dequantize_k, self.model):
            return

        # K MatMul weight
        dequantize_k_matmul_weight = self.model.match_parent_path(matmul_k, ["DequantizeLinear"], [1])

        if dequantize_k_matmul_weight is None:
            logger.debug("fuse_qordered_attention: failed to match k weight path")
            return

        dequantize_k_matmul_weight = dequantize_k_matmul_weight[0]

        if self.model.get_constant_value(dequantize_k_matmul_weight.input[0]) is None:
            return

        # Make sure the upstream DequantizeLinear has the proper zero points and scales
        # (per-channel scales are supported for weights alone)
        if not FusionUtils.check_qdq_node_for_fusion(dequantize_k_matmul_weight, self.model, False):
            return

        # Mask nodes
        mask_nodes = self.model.match_parent_path(
            add_qk, ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0]
        )

        if mask_nodes is None:
            logger.debug("fuse_qordered_attention: failed to match mask path")
            return

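        # The qkv_hidden_sizes attribute is derived from the output dimension of
        # each projection weight, so the Q, K and V sizes may differ.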
        # Ascertain `qkv_hidden_sizes` attribute value
        q_weight = self.model.get_initializer(dequantize_q_matmul_weight.input[0])
        k_weight = self.model.get_initializer(dequantize_k_matmul_weight.input[0])
        v_weight = self.model.get_initializer(dequantize_v_matmul_weight.input[0])

        qw = NumpyHelper.to_array(q_weight)
        kw = NumpyHelper.to_array(k_weight)
        vw = NumpyHelper.to_array(v_weight)

        qw_out_size = np.prod(qw.shape[1:])
        kw_out_size = np.prod(kw.shape[1:])
        vw_out_size = np.prod(vw.shape[1:])

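        # For example, a standard BERT-base graph has 768x768 projection
        # weights, so all three sizes come out as 768 (12 heads * 64).
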
        # Form QOrderedAttention node
        if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input:
            mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])

            # Ascertain `num_heads` and `hidden_size`
            num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q)

            # Formulate the inputs
            # Actual quantized input and its scale
            attention_inputs = [dequantize_input.input[0]]
            attention_inputs.append(dequantize_input.input[1])

            # Scales of the Q/K/V projection outputs
            attention_inputs.append(dequantize_q.input[1])
            attention_inputs.append(dequantize_k.input[1])
            attention_inputs.append(dequantize_v.input[1])

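            # Remaining inputs, in order: the three quantized projection
            # weights, their scales, the three biases, the Q*K^T scale, the
            # softmax output scale, the fused output scale and (optionally)
            # the mask index.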
            attention_inputs.append(dequantize_q_matmul_weight.input[0])
            attention_inputs.append(dequantize_k_matmul_weight.input[0])
            attention_inputs.append(dequantize_v_matmul_weight.input[0])

            attention_inputs.append(dequantize_q_matmul_weight.input[1])
            attention_inputs.append(dequantize_k_matmul_weight.input[1])
            attention_inputs.append(dequantize_v_matmul_weight.input[1])

            if self.model.get_initializer(add_q.input[0]):
                attention_inputs.append(add_q.input[0])
            else:  # second input is the constant bias
                attention_inputs.append(add_q.input[1])

            if self.model.get_initializer(add_k.input[0]):
                attention_inputs.append(add_k.input[0])
            else:  # second input is the constant bias
                attention_inputs.append(add_k.input[1])

            if self.model.get_initializer(add_v.input[0]):
                attention_inputs.append(add_v.input[0])
            else:  # second input is the constant bias
                attention_inputs.append(add_v.input[1])

            attention_inputs.append(quantize_qk.input[1])
            attention_inputs.append(quantize_qk_softmax.input[1])
            attention_inputs.append(dequantize_qkv.input[1])

            # Mask input
            if mask_index is not None:
                attention_inputs.append(mask_index)
            else:
                attention_inputs.append("")

            # The MatMul weight 'B' needs some post-processing:
            # transpose it from order ROW to order COL.
            # This offline transpose is needed only while using the CUDA EP.
            # TODO: Make this fusion logic EP-agnostic?
            q_weight_tensor = self.model.get_initializer(dequantize_q_matmul_weight.input[0])
            FusionUtils.transpose_2d_int8_tensor(q_weight_tensor)

            k_weight_tensor = self.model.get_initializer(dequantize_k_matmul_weight.input[0])
            FusionUtils.transpose_2d_int8_tensor(k_weight_tensor)

            v_weight_tensor = self.model.get_initializer(dequantize_v_matmul_weight.input[0])
            FusionUtils.transpose_2d_int8_tensor(v_weight_tensor)

            # Name and create the Attention node
            attention_node_name = self.model.create_node_name("QOrderedAttention")

            attention_node = helper.make_node(
                "QOrderedAttention",
                inputs=attention_inputs,
                outputs=[reshape_qkv.output[0]],
                name=attention_node_name,
            )

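            # Re-wire the downstream projection: the fused node's quantized
            # output flows through the retained DequantizeLinear and into the
            # projection MatMul.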
            self.model.replace_node_input(dequantize_qkv, dequantize_qkv.input[0], attention_node.output[0])
            self.model.replace_node_input(projection_matmul, projection_matmul.input[0], dequantize_qkv.output[0])

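            # The order_* attributes appear to follow cuBLASLt's ordering enum
            # (0 = COL, 1 = ROW): activations stay row-major, while the weights
            # were transposed to column-major above.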
            attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
            attention_node.attribute.extend([helper.make_attribute("order_input", 1)])
            attention_node.attribute.extend([helper.make_attribute("order_weight", 0)])
            attention_node.attribute.extend([helper.make_attribute("order_output", 1)])
            attention_node.attribute.extend(
                [helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])]
            )

            attention_node.domain = "com.microsoft"

            self.nodes_to_add.append(attention_node)
            self.node_name_to_graph_name[attention_node.name] = self.this_graph_name

            self.nodes_to_remove.extend([reshape_qkv, transpose_qkv, quantize_qkv, matmul_qkv])
            self.nodes_to_remove.extend(qk_nodes)
            self.nodes_to_remove.extend(q_nodes)
            self.nodes_to_remove.extend(k_nodes)
            self.nodes_to_remove.extend(v_nodes)
            self.nodes_to_remove.extend(
                [dequantize_q_matmul_weight, dequantize_k_matmul_weight, dequantize_v_matmul_weight]
            )

            # Use prune_graph to remove the mask nodes, since they are shared by all attention nodes.
            # self.nodes_to_remove.extend(mask_nodes)
            self.prune_graph = True