# MTtranslateService/Lib/site-packages/onnxruntime/transformers/onnx_model_bert_keras.py

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.  All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

import argparse
import logging
import sys
from collections import deque

import numpy as np
import onnx
from onnx import ModelProto, TensorProto, numpy_helper
from onnx_model_bert_tf import BertOnnxModelTF

logger = logging.getLogger(__name__)


class BertOnnxModelKeras(BertOnnxModelTF):
    def __init__(self, model, num_heads, hidden_size):
        """Initialize the Keras BERT model wrapper.

        All three arguments are forwarded unchanged to the base-class
        initializer; this subclass keeps no extra state of its own here.

        Args:
            model: forwarded to the base initializer.
            num_heads: forwarded to the base initializer.
            hidden_size: forwarded to the base initializer.
        """
        super().__init__(model, num_heads, hidden_size)

    def match_mask_path(self, add_or_sub_before_softmax):
        """Find the mask subgraph that feeds the node sitting before Softmax.

        Three parent-path patterns are tried in a fixed order; the first one
        that matches wins.

        Args:
            add_or_sub_before_softmax: node whose parents are searched.

        Returns:
            Whatever ``match_parent_path`` returned for the first matching
            pattern, or ``None`` when none of the patterns match.
        """
        # (op types, input indices) pairs, in priority order.
        candidate_patterns = (
            (["Mul", "Sub", "Reshape", "Cast"], [1, None, 1, 0]),
            (["Mul", "Sub", "Cast", "Slice", "Unsqueeze"], [1, 1, 1, 0, 0]),
            (["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [1, None, 1, 0, 0]),
        )

        for op_types, input_indices in candidate_patterns:
            matched = self.match_parent_path(add_or_sub_before_softmax, op_types, input_indices)
            if matched is not None:
                return matched

        return None

    def check_attention_input(self, matmul_q, matmul_k, matmul_v, parent, output_name_to_node):
        """Check that the Q, K and V MatMul nodes are all fed from ``parent``.

        The first input of each MatMul must be produced either by ``parent``
        itself, or by a Reshape node whose first input is ``parent``'s first
        output.

        Args:
            matmul_q: MatMul node of the query projection.
            matmul_k: MatMul node of the key projection.
            matmul_v: MatMul node of the value projection.
            parent: node expected to be the common root of the three MatMuls.
            output_name_to_node: mapping from an output name to the node that
                produces it.

        Returns:
            ``(True, reshape_nodes)`` when every MatMul is rooted at ``parent``,
            where ``reshape_nodes`` lists the intermediate Reshape nodes found;
            ``(False, [])`` otherwise.
        """
        reshape_nodes = []

        for matmul in (matmul_q, matmul_k, matmul_v):
            root_input = matmul.input[0]
            # A name that no node produces has no entry in the mapping; report
            # that as a mismatch instead of failing with KeyError.
            root_node = output_name_to_node.get(root_input)
            if root_node is not None:
                if root_node == parent:
                    continue
                if root_node.op_type == "Reshape" and root_node.input[0] == parent.output[0]:
                    reshape_nodes.append(root_node)
                    continue
            logger.debug(f"Check attention input failed:{root_input}, {parent.output[0]}")
            return False, []

        return True, reshape_nodes

    def fuse_attention(self):
        """Replace matched self-attention subgraphs with fused Attention nodes.

        For every SkipLayerNormalization node, the method locates the attention
        root (a SkipLayerNormalization or EmbedLayerNormalization parent), then
        matches the parent paths of the attention output, V, QK, Q, K and the
        mask. When the Q/K/V MatMuls all hang off the root, an Attention node is
        created through ``self.attention_fusion`` and the matched nodes are
        queued for removal. A candidate that fails any step is skipped with a
        debug log message.

        After the loop the queued nodes are removed, the graph is updated and
        the number of fused nodes is logged.
        """
        output_name_to_node = self.output_name_to_node()
        root_op_types = ("SkipLayerNormalization", "EmbedLayerNormalization")

        nodes_to_remove = []
        attention_count = 0

        for normalize_node in self.get_nodes_by_op_type("SkipLayerNormalization"):
            # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
            parent = self.get_parent(normalize_node, 0)
            if parent is None or parent.op_type not in root_op_types:
                # Input 0 may have no producing node at all, so test for None
                # before reading op_type. When input 0 is an Add, the root is
                # looked up on input 1 instead.
                if parent is not None and parent.op_type == "Add":
                    parent = self.get_parent(normalize_node, 1)
                if parent is None or parent.op_type not in root_op_types:
                    logger.debug(
                        "First input for skiplayernorm: {}".format(parent.op_type if parent is not None else None)
                    )
                    continue
            # TODO: shall we add back the checking of children op types.

            qkv_nodes = self.match_parent_path(
                normalize_node,
                ["Add", "Reshape", "MatMul", "Reshape", "Transpose", "MatMul"],
                [None, 0, 0, 0, 0, 0],
            )
            if qkv_nodes is None:
                logger.debug("Failed to match qkv nodes")
                continue
            (
                add,
                extra_reshape_0,
                matmul,
                reshape_qkv,
                transpose_qkv,
                matmul_qkv,
            ) = qkv_nodes
            logger.debug("Matched qkv nodes")

            v_nodes = self.match_parent_path(
                matmul_qkv,
                ["Transpose", "Reshape", "Add", "Reshape", "MatMul"],
                [1, 0, 0, 0, 0],
            )
            if v_nodes is None:
                logger.debug("Failed to match v path")
                continue
            add_v, matmul_v = v_nodes[2], v_nodes[4]

            # Two QK layouts are supported. With Softmax <- Sub <- MatMul the Q
            # path carries an extra leading Mul; with Softmax <- Add <- Mul/Div
            # <- MatMul it does not.
            qk_nodes = self.match_parent_path(matmul_qkv, ["Softmax", "Sub", "MatMul"], [0, 0, 0])
            if qk_nodes is not None:
                q_op_types = ["Mul", "Transpose", "Reshape", "Add", "Reshape", "MatMul"]
                q_input_indices = [0, None, 0, 0, 0, 0]
            else:
                qk_nodes = self.match_parent_path(matmul_qkv, ["Softmax", "Add", "Mul", "MatMul"], [0, 0, 0, None])
                if qk_nodes is None:
                    qk_nodes = self.match_parent_path(matmul_qkv, ["Softmax", "Add", "Div", "MatMul"], [0, 0, 0, None])
                    if qk_nodes is None:
                        logger.debug("Failed to match qk path")
                        continue
                q_op_types = ["Transpose", "Reshape", "Add", "Reshape", "MatMul"]
                q_input_indices = [0, 0, 0, 0, 0]

            # The QK MatMul is the last node of every QK pattern.
            matmul_qk = qk_nodes[-1]

            q_nodes = self.match_parent_path(matmul_qk, q_op_types, q_input_indices)
            if q_nodes is None:
                logger.debug("Failed to match q path")
                continue
            # Both Q patterns end with Add, Reshape, MatMul.
            add_q, matmul_q = q_nodes[-3], q_nodes[-1]

            k_nodes = self.match_parent_path(
                matmul_qk,
                ["Transpose", "Reshape", "Add", "Reshape", "MatMul"],
                [1, 0, 0, 0, 0],
            )
            if k_nodes is None:
                logger.debug("Failed to match k path")
                continue
            add_k, matmul_k = k_nodes[2], k_nodes[4]

            # qk_nodes[1] is the Sub/Add node right before Softmax.
            mask_nodes = self.match_mask_path(qk_nodes[1])
            if mask_nodes is None:
                logger.debug("Failed to match mask path")
                continue
            if not self.has_constant_input(mask_nodes[1], 1):
                logger.debug("Sub node expected to have an input with constant value 1.0.")
                continue

            is_same_root, reshape_nodes = self.check_attention_input(
                matmul_q, matmul_k, matmul_v, parent, output_name_to_node
            )
            if not is_same_root:
                logger.debug("Root node not matched.")
                continue

            mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])
            logger.debug("Create an Attention node.")
            attention_node = self.attention_fusion.create_attention_node(
                mask_index,
                matmul_q,
                matmul_k,
                matmul_v,
                add_q,
                add_k,
                add_v,
                self.num_heads,
                self.hidden_size,
                parent.output[0],
                reshape_qkv.output[0],
                None,
            )
            if attention_node is None:
                continue

            self.add_node(attention_node)
            attention_count += 1

            nodes_to_remove.extend([reshape_qkv, transpose_qkv, matmul_qkv])
            nodes_to_remove.extend(qk_nodes)
            nodes_to_remove.extend(q_nodes)
            nodes_to_remove.extend(k_nodes)
            nodes_to_remove.extend(v_nodes)
            nodes_to_remove.extend(mask_nodes)
            nodes_to_remove.extend(reshape_nodes)
            nodes_to_remove.append(extra_reshape_0)
            # Bypass the extra Reshape between the output MatMul and the Add.
            self.replace_node_input(add, extra_reshape_0.output[0], matmul.output[0])

        self.remove_nodes(nodes_to_remove)
        self.update_graph()
        logger.info(f"Fused Attention count:{attention_count}")

    def preprocess(self):
        """Run the pre-fusion passes in order: embedding, mask, Reshape skipping."""
        stages = (self.process_embedding, self.fuse_mask, self.skip_reshape)
        for run_stage in stages:
            run_stage()

    def skip_reshape(self):
        """Bypass the first Reshape of every Reshape -> Reshape pair.

        Each Reshape whose first input is produced by another Reshape is
        rewired to read that producer's own first input. The bypassed producer
        is not removed here. The number of rewired nodes is logged when it is
        non-zero.
        """
        count = 0
        for reshape_node in self.get_nodes_by_op_type("Reshape"):
            parent = self.get_parent(reshape_node, 0)
            if parent is not None and parent.op_type == "Reshape":
                reshape_node.input[0] = parent.input[0]
                count += 1

        if count > 0:
            logger.info(f"Skip consequent Reshape count: {count}")

    def fuse_embedding(self, node, output_name_to_node):
        """Try to fuse the embedding subgraph that ends at a LayerNormalization node.

        The word, position and segment embedding tables are located by walking
        the parents of ``node``. The word and segment tables must be 2-D
        initializers. The position table is either a ``(1, rows, cols)``
        initializer added directly (re-registered as a 2-D initializer named
        ``"position_embedding"``) or a 2-D initializer read through a
        Gather <- Slice path.

        Args:
            node: the LayerNormalization node closing the embedding subgraph.
            output_name_to_node: mapping passed through to the parent lookups.

        Returns:
            ``True`` when all three tables were found and
            ``create_embedding_subgraph`` was called, ``False`` otherwise.
        """
        assert node.op_type == "LayerNormalization"
        logger.debug(f"start fusing embedding from node with output={node.output[0]}...")
        word_embed_path = self.match_parent_path(node, ["Add", "Add", "Gather"], [0, 0, 0], output_name_to_node)
        if word_embed_path is None:
            logger.debug("failed to match word_embed_path")
            return False

        skip_node, add_node, gather_node = word_embed_path

        # --- word embedding -------------------------------------------------
        word_initializer = self.get_initializer(gather_node.input[0])
        if word_initializer is None:
            logger.debug("failed to get word initializer")
            return False

        word_shape = numpy_helper.to_array(word_initializer).shape
        if len(word_shape) != 2:
            logger.info(f"Failed to find word embedding. name:{word_initializer.name}, shape:{word_shape}")
            return False
        logger.info(f"Found word embedding. name:{word_initializer.name}, shape:{word_shape}")
        word_embedding = word_initializer.name

        # --- position embedding ---------------------------------------------
        pos_initializer = self.get_initializer(add_node.input[1])
        if pos_initializer is not None:
            # The table is added directly: expect a leading axis of size 1 and
            # drop it so the table becomes 2-D.
            pos_array = numpy_helper.to_array(pos_initializer)
            if len(pos_array.shape) != 3 or pos_array.shape[0] != 1:
                logger.info(
                    f"Failed to find position embedding. name:{pos_initializer.name}, shape:{pos_array.shape}"
                )
                return False
            tensor = numpy_helper.from_array(
                pos_array.reshape((pos_array.shape[1], pos_array.shape[2])), "position_embedding"
            )
            self.add_initializer(tensor)
            logger.info(f"Found position embedding. name:{pos_initializer.name}, shape:{pos_array.shape[1:]}")
            position_embedding = "position_embedding"
        else:
            pos_embed_path = self.match_parent_path(add_node, ["Gather", "Slice"], [1, 1], output_name_to_node)
            if pos_embed_path is None:
                logger.debug("failed to match pos_embed_path")
                return False

            pos_gather = pos_embed_path[0]
            pos_initializer = self.get_initializer(pos_gather.input[0])
            if pos_initializer is None:
                logger.debug("failed to get pos initializer")
                return False

            pos_shape = numpy_helper.to_array(pos_initializer).shape
            if len(pos_shape) != 2:
                logger.info(f"Failed to find position embedding. name:{pos_initializer.name}, shape:{pos_shape}")
                return False
            # This branch handles the position table, so the message names it
            # as such (it previously said "word embedding").
            logger.info(f"Found position embedding. name:{pos_initializer.name}, shape:{pos_shape}")
            position_embedding = pos_initializer.name

        # --- segment embedding ----------------------------------------------
        gather = self.get_parent(skip_node, 1, output_name_to_node)
        if gather is None or gather.op_type != "Gather":
            logger.debug("failed to get gather")
            return False

        segment_initializer = self.get_initializer(gather.input[0])
        if segment_initializer is None:
            logger.debug("failed to get segment initializer")
            return False

        segment_shape = numpy_helper.to_array(segment_initializer).shape
        if len(segment_shape) != 2:
            logger.info(f"Failed to find segment embedding. name:{segment_initializer.name}, shape:{segment_shape}")
            return False
        logger.info(f"Found segment embedding. name:{segment_initializer.name}, shape:{segment_shape}")
        segment_embedding = segment_initializer.name

        logger.info("Create Embedding node")
        self.create_embedding_subgraph(node, word_embedding, segment_embedding, position_embedding)
        return True

    def process_embedding(self):
        """
        Attempt embedding fusion on the first LayerNormalization node of the graph.

        Only the first such node is considered; whether or not the fusion
        succeeds, no further nodes are tried.
        """
        logger.info("start processing embedding layer...")
        producers = self.output_name_to_node()
        first_layer_norm = next(
            (candidate for candidate in self.nodes() if candidate.op_type == "LayerNormalization"),
            None,
        )
        if first_layer_norm is not None:
            self.fuse_embedding(first_layer_norm, producers)

    def fuse_mask(self):
        """Rewrite the mask path Unsqueeze -> Slice -> Cast feeding ``Sub -> Mul(-10000)``.

        For each Mul with a constant input of -10000 whose parents match
        Sub <- Cast <- Slice <- Unsqueeze, and whose Unsqueeze reads the first
        mask input, the Sub is rewired to read a new
        Unsqueeze -> Unsqueeze -> Cast chain built directly on the mask input.
        The original Slice, Unsqueeze and Cast are then removed and the graph is
        pruned.

        The replacement chain has fixed node and output names, so it is created
        only once and shared by every matching Sub; creating it per match would
        put several nodes with the same output names into the graph.
        """
        cast_output_name = "mask_fuse_cast_output"
        nodes_to_remove = []
        replacement_added = False

        for node in self.nodes():
            if node.op_type != "Mul" or not self.has_constant_input(node, -10000):
                continue

            mask_path = self.match_parent_path(node, ["Sub", "Cast", "Slice", "Unsqueeze"], [0, 1, 0, 0])
            if mask_path is None:
                continue
            sub_node, cast_node, slice_node, unsqueeze_node = mask_path

            mask_input_name = self.attention_mask.get_first_mask()
            if unsqueeze_node.input[0] != mask_input_name:
                # Reported through the module logger like every other mismatch in this class.
                logger.debug(f"Cast input {unsqueeze_node.input[0]} is not mask input {mask_input_name}")
                continue

            if not replacement_added:
                # NOTE(review): "axes" is passed as an attribute here; newer ONNX
                # opsets take Unsqueeze axes as an input -- confirm the opset of
                # the models this pass is run on.
                unsqueeze_added_1 = onnx.helper.make_node(
                    "Unsqueeze",
                    inputs=[mask_input_name],
                    outputs=["mask_fuse_unsqueeze1_output"],
                    name="Mask_UnSqueeze_1",
                    axes=[1],
                )
                unsqueeze_added_2 = onnx.helper.make_node(
                    "Unsqueeze",
                    inputs=["mask_fuse_unsqueeze1_output"],
                    outputs=["mask_fuse_unsqueeze2_output"],
                    name="Mask_UnSqueeze_2",
                    axes=[2],
                )
                # "to" is the same value as before (1), spelled by its enum name.
                cast_node_2 = onnx.helper.make_node(
                    "Cast",
                    inputs=["mask_fuse_unsqueeze2_output"],
                    outputs=[cast_output_name],
                    to=onnx.TensorProto.FLOAT,
                )
                self.add_node(unsqueeze_added_1)
                self.add_node(unsqueeze_added_2)
                self.add_node(cast_node_2)
                replacement_added = True

            self.replace_node_input(sub_node, sub_node.input[1], cast_output_name)

            # Different matches may share nodes; queue each node only once.
            for stale_node in (slice_node, unsqueeze_node, cast_node):
                if stale_node not in nodes_to_remove:
                    nodes_to_remove.append(stale_node)

        self.remove_nodes(nodes_to_remove)

        # Prune graph is done after removing nodes to remove island nodes.
        if len(nodes_to_remove) > 0:
            self.prune_graph()

        logger.info("Fused mask" if len(nodes_to_remove) > 0 else "Failed to fuse mask")

    def remove_extra_reshape(self):
        """Strip the three Reshape nodes between two SkipLayerNormalization nodes.

        The matched chain, walking up from a SkipLayerNormalization node, is
        Add <- Reshape <- MatMul <- Reshape <- Gelu <- Add <- Reshape <- MatMul
        <- SkipLayerNormalization, always following input 0. Every Reshape in it
        is bypassed by pointing its consumer's first input at the Reshape's
        producer, then removed.

        Returns:
            Total number of Reshape nodes removed.
        """
        chain_op_types = (
            "Add",
            "Reshape",
            "MatMul",
            "Reshape",
            "Gelu",
            "Add",
            "Reshape",
            "MatMul",
            "SkipLayerNormalization",
        )

        removed_total = 0
        for sln_node in self.get_nodes_by_op_type("SkipLayerNormalization"):
            matched = self.match_parent_path(sln_node, list(chain_op_types), [0] * len(chain_op_types))
            if matched is None:
                continue

            add_out, reshape_out, matmul_out, reshape_mid, gelu_node, add_in, reshape_in, matmul_in, _ = matched

            # (consumer, new producer, Reshape being bypassed), deepest first.
            rewiring = (
                (add_in, matmul_in, reshape_in),
                (matmul_out, gelu_node, reshape_mid),
                (add_out, matmul_out, reshape_out),
            )
            for consumer, producer, bypassed in rewiring:
                consumer.input[0] = producer.output[0]
                self.remove_node(bypassed)

            removed_total += len(rewiring)

        return removed_total

    def remove_extra_reshape_2(self):
        """Strip the four Reshape nodes between two SkipLayerNormalization nodes.

        Same idea as the three-Reshape variant, for a chain with one more
        Reshape directly after the upstream SkipLayerNormalization:
        Add <- Reshape <- MatMul <- Reshape <- Gelu <- Add <- Reshape <- MatMul
        <- Reshape <- SkipLayerNormalization. The first Add may sit on any input
        of the starting node; every later hop follows input 0.

        Returns:
            Total number of Reshape nodes removed.
        """
        chain_op_types = (
            "Add",
            "Reshape",
            "MatMul",
            "Reshape",
            "Gelu",
            "Add",
            "Reshape",
            "MatMul",
            "Reshape",
            "SkipLayerNormalization",
        )

        removed_total = 0
        for sln_node in self.get_nodes_by_op_type("SkipLayerNormalization"):
            input_indices = [None] + [0] * (len(chain_op_types) - 1)
            matched = self.match_parent_path(sln_node, list(chain_op_types), input_indices)
            if matched is None:
                continue

            (
                add_out,
                reshape_out,
                matmul_out,
                reshape_mid,
                gelu_node,
                add_in,
                reshape_in,
                matmul_in,
                reshape_root,
                upstream_sln,
            ) = matched

            # (consumer, new producer, Reshape being bypassed), deepest first.
            rewiring = (
                (matmul_in, upstream_sln, reshape_root),
                (add_in, matmul_in, reshape_in),
                (matmul_out, gelu_node, reshape_mid),
                (add_out, matmul_out, reshape_out),
            )
            for consumer, producer, bypassed in rewiring:
                consumer.input[0] = producer.output[0]
                self.remove_node(bypassed)

            removed_total += len(rewiring)

        return removed_total

    def postprocess(self):
        """Remove leftover Reshape nodes with both cleanup passes, then prune the graph."""
        removed_by_first_pass = self.remove_extra_reshape()
        removed_by_second_pass = self.remove_extra_reshape_2()
        reshape_removed = removed_by_first_pass + removed_by_second_pass
        logger.info(f"Remove {reshape_removed} Reshape nodes.")

        self.prune_graph()