# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from argparse import ArgumentParser


class AttentionMaskFormat:
    # Build a 1D mask index (sequence length). It requires right-side padding! Recommended for BERT models to get the best performance.
    MaskIndexEnd = 0

    # For experiments only. Do not use it in production.
    MaskIndexEndAndStart = 1

    # Raw attention mask where 0 means padding (or no attention) and 1 otherwise.
    AttentionMask = 2

    # No attention mask
    NoMask = 3


class FusionOptions:
    """Options of fusion in graph optimization"""

    def __init__(self, model_type):
        self.enable_gelu = True
        self.enable_layer_norm = True
        self.enable_attention = True

        # Use MultiHeadAttention instead of the Attention operator. The differences:
        # (1) Attention has merged weights for the Q/K/V projections, which might be faster in some cases since
        #     three MatMuls are merged into one.
        # (2) Attention can only handle self attention; MultiHeadAttention can handle both self and cross attention.
        # (3) MultiHeadAttention has only a CUDA implementation right now.
        self.use_multi_head_attention = False

        self.enable_skip_layer_norm = True
        self.enable_embed_layer_norm = True
        self.enable_bias_skip_layer_norm = True
        self.enable_bias_gelu = True
        self.enable_gelu_approximation = False
        self.enable_qordered_matmul = True
        self.enable_shape_inference = True
        self.enable_gemm_fast_gelu = False

        # Default to the mask-index (sequence length) format for BERT models so that fused attention can be used for speed-up.
        # Note that embed layer normalization will convert a 2D mask to 1D when the mask type is MaskIndexEnd.
        self.attention_mask_format = (
            AttentionMaskFormat.MaskIndexEnd if model_type == "bert" else AttentionMaskFormat.AttentionMask
        )

        # Options for stable diffusion
        self.enable_group_norm = model_type == "unet"
        self.enable_bias_splitgelu = model_type == "unet"
        self.enable_packed_kv = model_type == "unet"

    def use_raw_attention_mask(self, use_raw_mask=True):
        if use_raw_mask:
            self.attention_mask_format = AttentionMaskFormat.AttentionMask
        else:
            self.attention_mask_format = AttentionMaskFormat.MaskIndexEnd

    def disable_attention_mask(self):
        self.attention_mask_format = AttentionMaskFormat.NoMask
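
    # Example (a minimal sketch, not part of the original module): constructing
    # the options programmatically instead of from command-line flags. For
    # model_type "bert" the default attention_mask_format is MaskIndexEnd;
    # use_raw_attention_mask() switches to the raw 2D mask (for inputs that are
    # not right-side padded), and disable_attention_mask() drops the mask.
    #
    #   options = FusionOptions("bert")
    #   options.use_raw_attention_mask()   # -> AttentionMaskFormat.AttentionMask
    #   options.disable_attention_mask()   # -> AttentionMaskFormat.NoMask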

    @staticmethod
    def parse(args):
        options = FusionOptions(args.model_type)
        if args.disable_gelu:
            options.enable_gelu = False
        if args.disable_layer_norm:
            options.enable_layer_norm = False
        if args.disable_attention:
            options.enable_attention = False
        if args.use_multi_head_attention:
            options.use_multi_head_attention = True
        if args.disable_skip_layer_norm:
            options.enable_skip_layer_norm = False
        if args.disable_embed_layer_norm:
            options.enable_embed_layer_norm = False
        if args.disable_bias_skip_layer_norm:
            options.enable_bias_skip_layer_norm = False
        if args.disable_bias_gelu:
            options.enable_bias_gelu = False
        if args.enable_gelu_approximation:
            options.enable_gelu_approximation = True
        if args.disable_shape_inference:
            options.enable_shape_inference = False
        if args.enable_gemm_fast_gelu:
            options.enable_gemm_fast_gelu = True
        if args.use_mask_index:
            options.use_raw_attention_mask(False)
        if args.use_raw_attention_mask:
            options.use_raw_attention_mask(True)
        if args.no_attention_mask:
            options.disable_attention_mask()
        if args.disable_group_norm:
            options.enable_group_norm = False
        if args.disable_packed_kv:
            options.enable_packed_kv = False
        return options
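
    # Example (illustration only, not part of the original module): with the
    # flags registered by add_arguments below, a command line such as
    #   --model_type bert --use_raw_attention_mask --enable_gelu_approximation
    # parsed and passed to parse() yields options with
    # attention_mask_format == AttentionMaskFormat.AttentionMask and
    # enable_gelu_approximation == True. Note that --model_type itself is
    # expected to be added by the calling script, not by add_arguments.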

    @staticmethod
    def add_arguments(parser: ArgumentParser):
        parser.add_argument(
            "--disable_attention",
            required=False,
            action="store_true",
            help="disable Attention fusion",
        )
        parser.set_defaults(disable_attention=False)

        parser.add_argument(
            "--disable_skip_layer_norm",
            required=False,
            action="store_true",
            help="disable SkipLayerNormalization fusion",
        )
        parser.set_defaults(disable_skip_layer_norm=False)

        parser.add_argument(
            "--disable_embed_layer_norm",
            required=False,
            action="store_true",
            help="disable EmbedLayerNormalization fusion",
        )
        parser.set_defaults(disable_embed_layer_norm=False)

        parser.add_argument(
            "--disable_bias_skip_layer_norm",
            required=False,
            action="store_true",
            help="disable Add Bias and SkipLayerNormalization fusion",
        )
        parser.set_defaults(disable_bias_skip_layer_norm=False)

        parser.add_argument(
            "--disable_bias_gelu",
            required=False,
            action="store_true",
            help="disable Add Bias and Gelu/FastGelu fusion",
        )
        parser.set_defaults(disable_bias_gelu=False)

        parser.add_argument(
            "--disable_layer_norm",
            required=False,
            action="store_true",
            help="disable LayerNormalization fusion",
        )
        parser.set_defaults(disable_layer_norm=False)

        parser.add_argument(
            "--disable_gelu",
            required=False,
            action="store_true",
            help="disable Gelu fusion",
        )
        parser.set_defaults(disable_gelu=False)

        parser.add_argument(
            "--enable_gelu_approximation",
            required=False,
            action="store_true",
            help="enable Gelu/BiasGelu to FastGelu conversion",
        )
        parser.set_defaults(enable_gelu_approximation=False)

        parser.add_argument(
            "--disable_shape_inference",
            required=False,
            action="store_true",
            help="disable symbolic shape inference",
        )
        parser.set_defaults(disable_shape_inference=False)

        parser.add_argument(
            "--enable_gemm_fast_gelu",
            required=False,
            action="store_true",
            help="enable GemmFastGelu fusion",
        )
        parser.set_defaults(enable_gemm_fast_gelu=False)

        parser.add_argument(
            "--use_mask_index",
            required=False,
            action="store_true",
            help="use mask index to activate fused attention to speed up. It requires right-side padding!",
        )
        parser.set_defaults(use_mask_index=False)

        parser.add_argument(
            "--use_raw_attention_mask",
            required=False,
            action="store_true",
            help="use raw attention mask. Use this option if your input is not right-side padded. "
            "This might deactivate fused attention and result in worse performance.",
        )
        parser.set_defaults(use_raw_attention_mask=False)

        parser.add_argument(
            "--no_attention_mask",
            required=False,
            action="store_true",
            help="no attention mask. Only works for model_type=bert",
        )
        parser.set_defaults(no_attention_mask=False)

        parser.add_argument(
            "--use_multi_head_attention",
            required=False,
            action="store_true",
            help="Use MultiHeadAttention instead of the Attention operator for testing purposes. "
            "Note that MultiHeadAttention might be slower than Attention since the MatMul of the input projection is excluded from fusion. "
            "MultiHeadAttention has only a CUDA implementation, so the model can only run with the CUDA execution provider.",
        )
        parser.set_defaults(use_multi_head_attention=False)

        parser.add_argument(
            "--disable_group_norm",
            required=False,
            action="store_true",
            help="do not fuse GroupNorm. Only works for model_type=unet",
        )
        parser.set_defaults(disable_group_norm=False)

        parser.add_argument(
            "--disable_packed_kv",
            required=False,
            action="store_true",
            help="do not use packed KV in cross attention. Only works for model_type=unet",
        )
        parser.set_defaults(disable_packed_kv=False)
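

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# It shows how the two static methods are typically combined: register the
# fusion flags on an ArgumentParser, parse the command line, and turn the
# result into a FusionOptions instance. The --model_type argument is normally
# supplied by the calling script; it is added here only so that this sketch
# is self-contained.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = ArgumentParser(description="FusionOptions usage sketch")
    parser.add_argument("--model_type", required=False, type=str, default="bert")
    FusionOptions.add_arguments(parser)
    args = parser.parse_args()

    options = FusionOptions.parse(args)
    print("attention_mask_format:", options.attention_mask_format)
    print("use_multi_head_attention:", options.use_multi_head_attention)
    print("enable_gelu_approximation:", options.enable_gelu_approximation)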