# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
# Modifications: keep_io_types can be a list of names; convert initializers if needed to preserve precision; add force_fp16_initializers option.

import itertools
import logging
from typing import Dict, List

import numpy as np
import onnx
from onnx import helper, numpy_helper
from onnx import onnx_pb as onnx_proto
from packaging import version

logger = logging.getLogger(__name__)

def _npfloat16_to_int(np_list):
    """
    Convert numpy float16 to python int.

    :param np_list: numpy float16 list
    :return int_list: python int list
    """
    return [int(bin(_.view("H"))[2:].zfill(16), 2) for _ in np_list]

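# Example (an illustrative sketch, not part of the original module): the helper
# reinterprets each float16 value as its underlying 16-bit pattern. The float16
# value 1.0 has bit pattern 0x3C00, so it maps to the integer 15360.
#
#     _npfloat16_to_int(np.array([1.0], dtype=np.float16))  # -> [15360]
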
def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0):
    """
    Convert float32 numpy array to float16 without changing sign or finiteness.
    Positive values less than min_positive_val are mapped to min_positive_val.
    Positive finite values greater than max_finite_val are mapped to max_finite_val.
    Similar for negative values. NaN, 0, inf, and -inf are unchanged.
    """

    def between(a, b, c):
        return np.logical_and(a < b, b < c)

    np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
    np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
    np_array = np.where(between(max_finite_val, np_array, float("inf")), max_finite_val, np_array)
    np_array = np.where(between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array)
    return np.float16(np_array)

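# Example (an illustrative sketch, not part of the original module): the clamping
# keeps tiny magnitudes from flushing to zero and large magnitudes from
# overflowing to inf when cast to float16.
#
#     x = np.array([1e-10, 1.0, 1e6, -1e6], dtype=np.float32)
#     convert_np_to_float16(x)
#     # -> float16 values approximately [5.96e-08, 1.0, 65504.0, -65504.0]
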
def convert_tensor_float_to_float16(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0):
    """Convert tensor float to float16.

    Args:
        tensor (TensorProto): the tensor to convert.
        min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
        max_finite_val (float, optional): maximal finite value. Defaults to 65504.0.

    Raises:
        ValueError: input type is not TensorProto.

    Returns:
        TensorProto: the converted tensor.
    """

    if not isinstance(tensor, onnx_proto.TensorProto):
        raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))

    if tensor.data_type == onnx_proto.TensorProto.FLOAT:
        tensor.data_type = onnx_proto.TensorProto.FLOAT16
        # convert float_data (float type) to float16 and write to int32_data
        if tensor.float_data:
            float16_data = convert_np_to_float16(np.array(tensor.float_data), min_positive_val, max_finite_val)
            int_list = _npfloat16_to_int(float16_data)
            tensor.int32_data[:] = int_list
            tensor.float_data[:] = []
        # convert raw_data (bytes type)
        if tensor.raw_data:
            # convert tensor.raw_data to a float32 array
            float32_list = np.frombuffer(tensor.raw_data, dtype="float32")
            # convert float32 to float16
            float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val)
            # convert float16 to bytes and write back to raw_data
            tensor.raw_data = float16_list.tobytes()
    return tensor

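# Example (an illustrative sketch, not part of the original module): converting a
# single float32 initializer in place; "weight" is a made-up tensor name.
#
#     weight = numpy_helper.from_array(np.ones((2, 2), dtype=np.float32), name="weight")
#     weight_fp16 = convert_tensor_float_to_float16(weight)
#     assert weight_fp16.data_type == onnx_proto.TensorProto.FLOAT16
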
def make_value_info_from_tensor(tensor):
    shape = numpy_helper.to_array(tensor).shape
    return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape)

DEFAULT_OP_BLOCK_LIST = [
    "ArrayFeatureExtractor",
    "Binarizer",
    "CastMap",
    "CategoryMapper",
    "DictVectorizer",
    "FeatureVectorizer",
    "Imputer",
    "LabelEncoder",
    "LinearClassifier",
    "LinearRegressor",
    "Normalizer",
    "OneHotEncoder",
    "SVMClassifier",
    "SVMRegressor",
    "Scaler",
    "TreeEnsembleClassifier",
    "TreeEnsembleRegressor",
    "ZipMap",
    "NonMaxSuppression",
    "TopK",
    "RoiAlign",
    "Resize",
    "Range",
    "CumSum",
    "Min",
    "Max",
    "Upsample",
]

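# Example (an illustrative sketch, not part of the original module): callers who
# need additional op types kept in float32 can pass an extended list to
# convert_float_to_float16; "InstanceNormalization" below is only a placeholder.
#
#     my_block_list = [*DEFAULT_OP_BLOCK_LIST, "InstanceNormalization"]
#     # model_fp16 = convert_float_to_float16(model, op_block_list=my_block_list)
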
class InitializerTracker:
    """Class for keeping track of an initializer."""

    def __init__(self, initializer: onnx_proto.TensorProto):
        self.initializer = initializer
        self.fp32_nodes = []
        self.fp16_nodes = []

    def add_node(self, node: onnx_proto.NodeProto, is_node_blocked):
        if is_node_blocked:
            self.fp32_nodes.append(node)
        else:
            self.fp16_nodes.append(node)

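# Illustrative sketch (not part of the original module): the tracker records, for
# one float initializer, which consumer nodes stay in float32 (blocked) and which
# are converted to float16. The variable names below are hypothetical.
#
#     tracker = InitializerTracker(weight_tensor)
#     tracker.add_node(matmul_node, is_node_blocked=False)  # -> tracker.fp16_nodes
#     tracker.add_node(topk_node, is_node_blocked=True)     # -> tracker.fp32_nodes
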
def convert_float_to_float16(
    model,
    min_positive_val=5.96e-08,
    max_finite_val=65504.0,
    keep_io_types=False,
    disable_shape_infer=False,
    op_block_list=None,
    node_block_list=None,
    force_fp16_initializers=False,
):
    """Convert tensor float type in the ONNX ModelProto input to tensor float16.

    Args:
        model (ModelProto): The ONNX model to convert.
        min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
        max_finite_val (float, optional): maximal finite value of float16. Defaults to 65504.
        keep_io_types (Union[bool, List[str]], optional): Either a boolean or a list of float32 input/output names.
            If True, model inputs/outputs are kept as float32. Defaults to False.
        disable_shape_infer (bool, optional): Skip running onnx shape/type inference.
            Useful if shape inference has already been done. Defaults to False.
        op_block_list (List[str], optional): List of op types to leave as float32.
            Defaults to None, which uses `float16.DEFAULT_OP_BLOCK_LIST`.
        node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None.
        force_fp16_initializers (bool): force converting all float initializers to float16.
            Defaults to False, which converts only those needed to avoid precision loss.

    Raises:
        ValueError: input type is not ModelProto.

    Returns:
        ModelProto: converted model.
    """
    assert (
        min_positive_val >= 5.96e-08
    ), "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05"
    assert max_finite_val <= float(np.finfo(np.float16).max), "invalid max_finite_val. largest float16 value: 65504"

    func_infer_shape = None
    if not disable_shape_infer and version.parse(onnx.__version__) >= version.parse("1.2.0"):
        try:
            from onnx.shape_inference import infer_shapes

            func_infer_shape = infer_shapes
        finally:
            pass

    if not isinstance(model, onnx_proto.ModelProto):
        raise ValueError("Expected model type is an ONNX ModelProto but got %s" % type(model))

    # create blocklists
    if op_block_list is None:
        op_block_list = DEFAULT_OP_BLOCK_LIST
    if node_block_list is None:
        node_block_list = []
    op_block_list = set(op_block_list)
    node_block_list = set(node_block_list)

    logger.debug(
        f"fp16 parameters: min_positive_val={min_positive_val} max_finite_val={max_finite_val} keep_io_types={keep_io_types} disable_shape_infer={disable_shape_infer} op_block_list={op_block_list} node_block_list={node_block_list} force_fp16_initializers={force_fp16_initializers}"
    )
    # create a queue for BFS
    queue = []
    value_info_list = []
    node_list = []
    # type inference on input model
    if func_infer_shape is not None:
        model = func_infer_shape(model)
    queue.append(model)
    name_mapping = {}
    graph_io_to_skip = set()
    io_casts = set()

    fp32_inputs = [n.name for n in model.graph.input if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT]
    fp32_outputs = [n.name for n in model.graph.output if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT]
    if isinstance(keep_io_types, list):
        fp32_inputs = [n for n in fp32_inputs if n in keep_io_types]
        fp32_outputs = [n for n in fp32_outputs if n in keep_io_types]
    elif not keep_io_types:
        fp32_inputs = []
        fp32_outputs = []
    for i, n in enumerate(model.graph.input):
        if n.name in fp32_inputs:
            output_name = "graph_input_cast_" + str(i)
            name_mapping[n.name] = output_name
            graph_io_to_skip.add(n.name)

            node_name = "graph_input_cast" + str(i)
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(n)
            new_value_info.name = output_name
            new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
            # add a Cast node (from tensor(float) to tensor(float16)) after the graph input
            new_node = [helper.make_node("Cast", [n.name], [output_name], to=10, name=node_name)]
            model.graph.node.extend(new_node)
            value_info_list.append(new_value_info)
            io_casts.add(node_name)

    for i, n in enumerate(model.graph.output):
        if n.name in fp32_outputs:
            input_name = "graph_output_cast_" + str(i)
            name_mapping[n.name] = input_name
            graph_io_to_skip.add(n.name)

            node_name = "graph_output_cast" + str(i)
            # add a Cast node (from tensor(float16) to tensor(float)) before the graph output
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(n)
            new_value_info.name = input_name
            new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
            new_node = [helper.make_node("Cast", [input_name], [n.name], to=1, name=node_name)]
            model.graph.node.extend(new_node)
            value_info_list.append(new_value_info)
            io_casts.add(node_name)
    fp32_initializers: Dict[str, InitializerTracker] = {}
    while queue:
        next_level = []
        for q in queue:
            # if q is model, push q.graph (GraphProto)
            if isinstance(q, onnx_proto.ModelProto):
                next_level.append(q.graph)
            # if q is model.graph, push q.node.attribute (AttributeProto)
            if isinstance(q, onnx_proto.GraphProto):
                for n in q.initializer:  # TensorProto type
                    if n.data_type == onnx_proto.TensorProto.FLOAT:
                        assert n.name not in fp32_initializers
                        fp32_initializers[n.name] = InitializerTracker(n)

                for n in q.node:
                    # if n is in the block list (doesn't support float16), no conversion for the node,
                    # and save the node for further processing
                    if n.name in io_casts:
                        continue
                    for i in range(len(n.input)):
                        if n.input[i] in name_mapping:
                            n.input[i] = name_mapping[n.input[i]]
                    for i in range(len(n.output)):
                        if n.output[i] in name_mapping:
                            n.output[i] = name_mapping[n.output[i]]

                    is_node_blocked = n.op_type in op_block_list or n.name in node_block_list
                    for input in n.input:
                        if input in fp32_initializers:
                            fp32_initializers[input].add_node(n, is_node_blocked)

                    if is_node_blocked:
                        node_list.append(n)
                    else:
                        if n.op_type == "Cast":
                            for attr in n.attribute:
                                if attr.name == "to" and attr.i == 1:
                                    attr.i = 10
                                    break
                        for attr in n.attribute:
                            next_level.append(attr)
            # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto)
            # and process node.attribute.t and node.attribute.tensors (TensorProto)
            if isinstance(q, onnx_proto.AttributeProto):
                next_level.append(q.g)
                for n in q.graphs:
                    next_level.append(n)
                q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val))
                for n in q.tensors:
                    n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val)
            # if q is graph, process input, output and value_info (ValueInfoProto)
            if isinstance(q, onnx_proto.GraphProto):
                # Note that float initializers tracked by fp32_initializers will be processed later.
                # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to
                # tensor(float16) except map and seq(map). And save them in value_info_list for further processing
                for n in itertools.chain(q.input, q.output, q.value_info):
                    if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
                        if n.name not in graph_io_to_skip:
                            n.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
                            value_info_list.append(n)
                    if n.type.HasField("sequence_type"):
                        if n.type.sequence_type.elem_type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
                            if n.name not in graph_io_to_skip:
                                n.type.sequence_type.elem_type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
                                value_info_list.append(n)

        queue = next_level
    for key, value in fp32_initializers.items():
        # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes.
        if force_fp16_initializers or value.fp16_nodes:
            value.initializer = convert_tensor_float_to_float16(value.initializer, min_positive_val, max_finite_val)
            value_info_list.append(make_value_info_from_tensor(value.initializer))
            if value.fp32_nodes and not force_fp16_initializers:
                logger.info(
                    "initializer is used by both fp32 and fp16 nodes. Consider adding these nodes to the block list: {}".format(
                        value.fp16_nodes
                    )
                )
    # process the nodes in the block list that don't support tensor(float16)
    for node in node_list:
        # if an input name is in value_info_list, the input is tensor(float16) type;
        # insert a float16-to-float Cast node before the node,
        # change the current node's input name and create new value_info for the new name
        for i in range(len(node.input)):
            input = node.input[i]
            for value_info in value_info_list:
                if input == value_info.name:
                    # create new value_info for current node's new input name
                    new_value_info = model.graph.value_info.add()
                    new_value_info.CopyFrom(value_info)
                    output_name = node.name + "_input_cast_" + str(i)
                    new_value_info.name = output_name
                    new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT
                    # add a Cast node (from tensor(float16) to tensor(float)) before the current node
                    node_name = node.name + "_input_cast" + str(i)
                    new_node = [helper.make_node("Cast", [input], [output_name], to=1, name=node_name)]
                    model.graph.node.extend(new_node)
                    # change current node's input name
                    node.input[i] = output_name
                    break
        # if an output name is in value_info_list, the output is tensor(float16) type; insert a float-to-float16
        # Cast node after the node, change the current node's output name and create new value_info for the new name
        for i in range(len(node.output)):
            output = node.output[i]
            for value_info in value_info_list:
                if output == value_info.name:
                    # create new value_info for current node's new output
                    new_value_info = model.graph.value_info.add()
                    new_value_info.CopyFrom(value_info)
                    input_name = node.name + "_output_cast_" + str(i)
                    new_value_info.name = input_name
                    new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT
                    # add a Cast node (from tensor(float) to tensor(float16)) after the current node
                    node_name = node.name + "_output_cast" + str(i)
                    new_node = [helper.make_node("Cast", [input_name], [output], to=10, name=node_name)]
                    model.graph.node.extend(new_node)
                    # change current node's output name
                    node.output[i] = input_name
                    break
    return model

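# Example usage (an illustrative sketch, not part of the original module; the file
# paths and the input name are placeholders):
#
#     model = onnx.load("model.onnx")
#     model_fp16 = convert_float_to_float16(model, keep_io_types=True)
#     onnx.save(model_fp16, "model_fp16.onnx")
#
#     # keep_io_types may also be a list of specific float32 input/output names:
#     # model_fp16 = convert_float_to_float16(model, keep_io_types=["my_input"])
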
def float_to_float16_max_diff(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0):
    """Measure the maximum absolute difference after converting a float tensor to float16."""
    if not isinstance(tensor, onnx_proto.TensorProto):
        raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
    if tensor.data_type != onnx_proto.TensorProto.FLOAT:
        raise ValueError("Expected tensor data type is float.")

    float32_data = None
    if tensor.float_data:
        float32_data = np.array(tensor.float_data)

    if tensor.raw_data:
        float32_data = np.frombuffer(tensor.raw_data, dtype="float32")

    if float32_data is None:
        raise RuntimeError("external data not loaded!")

    float16_data = convert_np_to_float16(float32_data, min_positive_val, max_finite_val)
    return np.amax(np.abs(float32_data - np.float32(float16_data)))
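# Example (an illustrative sketch, not part of the original module): estimate the
# worst-case conversion error of each float initializer before deciding which
# nodes to add to node_block_list; assumes initializer data is stored in-model.
#
#     for initializer in model.graph.initializer:
#         if initializer.data_type == onnx_proto.TensorProto.FLOAT:
#             print(initializer.name, float_to_float16_max_diff(initializer))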