m2m model translation

import logging
import tempfile
from enum import Enum
from pathlib import Path

import numpy
import onnx
from onnx import external_data_helper
from onnx import onnx_pb as onnx_proto
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

__producer__ = "onnx.quantize"
__version__ = "0.1.0"
onnx_domain = "ai.onnx"
ms_domain = "com.microsoft"
QUANT_OP_NAME = "QuantizeLinear"
QUANT_INPUT_SUFFIX = "_QuantizeLinear_Input"
DEQUANT_OP_NAME = "DequantizeLinear"
DEQUANT_OUTPUT_SUFFIX = "_DequantizeLinear_Output"
TENSOR_NAME_QUANT_SUFFIX = "_quantized"

type_to_name = {
    1: "FLOAT",
    2: "UINT8",
    3: "INT8",
    4: "UINT16",
    5: "INT16",
    6: "INT32",
    7: "INT64",
    8: "STRING",
    9: "BOOL",
    10: "FLOAT16",
    11: "DOUBLE",
    12: "UINT32",
    13: "UINT64",
    14: "COMPLEX64",
    15: "COMPLEX128",
}

# Quantization mode
# IntegerOps: Use IntegerOps in quantized model. Only ConvInteger and MatMulInteger ops are supported now.
# QLinearOps: Use QLinearOps in quantized model. Only QLinearConv and QLinearMatMul ops are supported now.
class QuantizationMode(Enum):
    IntegerOps = 0
    QLinearOps = 1

    def __str__(self):
        return self.name

    @staticmethod
    def from_string(mode):
        try:
            return QuantizationMode[mode]
        except KeyError:
            raise ValueError(f"Unknown QuantizationMode: {mode!r}")

class QuantizedValueType(Enum):
    Input = 0
    Initializer = 1

    def __str__(self):
        return self.name

    @staticmethod
    def from_string(v):
        try:
            return QuantizedValueType[v]
        except KeyError:
            raise ValueError(f"Unknown QuantizedValueType: {v!r}")

class QuantType(Enum):
    QInt8 = 0
    QUInt8 = 1

    def __str__(self):
        return self.name

    @staticmethod
    def from_string(t):
        try:
            return QuantType[t]
        except KeyError:
            raise ValueError(f"Unknown QuantType: {t!r}")

class QuantFormat(Enum):
    QOperator = 0
    QDQ = 1

    def __str__(self):
        return self.name

    @staticmethod
    def from_string(format):
        try:
            return QuantFormat[format]
        except KeyError:
            raise ValueError(f"Unknown QuantFormat: {format!r}")
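
# Illustrative usage sketch (not part of the original module): the enums above
# round-trip between member names and members via from_string, e.g.
#   QuantFormat.from_string("QDQ") is QuantFormat.QDQ   # -> True
#   str(QuantType.QInt8)                                # -> "QInt8"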

ONNX_TYPE_TO_NP_TYPE = {
    onnx_proto.TensorProto.INT8: numpy.dtype("int8"),
    onnx_proto.TensorProto.UINT8: numpy.dtype("uint8"),
}

def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):
    assert (
        qType in ONNX_TYPE_TO_NP_TYPE
    ), "Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType)
    dtype = ONNX_TYPE_TO_NP_TYPE[qType]
    cliplow = max(0 if dtype == numpy.uint8 else -127, -127 if low is None else low)
    cliphigh = min(255 if dtype == numpy.uint8 else 127, 255 if high is None else high)
    arr_fp32 = numpy.asarray((arr.astype(numpy.float32) / scale).round() + zero_point)
    numpy.clip(arr_fp32, cliplow, cliphigh, out=arr_fp32)
    return arr_fp32.astype(dtype)
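
# Illustrative sketch: with scale = 4/255 and zero_point = 64 (see the
# compute_scale_zp example below), float values are mapped onto uint8:
#   quantize_nparray(onnx_proto.TensorProto.UINT8,
#                    numpy.array([-1.0, 0.0, 3.0]), 4 / 255, 64)
#   # -> array([  0,  64, 255], dtype=uint8)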

def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
    """Calculate the scale s and zero point z for the quantization relation
    r = s(q-z), where r are the original values and q are the corresponding
    quantized values.

    r and z are calculated such that every value within [rmin,rmax] has an
    approximate representation within [qmin,qmax]. In addition, qmin <= z <=
    qmax is enforced. If the symmetric flag is set to True, the interval
    [rmin,rmax] is symmetrized to [-absmax, +absmax], where
    absmax = max(abs(rmin), abs(rmax)).

    :parameter rmin: minimum value of r
    :parameter rmax: maximum value of r
    :parameter qmin: minimum value representable by the target quantization data type
    :parameter qmax: maximum value representable by the target quantization data type
    :return: zero and scale [z, s]
    """
    if qmin > 0 or qmax < 0:
        raise ValueError(f"qmin and qmax must satisfy qmin <= 0 <= qmax, but got qmin:{qmin}, qmax:{qmax}")

    # Adjust rmin and rmax such that 0 is included in the range. This is
    # required to make sure zero can be represented by the quantization data
    # type (i.e. to make sure qmin <= zero_point <= qmax)
    rmin = min(rmin, 0)
    rmax = max(rmax, 0)

    if symmetric:
        absmax = max(abs(rmin), abs(rmax))
        rmin = -absmax
        rmax = +absmax

    scale = (rmax - rmin) / float(qmax - qmin)
    if scale < numpy.finfo(numpy.float32).tiny:
        scale = 1.0
        zero_point = 0
    else:
        zero_point = round(qmin - rmin / scale)

    return [zero_point, scale]
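
# Worked example (illustrative): mapping the float range [-1.0, 3.0] onto the
# asymmetric uint8 range [0, 255]:
#   scale      = (3.0 - (-1.0)) / (255 - 0) = 4 / 255 ≈ 0.01569
#   zero_point = round(0 - (-1.0) / scale)  = round(63.75) = 64
#   compute_scale_zp(-1.0, 3.0, 0, 255)     # -> [64, 0.01568...]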

def quantize_data(data, qType, symmetric, reduce_range=False):
    """
    :param data: data to quantize
    :param qType: data type to quantize to. Supported types UINT8 and INT8
    :param symmetric: whether symmetric quantization is used or not. This is applied to INT8.
    :return: minimum, maximum, zero point, scale, and quantized weights

    To pack weights, we compute a linear transformation

    - when data `type == uint8` mode, from `[rmin, rmax]` -> :math:`[0, 2^{b}-1]` and
    - when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where
      `m = max(abs(rmin), abs(rmax))`

    and add necessary intermediate nodes to transform quantized weight to full weight using the equation
    :math:`r = S(q-z)`, where

    - *r*: real original value
    - *q*: quantized value
    - *S*: scale
    - *z*: zero point
    """
    rmin = 0
    rmax = 0
    zero_point = 0
    scale = 1.0
    if len(data):
        rmin = min(data)
        rmax = max(data)
        qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
        zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric)

    quantized_data = quantize_nparray(qType, numpy.asarray(data), scale, zero_point)
    return rmin, rmax, zero_point, scale, quantized_data
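
# Illustrative sketch: quantizing a small weight list to asymmetric uint8.
#   rmin, rmax, zp, scale, q = quantize_data(
#       [-1.0, 0.0, 3.0], onnx_proto.TensorProto.UINT8, symmetric=False
#   )
#   # rmin = -1.0, rmax = 3.0, zp = 64, scale ≈ 0.01569, q = [0, 64, 255]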

def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False):
    """
    Return qmin and qmax, the minimum and maximum value representable by the given qType
    :parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.INT8
    :return: qmin, qmax
    """
    if qType == onnx_proto.TensorProto.UINT8:
        (qmin, qmax) = (0, 127) if reduce_range else (0, 255)
    elif qType == onnx_proto.TensorProto.INT8:
        if symmetric:
            (qmin, qmax) = (-64, 64) if reduce_range else (-127, 127)
        else:
            (qmin, qmax) = (-64, 64) if reduce_range else (-128, 127)
    else:
        raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType))
    return qmin, qmax
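
# Summary of the ranges produced above:
#   UINT8, full range:              (0, 255)
#   UINT8, reduced range:           (0, 127)
#   INT8,  full range, asymmetric:  (-128, 127)
#   INT8,  full range, symmetric:   (-127, 127)
#   INT8,  reduced range:           (-64, 64)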

def get_qrange_for_qType(qType, reduce_range=False, symmetric=False):
    """
    Helper function to get the quantization range for a type.

    :parameter qType: quantization type.
    :return: quantization range.
    """
    qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
    return qmax - qmin

class QuantizedInitializer:
    """
    Represents a linearly quantized weight input from ONNX operators
    """

    def __init__(
        self,
        name,
        initializer,
        rmins,
        rmaxs,
        zero_points,
        scales,
        data=None,
        quantized_data=None,
        axis=None,
    ):
        self.name = name
        self.initializer = initializer  # TensorProto initializer in ONNX graph
        self.rmins = rmins  # List of minimum range for each axis
        self.rmaxs = rmaxs  # List of maximum range for each axis
        # 1D tensor of zero points computed for each axis. scalar if axis is empty
        self.zero_points = zero_points
        self.scales = scales  # 1D tensor of scales computed for each axis. scalar if axis is empty
        # Avoid the mutable-default-argument pitfall: fall back to fresh lists here.
        self.data = data if data is not None else []  # original data from initializer TensorProto
        self.quantized_data = quantized_data if quantized_data is not None else []  # weight-packed data from data
        # Scalar to specify which dimension in the initializer to weight pack.
        # If empty, a single zero point and scale are computed from a single rmin and rmax.
        self.axis = axis

class QuantizedValue:
    """
    Represents a linearly quantized value (input/output/initializer)
    """

    def __init__(
        self,
        name,
        new_quantized_name,
        scale_name,
        zero_point_name,
        quantized_value_type,
        axis=None,
    ):
        self.original_name = name
        self.q_name = new_quantized_name
        self.scale_name = scale_name
        self.zp_name = zero_point_name
        self.value_type = quantized_value_type
        self.axis = axis

class BiasToQuantize:
    """
    Represents a bias to be quantized
    """

    def __init__(self, bias_name, input_name, weight_name):
        self.bias_name = bias_name
        self.input_name = input_name
        self.weight_name = weight_name

def attribute_to_kwarg(attribute):
    """
    Convert attribute to kwarg format for use with onnx.helper.make_node.

    :parameter attribute: attribute in AttributeProto format.
    :return: attribute in {key: value} format.
    """
    if attribute.type == 0:
        raise ValueError("attribute {} does not have type specified.".format(attribute.name))

    # Based on attribute type definitions from AttributeProto
    # definition in https://github.com/onnx/onnx/blob/main/onnx/onnx.proto
    if attribute.type == 1:
        value = attribute.f
    elif attribute.type == 2:
        value = attribute.i
    elif attribute.type == 3:
        value = attribute.s
    elif attribute.type == 4:
        value = attribute.t
    elif attribute.type == 5:
        value = attribute.g
    elif attribute.type == 6:
        value = attribute.floats
    elif attribute.type == 7:
        value = attribute.ints
    elif attribute.type == 8:
        value = attribute.strings
    elif attribute.type == 9:
        value = attribute.tensors
    elif attribute.type == 10:
        value = attribute.graphs
    else:
        raise ValueError("attribute {} has unsupported type {}.".format(attribute.name, attribute.type))
    return {attribute.name: value}
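
# Illustrative sketch: rebuilding make_node kwargs from an existing node's
# attributes (a common pattern when replacing a node with its quantized
# counterpart). Node and attribute values here are hypothetical:
#   node = onnx.helper.make_node("LeakyRelu", ["X"], ["Y"], alpha=0.1)
#   kwargs = {}
#   for attr in node.attribute:
#       kwargs.update(attribute_to_kwarg(attr))
#   # kwargs == {"alpha": 0.10000000149011612}  (float32 round-trip)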

def find_by_name(item_name, item_list):
    """
    Helper function to find item by name in a list.

    :parameter item_name: name of the item.
    :parameter item_list: list of items.
    :return: item if found. None otherwise.
    """
    items = [item for item in item_list if item.name == item_name]
    return items[0] if len(items) > 0 else None

def get_elem_index(elem_name, elem_list):
    """
    Helper function to return the index of an item in a node list.
    Returns the last matching index, or -1 if the item is not found.
    """
    elem_idx = -1
    for i, elem in enumerate(elem_list):
        if elem == elem_name:
            elem_idx = i
    return elem_idx

def get_mul_node(inputs, output, name):
    """
    Helper function to create a Mul node.

    :parameter inputs: list of input names.
    :parameter output: output name.
    :parameter name: name of the node.
    :return: Mul node in NodeProto format.
    """
    return onnx.helper.make_node("Mul", inputs, [output], name)

def generate_identified_filename(filename: Path, identifier: str) -> Path:
    """
    Helper function to generate an identifiable filepath by concatenating the given identifier as a suffix.
    """
    return filename.parent.joinpath(filename.stem + identifier + filename.suffix)
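
# Illustrative sketch (path is hypothetical):
#   generate_identified_filename(Path("models/model.onnx"), "-opt")
#   # -> Path("models/model-opt.onnx")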

def apply_plot(hist, hist_edges):
    import sys

    import matplotlib.pyplot as plt

    numpy.set_printoptions(threshold=sys.maxsize)
    print("Histogram:")
    print(hist)
    print("Histogram Edges:")
    print(hist_edges)
    plt.stairs(hist, hist_edges, fill=True)
    plt.xlabel("Tensor value")
    plt.ylabel("Counts")
    plt.title("Tensor value vs. Counts")
    plt.show()

def write_calibration_table(calibration_cache):
    """
    Helper function to write the calibration table to files
    (calibration.json, calibration.flatbuffers, and calibration.cache
    in the current working directory).
    """
    import json

    import flatbuffers

    import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue
    import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable

    logging.info("calibration cache: {}".format(calibration_cache))

    with open("calibration.json", "w") as file:
        file.write(json.dumps(calibration_cache))  # use `json.loads` to do the reverse

    # Serialize data using FlatBuffers
    builder = flatbuffers.Builder(1024)
    key_value_list = []
    for key in sorted(calibration_cache.keys()):
        values = calibration_cache[key]
        value = str(max(abs(values[0]), abs(values[1])))

        flat_key = builder.CreateString(key)
        flat_value = builder.CreateString(value)

        KeyValue.KeyValueStart(builder)
        KeyValue.KeyValueAddKey(builder, flat_key)
        KeyValue.KeyValueAddValue(builder, flat_value)
        key_value = KeyValue.KeyValueEnd(builder)

        key_value_list.append(key_value)

    TrtTable.TrtTableStartDictVector(builder, len(key_value_list))
    for key_value in key_value_list:
        builder.PrependUOffsetTRelative(key_value)
    main_dict = builder.EndVector()

    TrtTable.TrtTableStart(builder)
    TrtTable.TrtTableAddDict(builder, main_dict)
    cal_table = TrtTable.TrtTableEnd(builder)

    builder.Finish(cal_table)
    buf = builder.Output()

    with open("calibration.flatbuffers", "wb") as file:
        file.write(buf)

    # Deserialize data (for validation); disabled by default
    if False:
        cal_table = TrtTable.TrtTable.GetRootAsTrtTable(buf, 0)
        dict_len = cal_table.DictLength()
        for i in range(dict_len):
            key_value = cal_table.Dict(i)
            logging.info(key_value.Key())
            logging.info(key_value.Value())

    # write plain text
    with open("calibration.cache", "w") as file:
        for key in sorted(calibration_cache.keys()):
            value = calibration_cache[key]
            s = key + " " + str(max(abs(value[0]), abs(value[1])))
            file.write(s)
            file.write("\n")

def smooth_distribution(p, eps=0.0001):
    """Given a discrete distribution (which may not have been normalized to 1),
    smooth it by replacing zeros with eps multiplied by a scaling factor
    and taking the corresponding amount off the non-zero values.

    Ref: http://web.engr.illinois.edu/~hanj/cs412/bk3/KL-divergence.pdf
    https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py
    """
    import numpy as np

    is_zeros = (p == 0).astype(np.float32)
    is_nonzeros = (p != 0).astype(np.float32)
    n_zeros = is_zeros.sum()
    n_nonzeros = p.size - n_zeros

    if not n_nonzeros:
        # raise ValueError('The discrete probability distribution is malformed. All entries are 0.')
        return -1
    eps1 = eps * float(n_zeros) / float(n_nonzeros)
    assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % (
        n_zeros,
        n_nonzeros,
        eps1,
    )

    hist = p.astype(np.float32)
    hist += eps * is_zeros + (-eps1) * is_nonzeros
    assert (hist <= 0).sum() == 0

    return hist
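
# Worked example (illustrative): two zeros and two non-zeros with eps = 0.0001,
# so eps1 = 0.0001 * 2 / 2 = 0.0001 is taken off each non-zero entry:
#   smooth_distribution(numpy.array([0.0, 2.0, 0.0, 2.0]))
#   # -> approximately [0.0001, 1.9999, 0.0001, 1.9999]  (float32)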

def model_has_external_data(model_path: Path):
    model = onnx.load(model_path.as_posix(), load_external_data=False)
    for initializer in model.graph.initializer:
        if external_data_helper.uses_external_data(initializer):
            return True
    return False

def optimize_model(model_path: Path, opt_model_path: Path):
    """
    Generate a model with graph optimizations applied (constant folding, etc.),
    written to opt_model_path.

    :parameter model_path: path to the original onnx model
    :parameter opt_model_path: path to the optimized onnx model
    """
    sess_option = SessionOptions()
    sess_option.optimized_model_filepath = opt_model_path.as_posix()
    sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
    _ = InferenceSession(model_path.as_posix(), sess_option, providers=["CPUExecutionProvider"])
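
# Illustrative sketch (paths are hypothetical): running ORT's basic graph
# optimizations and writing the result next to the original model:
#   optimize_model(Path("model.onnx"), Path("model-opt.onnx"))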

def add_pre_process_metadata(model):
    """Tag the model to indicate that it went through quantization pre-processing."""
    metadata_props = {"onnx.quant.pre_process": "onnxruntime.quant"}
    if model.metadata_props:
        for prop in model.metadata_props:
            metadata_props.update({prop.key: prop.value})
    onnx.helper.set_model_props(model, metadata_props)

def model_has_pre_process_metadata(model):
    """Check whether the model went through quantization pre-processing."""
    if model.metadata_props:
        for prop in model.metadata_props:
            if prop.key == "onnx.quant.pre_process" and prop.value == "onnxruntime.quant":
                return True
    return False

def add_infer_metadata(model):
    metadata_props = {"onnx.infer": "onnxruntime.quant"}
    if model.metadata_props:
        for p in model.metadata_props:
            metadata_props.update({p.key: p.value})
    onnx.helper.set_model_props(model, metadata_props)

def model_has_infer_metadata(model):
    if model.metadata_props:
        for p in model.metadata_props:
            if p.key == "onnx.infer" and p.value == "onnxruntime.quant":
                return True
    return False
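
# Illustrative sketch: tagging a freshly loaded model and checking the tag
# (the model path is hypothetical):
#   model = onnx.load("model.onnx")
#   add_infer_metadata(model)
#   model_has_infer_metadata(model)  # -> True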

def load_model_with_shape_infer(model_path: Path):
    inferred_model_path = generate_identified_filename(model_path, "-inferred")
    onnx.shape_inference.infer_shapes_path(str(model_path), str(inferred_model_path))
    model = onnx.load(inferred_model_path.as_posix())
    inferred_model_path.unlink()
    return model

def load_model(model_path: Path, need_optimize: bool):
    with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
        if need_optimize and not model_has_external_data(model_path):
            opt_model_path = Path(quant_tmp_dir).joinpath("model.onnx")
            optimize_model(model_path, opt_model_path)
            model_path = opt_model_path

        model = load_model_with_shape_infer(model_path)
        add_infer_metadata(model)
        return model
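
# Illustrative sketch (path is hypothetical): optimize, shape-infer, and tag
# a model in one call:
#   model = load_model(Path("model.onnx"), need_optimize=True)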

def save_and_reload_model(model):
    with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
        model_path = Path(quant_tmp_dir).joinpath("model.onnx")
        onnx.external_data_helper.convert_model_to_external_data(model, all_tensors_to_one_file=True)
        onnx.save_model(model, model_path.as_posix())
        return load_model(model_path, False)

def clone_model_with_shape_infer(model):
    if model_has_infer_metadata(model):
        cloned_model = onnx_proto.ModelProto()
        cloned_model.CopyFrom(model)
    else:
        cloned_model = save_and_reload_model(model)
    return cloned_model

def tensor_proto_to_array(initializer):
    if initializer.data_type == onnx_proto.TensorProto.FLOAT:
        return onnx.numpy_helper.to_array(initializer)
    raise ValueError(
        f"Only float type is supported. Weight {initializer.name} has type {type_to_name[initializer.data_type]}."
    )

def add_quant_suffix(tensor_name):
    return tensor_name + "_QuantizeLinear"

def add_quant_input_suffix(tensor_name):
    return tensor_name + QUANT_INPUT_SUFFIX

def add_quant_output_suffix(tensor_name):
    return tensor_name + "_QuantizeLinear_Output"

def add_dequant_suffix(tensor_name):
    return tensor_name + "_DequantizeLinear"

def add_dequant_input_suffix(tensor_name):
    return tensor_name + "_DequantizeLinear_Input"

def add_dequant_output_suffix(tensor_name):
    return tensor_name + DEQUANT_OUTPUT_SUFFIX
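
# Illustrative sketch: the helpers above generate the tensor/node names used
# when inserting QuantizeLinear/DequantizeLinear pairs, e.g.
#   add_quant_suffix("conv1_weight")           # -> "conv1_weight_QuantizeLinear"
#   add_dequant_output_suffix("conv1_weight")  # -> "conv1_weight_DequantizeLinear_Output"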