m2m model translation

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

import logging

import numpy as np
import onnx
import onnx.numpy_helper
from onnx import onnx_pb as onnx_proto

from .onnx_model import ONNXModel
from .quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizationMode,
    QuantizedValue,
    QuantizedValueType,
    QuantType,
    __producer__,
    __version__,
    add_infer_metadata,
    attribute_to_kwarg,
    compute_scale_zp,
    find_by_name,
    get_qmin_qmax_for_qType,
    get_qrange_for_qType,
    model_has_infer_metadata,
    quantize_data,
    save_and_reload_model,
    tensor_proto_to_array,
)
from .registry import CreateOpQuantizer


class ONNXQuantizer:
    def __init__(
        self,
        model,
        per_channel,
        reduce_range,
        mode,
        static,
        weight_qType,
        activation_qType,
        tensors_range,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize,
        extra_options=None,
    ):
        if not model_has_infer_metadata(model):
            model = save_and_reload_model(model)
        self.value_infos = {vi.name: vi for vi in model.graph.value_info}
        self.value_infos.update({ot.name: ot for ot in model.graph.output})
        self.value_infos.update({it.name: it for it in model.graph.input})

        self.model = ONNXModel(model)
        if not static:
            self.model.replace_gemm_with_matmul()

        self.per_channel = per_channel  # weight-pack per channel
        self.reduce_range = reduce_range
        self.mode = mode  # QuantizationMode.Value
        self.static = static  # use static quantization for inputs.
        self.fuse_dynamic_quant = False

        self.extra_options = extra_options if extra_options else {}
        self.enable_subgraph_quantization = (
            "EnableSubgraph" in self.extra_options and self.extra_options["EnableSubgraph"]
        )
        self.force_quantize_no_input_check = (
            "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
        )
        self.q_matmul_const_b_only = "MatMulConstBOnly" in self.extra_options and self.extra_options["MatMulConstBOnly"]
        is_weight_int8 = weight_qType == QuantType.QInt8
        self.is_weight_symmetric = (
            is_weight_int8 if "WeightSymmetric" not in self.extra_options else self.extra_options["WeightSymmetric"]
        )
        self.is_activation_symmetric = (
            False if "ActivationSymmetric" not in self.extra_options else self.extra_options["ActivationSymmetric"]
        )

        self.activation_qType = (
            onnx_proto.TensorProto.INT8 if activation_qType == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
        )
        self.weight_qType = (
            onnx_proto.TensorProto.INT8 if weight_qType == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
        )
        """
        Dictionary specifying the min and max values for tensors. It has the following format:
            {
                "param_name": [min, max]
            }
        example:
            {
                'Conv_3:0': [np.float32(0), np.float32(0.5)],
                'Conv_4:0': [np.float32(1), np.float32(3.5)]
            }
        """
        self.tensors_range = tensors_range
        self.nodes_to_quantize = nodes_to_quantize  # specific nodes to quantize
        self.nodes_to_exclude = nodes_to_exclude  # specific nodes to exclude
        self.op_types_to_quantize = op_types_to_quantize
        self.new_nodes = []
        self.parent = None
        self.graph_scope = "/"  # for human-readable debug information
        self.tensor_names = {}  # fallback for when shape inference does not cover every tensor
        self.tensor_names.update({ot.name: 1 for ot in model.graph.output})
        self.tensor_names.update({it.name: 1 for it in model.graph.input})
        for node in self.model.model.graph.node:
            self.tensor_names.update({output_name: 1 for output_name in node.output})

        self.opset_version = self.check_opset_version()

        if self.mode not in QuantizationMode:
            raise ValueError("unsupported quantization mode {}".format(self.mode))

        self.quantization_params = self.calculate_quantization_params()

        # QuantizeRange tensor name and zero tensor name for scale and zero point calculation.
        # Used when static is False.
        self.fixed_qrange_uint8_name = "fixed_quantization_range_uint8"
        self.fixed_qrange_int8_name = "fixed_quantization_range_int8"
        # For the uint8 data type, to compute the zero point we subtract rmin from 0 (represented by the fixed_zero_name tensor).
        self.fixed_zero_name = "fixed_zero"
        # For the int8 data type, the zero point is always zero (represented by the fixed_zero_zp_name tensor).
        self.fixed_zero_zp_name = "fixed_zero_zp"

        # Map of all original value names to quantized value names
        self.quantized_value_map = {}
        # Some node outputs are quantized yet should still be treated as existing tensors,
        # so that no extra dequantize is applied to them later.
        self.generated_value_names = self.model.get_non_initializer_inputs()
        # stores user-specified scale and zero point values that override the calculated ones, tensor_name -> (scale, zero_point)
        self.used_scale_zp_map = {}

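    # Editorial note (not part of the original source): the extra_options keys consumed by the
    # constructor above are EnableSubgraph, ForceQuantizeNoInputCheck, MatMulConstBOnly,
    # WeightSymmetric and ActivationSymmetric. A minimal, illustrative configuration enabling
    # subgraph quantization with symmetric activations would be:
    #
    #     extra_options = {"EnableSubgraph": True, "ActivationSymmetric": True}
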
    # routines for subgraph support
    def quantize_subgraph(self, subgraph, graph_key):
        """
        Generate a submodel for the subgraph so the existing quantization implementation can be reused,
        quantize that submodel, then return the updated graph so it can be set back on the parent node.
        """
        wrapped_model = onnx.helper.make_model(
            subgraph,
            producer_name="onnx-quantizer",
            opset_imports=self.model.model.opset_import,
        )
        add_infer_metadata(wrapped_model)
        sub_quantizer = ONNXQuantizer(
            wrapped_model,
            self.per_channel,
            self.reduce_range,
            self.mode,
            self.static,
            self.weight_qType,
            self.activation_qType,
            self.tensors_range,
            self.nodes_to_quantize,
            self.nodes_to_exclude,
            self.op_types_to_quantize,
            self.extra_options,
        )
        sub_quantizer.parent = self
        sub_quantizer.graph_scope = "{}{}/".format(self.graph_scope, graph_key)
        sub_quantizer.quantize_model()
        return sub_quantizer.model.model.graph

    def quantize_node_with_sub_graph(self, node):
        """
        Check the node's subgraph attributes, if any, quantize them, and return a rebuilt node
        carrying the quantized subgraphs.
        """
        graph_attrs = [
            attr
            for attr in node.attribute
            if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
        ]
        if len(graph_attrs) == 0:
            return node
        node_name = node.name if node.name != "" else "{}_node_count_{}".format(node.op_type, len(self.new_nodes))
        kwargs = {}
        for attr in node.attribute:
            if attr.type == onnx.AttributeProto.GRAPH:
                kv = {attr.name: self.quantize_subgraph(attr.g, "{}:{}".format(node_name, attr.name))}
            elif attr.type == onnx.AttributeProto.GRAPHS:
                value = []
                for subgraph in attr.graphs:
                    value.extend(
                        [
                            self.quantize_subgraph(
                                subgraph,
                                "{}:{}:{}".format(node_name, attr.name, len(value)),
                            )
                        ]
                    )
                kv = {attr.name: value}
            else:
                kv = attribute_to_kwarg(attr)
            kwargs.update(kv)
        return onnx.helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)

    def check_opset_version(self):
        ai_onnx_domain = [
            opset for opset in self.model.model.opset_import if not opset.domain or opset.domain == "ai.onnx"
        ]
        if 1 != len(ai_onnx_domain):
            raise ValueError("Failed to find proper ai.onnx domain")
        opset_version = ai_onnx_domain[0].version

        if opset_version == 10:
            logging.warning(
                "The original model opset version is {}, which does not support node fusions. Please update the model to opset >= 11 for better performance.".format(
                    opset_version
                )
            )
            return 10

        if opset_version < 10:
            logging.warning(
                "The original model opset version is {}, which does not support quantization. Updating the model automatically to opset 11; please verify the quantized model.".format(
                    opset_version
                )
            )
            self.model.model.opset_import.remove(ai_onnx_domain[0])
            self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 11)])
            opset_version = 11

        self.fuse_dynamic_quant = True
        return opset_version

    def has_QDQ_nodes(self):
        """
        Detect whether the model already has QuantizeLinear or DequantizeLinear nodes.
        """
        return any(
            node.op_type == "QuantizeLinear" or node.op_type == "DequantizeLinear" for node in self.model.nodes()
        )

    def find_initializer_in_path(self, initializer_name):
        if find_by_name(initializer_name, self.model.initializer()) is not None:
            return True
        if self.parent is not None:
            return self.parent.find_initializer_in_path(initializer_name)
        return False

    def add_new_nodes(self, nodes):
        self.new_nodes.extend(nodes)
        for node in nodes:
            for output_name in node.output:
                self.generated_value_names.add(output_name)

    def quantize_model(self):
        if self.has_QDQ_nodes():
            logging.warning(
                "Please check if the model is already quantized. "
                "Note that you don't need to quantize a QAT model; ONNX Runtime can run it directly."
            )

        for node in self.model.nodes():
            # quantize subgraphs if present
            if self.enable_subgraph_quantization:
                node = self.quantize_node_with_sub_graph(node)

            number_of_existing_new_nodes = len(self.new_nodes)
            op_quantizer = CreateOpQuantizer(self, node)
            op_quantizer.quantize()
            for i in range(number_of_existing_new_nodes, len(self.new_nodes)):
                for output_name in self.new_nodes[i].output:
                    self.generated_value_names.add(output_name)

        self._dequantize_outputs()

        # extend is used to append to the list for a protobuf field
        # https://developers.google.com/protocol-buffers/docs/reference/python-generated?csw=1#fields
        self.model.graph().ClearField("node")
        self.model.graph().node.extend(self.new_nodes)

        # Remove unused initializers from the graph, starting from the top-level graph.
        if self.parent is None:
            _, initializers_not_found = self.model.clean_initializers()
            if len(initializers_not_found) > 0:
                raise RuntimeError("Invalid model with unknown initializers/tensors." + str(initializers_not_found))

        self.model.model.producer_name = __producer__
        self.model.model.producer_version = __version__

        return self.model.model

    def is_input_a_initializer(self, input_name):
        initializer = find_by_name(input_name, self.model.initializer())
        return initializer is not None

    def is_per_channel(self):
        return self.per_channel

    def is_valid_quantize_weight(self, weight_name):
        weight = find_by_name(weight_name, self.model.initializer())
        if weight is not None:
            return weight.data_type == onnx_proto.TensorProto.FLOAT
        if (not self.enable_subgraph_quantization) or (self.parent is None):
            return False
        return self.parent.is_valid_quantize_weight(weight_name)

    def is_float_tensor(self, tensor_name):
        if self.is_input_a_initializer(tensor_name):
            return self.is_valid_quantize_weight(tensor_name)

        if tensor_name in self.value_infos.keys():
            vi = self.value_infos[tensor_name]
            if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
                return True
        elif self.enable_subgraph_quantization and self.parent:
            return self.parent.is_float_tensor(tensor_name)
        else:
            logging.warning(
                "Failed to infer data type of tensor: {}. Please add data type info for this tensor "
                "if your model has customized operators.".format(tensor_name)
            )
        return False

    def should_quantize_node(self, node):
        if (
            self.nodes_to_quantize is not None
            and len(self.nodes_to_quantize) != 0
            and node.name not in self.nodes_to_quantize
        ):
            return False

        if node.op_type not in self.op_types_to_quantize:
            return False

        if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude:
            return False

        return True

    def _get_dynamic_input_quantization_params(self, input_name, nodes_list, qType):
        """
        Create nodes for dynamic quantization of input and add them to nodes_list.
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            parameter qType: type to quantize to.
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        """
        if qType == onnx_proto.TensorProto.INT8:
            return self._get_dynamic_input_quantization_params_int8(input_name, nodes_list)

        return self._get_dynamic_input_quantization_params_uint8(input_name, nodes_list)

    def _get_dynamic_input_quantization_params_int8(self, input_name, nodes_list):
        """
        Create nodes for dynamic quantization of input to int8 and add them to nodes_list.
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        """
        qType = onnx_proto.TensorProto.INT8

        # Reduce min and Reduce max
        input_scale_name = input_name + "_scale"

        reduce_min_name = input_name + "_ReduceMin"
        reduce_min_node = onnx.helper.make_node(
            "ReduceMin",
            [input_name],
            [reduce_min_name + ":0"],
            reduce_min_name,
            keepdims=0,
        )
        nodes_list.append(reduce_min_node)

        reduce_max_name = input_name + "_ReduceMax"
        reduce_max_node = onnx.helper.make_node(
            "ReduceMax",
            [input_name],
            [reduce_max_name + ":0"],
            reduce_max_name,
            keepdims=0,
        )
        nodes_list.append(reduce_max_node)

        # Compute scale
        # Find abs(rmin)
        reduce_min_abs_name = reduce_min_name + "_Abs"
        reduce_min_abs_node = onnx.helper.make_node(
            "Abs",
            [reduce_min_node.output[0]],
            [reduce_min_abs_name + ":0"],
            reduce_min_abs_name,
        )
        nodes_list.append(reduce_min_abs_node)
        # Find abs(rmax)
        reduce_max_abs_name = reduce_max_name + "_Abs"
        reduce_max_abs_node = onnx.helper.make_node(
            "Abs",
            [reduce_max_node.output[0]],
            [reduce_max_abs_name + ":0"],
            reduce_max_abs_name,
        )
        nodes_list.append(reduce_max_abs_node)
        # Compute max of abs(rmin) and abs(rmax)
        abs_max_name = input_name + "_Abs_Max"
        abs_max_node = onnx.helper.make_node(
            "Max",
            [reduce_min_abs_node.output[0], reduce_max_abs_node.output[0]],
            [abs_max_name + ":0"],
            abs_max_name,
        )
        nodes_list.append(abs_max_node)
        # and divide by (quantize_range/2.0), which is equal to max(...)*2.0/quantize_range
        initializer_div = onnx.helper.make_tensor(
            self.fixed_qrange_int8_name,
            onnx_proto.TensorProto.FLOAT,
            [],
            [get_qrange_for_qType(qType) / 2.0],
        )
        self.model.add_initializer(initializer_div)
        scale_div_name = input_name + "scale_Div"
        scale_div_node = onnx.helper.make_node(
            "Div",
            [abs_max_node.output[0], self.fixed_qrange_int8_name],
            [input_scale_name],
            scale_div_name,
        )
        nodes_list.append(scale_div_node)

        # Zero point
        initializer_zp = onnx.helper.make_tensor(self.fixed_zero_zp_name, qType, [], [0])
        self.model.add_initializer(initializer_zp)

        return input_scale_name, self.fixed_zero_zp_name, [], []

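    # Worked example for the int8 dynamic path above (editorial illustration, not part of the
    # original source). For an input whose observed range is [-2.0, 3.0]:
    #   abs_max = max(|-2.0|, |3.0|) = 3.0
    #   scale   = abs_max / (qrange / 2.0)   # qrange comes from get_qrange_for_qType(INT8)
    # The zero point is the fixed scalar 0 initializer, so quantization stays symmetric around 0.
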
    def _get_dynamic_input_quantization_params_uint8(self, input_name, nodes_list):
        """
        Create nodes for dynamic quantization of input to uint8 and add them to nodes_list.
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        """
        qType = onnx_proto.TensorProto.UINT8
        # Reduce min and Reduce max
        input_scale_name = input_name + "_scale"
        input_zp_name = input_name + "_zero_point"

        reduce_min_name = input_name + "_ReduceMin"
        reduce_min_node = onnx.helper.make_node(
            "ReduceMin",
            [input_name],
            [reduce_min_name + ":0"],
            reduce_min_name,
            keepdims=0,
        )
        nodes_list.append(reduce_min_node)

        reduce_max_name = input_name + "_ReduceMax"
        reduce_max_node = onnx.helper.make_node(
            "ReduceMax",
            [input_name],
            [reduce_max_name + ":0"],
            reduce_max_name,
            keepdims=0,
        )
        nodes_list.append(reduce_max_node)

        # Add tensors for quantize range and zero value.
        initializer_qrange = onnx.helper.make_tensor(
            self.fixed_qrange_uint8_name,
            onnx_proto.TensorProto.FLOAT,
            [],
            [get_qrange_for_qType(qType)],
        )
        self.model.add_initializer(initializer_qrange)
        initializer_qvalue = onnx.helper.make_tensor(self.fixed_zero_name, onnx_proto.TensorProto.FLOAT, [], [0.0])
        self.model.add_initializer(initializer_qvalue)

        # Compute scale
        # Subtract rmax and rmin
        scale_sub_name = input_name + "_scale_Sub"
        scale_sub_node = onnx.helper.make_node(
            "Sub",
            [reduce_max_node.output[0], reduce_min_node.output[0]],
            [scale_sub_name + ":0"],
            scale_sub_name,
        )
        nodes_list.append(scale_sub_node)
        # and divide by quantize range
        scale_div_name = input_name + "_scale_Div"
        scale_div_node = onnx.helper.make_node(
            "Div",
            [scale_sub_node.output[0], self.fixed_qrange_uint8_name],
            [input_scale_name],
            scale_div_name,
        )
        nodes_list.append(scale_div_node)

        # Compute zero point
        # Subtract zero and rmin
        zp_sub_name = input_name + "_zero_point_Sub"
        zp_sub_node = onnx.helper.make_node(
            "Sub",
            [self.fixed_zero_name, reduce_min_node.output[0]],
            [zp_sub_name + ":0"],
            zp_sub_name,
        )
        nodes_list.append(zp_sub_node)
        # Divide by scale
        zp_div_name = input_name + "_zero_point_Div"
        zp_div_node = onnx.helper.make_node(
            "Div",
            [zp_sub_node.output[0], input_scale_name],
            [zp_div_name + ":0"],
            zp_div_name,
        )
        nodes_list.append(zp_div_node)
        # Compute floor
        zp_floor_name = input_name + "_zero_point_Floor"
        zp_floor_node = onnx.helper.make_node("Floor", zp_div_node.output, [zp_floor_name + ":0"], zp_floor_name)
        nodes_list.append(zp_floor_node)
        # Cast to integer
        zp_cast_name = input_name + "_zero_point_Cast"
        zp_cast_node = onnx.helper.make_node("Cast", zp_floor_node.output, [input_zp_name], zp_cast_name, to=qType)
        nodes_list.append(zp_cast_node)

        return input_scale_name, input_zp_name, [], []

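    # Worked example for the uint8 dynamic path above (editorial illustration, not part of the
    # original source). For an input whose observed range is [-1.0, 3.0], assuming qrange = 255:
    #   scale      = (rmax - rmin) / qrange = 4.0 / 255
    #   zero_point = cast(floor((0 - rmin) / scale)) = cast(floor(1.0 * 255 / 4.0)) = 63
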
    def _get_quantization_params(self, param_name, use_scale=None, use_zeropoint=None):
        """
        Create initializers and inputs in the graph for zero point and scale of output.
        Zero point and scale values are obtained from self.quantization_params if specified.
            parameter param_name: Name of the quantization parameter.
            return: result, scale_name, zero_point_name, scale_shape, zero_point_shape.
        """
        if use_scale is None or use_zeropoint is None:
            if self.quantization_params is None or param_name not in self.quantization_params:
                logging.info('Quantization parameters for tensor:"{}" not specified'.format(param_name))
                return False, "", "", "", ""

            params = self.quantization_params[param_name]
            if params is None or len(params) != 2:
                raise ValueError(
                    "Quantization parameters should contain zero point and scale. "
                    "Specified values for output {}: {}".format(param_name, params)
                )

            zero_point_values = [params[0]]
            scale_values = [params[1]]
        else:
            zero_point_values = [use_zeropoint]
            scale_values = [use_scale]

        zero_point_shape = []
        zero_point_name = param_name + "_zero_point"
        zero_point_type = self.activation_qType
        scale_shape = []
        scale_name = param_name + "_scale"

        # Add initializers
        init_zp = onnx.helper.make_tensor(zero_point_name, zero_point_type, zero_point_shape, zero_point_values)
        self.model.add_initializer(init_zp)
        init_scale = onnx.helper.make_tensor(scale_name, onnx_proto.TensorProto.FLOAT, scale_shape, scale_values)
        self.model.add_initializer(init_scale)

        return True, scale_name, zero_point_name, scale_shape, zero_point_shape

    def _get_quantize_input_nodes(self, node, input_index, qType, given_scale_name=None, given_zp_name=None):
        """
        Given an input for a node (which is not an initializer), this function
            - adds nodes to compute zero point and scale for this input if they don't exist.
            - adds a new QuantizeLinear node to quantize the input.
            :param node: node being quantized in NodeProto format.
            :param input_index: index of input in node.input.
            :param qType: type to quantize to.
            :param given_scale_name: if provided, quantize the input using this scale tensor.
            :param given_zp_name: if provided, quantize the input using this zero point tensor.
            :return: List of newly created nodes in NodeProto format.
        """
        input_name = node.input[input_index]
        output_name = input_name + TENSOR_NAME_QUANT_SUFFIX
        ql_node_name = input_name + "_QuantizeLinear"

        if (given_scale_name is not None) and (given_zp_name is not None):
            data_found, scale_name, zp_name = (True, given_scale_name, given_zp_name)
        else:
            data_found, scale_name, zp_name, _, _ = self._get_quantization_params(input_name)

        nodes = []
        if data_found:
            qlinear_node = onnx.helper.make_node(
                "QuantizeLinear",
                [input_name, scale_name, zp_name],
                [output_name],
                ql_node_name,
            )
        else:
            if self.static:
                return None
            # dynamic mode
            # Scale and zero point are not available for this input. Add nodes to compute them dynamically.
            if self.fuse_dynamic_quant and qType == onnx_proto.TensorProto.UINT8:
                scale_name = input_name + "_scale"
                zp_name = input_name + "_zero_point"
                qlinear_node = onnx.helper.make_node(
                    "DynamicQuantizeLinear",
                    [input_name],
                    [output_name, scale_name, zp_name],
                    ql_node_name,
                )
            else:
                (
                    scale_name,
                    zp_name,
                    scale_shape,
                    zp_shape,
                ) = self._get_dynamic_input_quantization_params(input_name, nodes, qType)
                qlinear_node = onnx.helper.make_node(
                    "QuantizeLinear",
                    [input_name, scale_name, zp_name],
                    [output_name],
                    ql_node_name,
                )

        self.quantized_value_map[input_name] = QuantizedValue(input_name, output_name, scale_name, zp_name, qType)
        return nodes + [qlinear_node]

    def set_quant_scale_zp(self, tensor_name, value):
        assert isinstance(value, tuple) and len(value) == 2, "value must be (scale, zero_point)"
        assert tensor_name not in self.used_scale_zp_map, f"{tensor_name} has already been set"
        self.used_scale_zp_map[tensor_name] = value

    def find_quant_scale_zp(self, input_name):
        if input_name in self.used_scale_zp_map:
            return self.used_scale_zp_map[input_name]
        if self.parent is not None:
            return self.parent.find_quant_scale_zp(input_name)
        return (None, None)

    def find_quantized_value(self, input_name):
        if input_name in self.quantized_value_map:
            return self.quantized_value_map[input_name]
        if self.parent is not None:
            return self.parent.find_quantized_value(input_name)
        return None

    def quantize_bias_static(self, bias_name, input_name, weight_name, beta=1.0):
        """
        Quantize the bias. Zero point == 0 and scale == input_scale * weight_scale.
        """

        # Handle the case where the bias is already in the quantization map
        if bias_name in self.quantized_value_map:
            return self.quantized_value_map[bias_name].q_name

        # get scale for weight
        weight_scale_name = self.quantized_value_map[weight_name].scale_name
        weight_initializer = find_by_name(weight_scale_name, self.model.initializer())
        weight_scale = tensor_proto_to_array(weight_initializer)

        # get bias
        bias_initializer = find_by_name(bias_name, self.model.initializer())
        bias_data = tensor_proto_to_array(bias_initializer)
        quantized_bias_name = bias_name + TENSOR_NAME_QUANT_SUFFIX

        # get scale for input
        if input_name in self.quantized_value_map:
            input_scale_name = self.quantized_value_map[input_name].scale_name
        elif input_name in self.quantization_params:
            _, input_scale_name, _, _, _ = self._get_quantization_params(input_name)
        else:
            raise ValueError("Expected {} to be in quantized value map for static quantization".format(input_name))

        inputscale_initializer = find_by_name(input_scale_name, self.model.initializer())
        input_scale = tensor_proto_to_array(inputscale_initializer)

        # calculate scale for bias
        bias_scale = input_scale * weight_scale * beta

        # quantize bias
        quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32)

        # update bias initializer
        bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
        packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
        self.model.initializer().extend([packed_bias_initializer])

        # update scale initializer
        quantized_bias_scale_name = quantized_bias_name + "_scale"
        bias_scale_data = np.asarray(bias_scale, dtype=np.float32).reshape(-1)
        if self.is_per_channel():
            packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
        else:
            packed_bias_scale_initializer = onnx.helper.make_tensor(
                quantized_bias_scale_name, onnx_proto.TensorProto.FLOAT, [], bias_scale_data
            )
        self.model.initializer().extend([packed_bias_scale_initializer])

        # update zero-point initializer
        quantized_bias_zp_name = quantized_bias_name + "_zero_point"
        bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
        if self.is_per_channel():
            packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
        else:
            packed_bias_zp_initializer = onnx.helper.make_tensor(
                quantized_bias_zp_name, onnx_proto.TensorProto.INT32, [], bias_zp_data
            )
        self.model.initializer().extend([packed_bias_zp_initializer])

        assert bias_name not in self.quantized_value_map
        quantized_value = QuantizedValue(
            bias_name,
            quantized_bias_name,
            quantized_bias_scale_name,
            quantized_bias_zp_name,
            QuantizedValueType.Initializer,
            0 if bias_scale_data.size > 1 else None,
        )
        self.quantized_value_map[bias_name] = quantized_value

        return quantized_bias_name

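    # Worked example for quantize_bias_static above (editorial illustration, not part of the
    # original source). With input_scale = 0.02, weight_scale = 0.005 and beta = 1.0:
    #   bias_scale     = 0.02 * 0.005 * 1.0 = 1e-4
    #   quantized_bias = round(bias / 1e-4), stored as int32 with zero point 0
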
    def contains_tensor(self, tensor_name):
        """
        Only check value info and newly generated tensor names; initializers are checked separately.
        """
        return (
            (tensor_name in self.value_infos)
            or (tensor_name in self.tensor_names)
            or (tensor_name in self.generated_value_names)
        )

    def quantize_activation(self, node, indices, from_subgraph=False):
        return self.__quantize_inputs(
            node=node,
            indices=indices,
            initializer_use_weight_qType=False,
            reduce_range=False,
            op_level_per_channel=False,
            axis=-1,
            from_subgraph=from_subgraph,
        )

    # In some circumstances a weight is not an initializer. For example, in MatMul, if both A and B are not
    # initializers, B can still be considered a weight.
    def quantize_weight(
        self,
        node,
        indices,
        reduce_range=False,
        op_level_per_channel=False,
        axis=-1,
        from_subgraph=False,
    ):
        return self.__quantize_inputs(
            node=node,
            indices=indices,
            initializer_use_weight_qType=True,
            reduce_range=reduce_range,
            op_level_per_channel=op_level_per_channel,
            axis=axis,
            from_subgraph=from_subgraph,
        )

    def __quantize_inputs(
        self,
        node,
        indices,
        initializer_use_weight_qType=True,
        reduce_range=False,
        op_level_per_channel=False,
        axis=-1,
        from_subgraph=False,
    ):
        """
        Given a node, this function quantizes the inputs as follows:
            - If an input is an initializer, quantize the initializer data and replace the old initializer
              with a new one.
            - Else, add QuantizeLinear nodes to perform quantization.
            parameter node: node being quantized in NodeProto format.
            parameter indices: input indices to quantize.
            return: (List of quantized input names,
                     List of zero point names used for input quantization,
                     List of scale names used for input quantization,
                     List of new QuantizeLinear nodes created)
        """
        scale_names = []
        zero_point_names = []
        quantized_input_names = []
        nodes = []

        for input_index in indices:
            node_input = node.input[input_index]

            # Find out whether this input is already quantized
            if node_input in self.quantized_value_map:
                quantized_value = self.quantized_value_map[node_input]
                scale_names.append(quantized_value.scale_name)
                zero_point_names.append(quantized_value.zp_name)
                quantized_input_names.append(quantized_value.q_name)
                continue

            # Quantize the input
            initializer = find_by_name(node_input, self.model.initializer())
            if initializer is not None:
                if self.per_channel and op_level_per_channel:
                    (q_weight_name, zp_name, scale_name,) = self.quantize_weight_per_channel(
                        initializer.name,
                        self.weight_qType if initializer_use_weight_qType else self.activation_qType,
                        axis,
                        reduce_range,
                    )
                else:
                    q_weight_name, zp_name, scale_name = self.quantize_initializer(
                        initializer,
                        self.weight_qType if initializer_use_weight_qType else self.activation_qType,
                        reduce_range,
                    )

                quantized_input_names.append(q_weight_name)
                zero_point_names.append(zp_name)
                scale_names.append(scale_name)
            elif self.contains_tensor(node_input):
                # Add QuantizeLinear node.
                qlinear_node = self.model.find_node_by_name(
                    node_input + "_QuantizeLinear", self.new_nodes, self.model.graph()
                )
                if qlinear_node is None:
                    quantize_input_nodes = self._get_quantize_input_nodes(node, input_index, self.activation_qType)
                    if quantize_input_nodes is None:
                        return (None, None, None, None)
                    if from_subgraph:
                        self.add_new_nodes(quantize_input_nodes)
                    else:
                        nodes.extend(quantize_input_nodes)
                    qlinear_node = quantize_input_nodes[-1]

                if qlinear_node.op_type == "QuantizeLinear":
                    quantized_input_names.extend(qlinear_node.output)
                    scale_names.append(qlinear_node.input[1])
                    zero_point_names.append(qlinear_node.input[2])
                else:
                    quantized_input_names.append(qlinear_node.output[0])
                    scale_names.append(qlinear_node.output[1])
                    zero_point_names.append(qlinear_node.output[2])
            elif self.parent is not None:
                (
                    parent_quantized_input_names,
                    parent_zero_point_names,
                    parent_scale_names,
                    _,
                ) = self.parent.__quantize_inputs(
                    node,
                    [input_index],
                    initializer_use_weight_qType=initializer_use_weight_qType,
                    reduce_range=reduce_range,
                    op_level_per_channel=op_level_per_channel,
                    axis=axis,
                    from_subgraph=True,
                )
                quantized_input_names.append(parent_quantized_input_names[0])
                scale_names.append(parent_scale_names[0])
                zero_point_names.append(parent_zero_point_names[0])
                # These nodes belong to the parent scope and should not be added at this child level.
            else:
                raise ValueError(
                    "Invalid tensor name to quantize: {} @graph scope{}".format(node_input, self.graph_scope)
                )

        return quantized_input_names, zero_point_names, scale_names, nodes

    def quantize_initializer(self, weight, qType, reduce_range=False, keep_float_weight=False):
        """
        :param weight: TensorProto initializer
        :param qType: type to quantize to
        :param keep_float_weight: Whether to keep the float weight. In some cases we only want to compute
                                  scale and zero point; if keep_float_weight is False the weight data is
                                  also quantized, otherwise only scale and zero point initializers are added.
        :return: quantized weight name, zero point name, scale name
        """
        # Find out whether this input is already quantized
        if weight.name in self.quantized_value_map:
            quantized_value = self.quantized_value_map[weight.name]
            return (
                quantized_value.q_name,
                quantized_value.zp_name,
                quantized_value.scale_name,
            )

        q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
        zp_name = weight.name + "_zero_point"
        scale_name = weight.name + "_scale"

        # Update packed weight, zero point, and scale initializers
        weight_data = tensor_proto_to_array(weight)
        _, _, zero_point, scale, q_weight_data = quantize_data(
            weight_data.flatten().tolist(),
            qType,
            self.is_weight_symmetric,
            self.reduce_range and reduce_range,
        )
        scale_initializer = onnx.helper.make_tensor(scale_name, onnx_proto.TensorProto.FLOAT, [], [scale])
        zero_initializer = onnx.helper.make_tensor(zp_name, qType, [], [zero_point])
        self.model.initializer().extend([scale_initializer, zero_initializer])

        if not keep_float_weight:
            q_weight_data = np.asarray(q_weight_data, dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[qType]).reshape(
                weight.dims
            )
            q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name)
            self.model.initializer().extend([q_weight_initializer])

        # Log entry for this quantized weight
        quantized_value = QuantizedValue(
            weight.name,
            q_weight_name,
            scale_name,
            zp_name,
            QuantizedValueType.Initializer,
            None,
        )
        self.quantized_value_map[weight.name] = quantized_value
        return q_weight_name, zp_name, scale_name

    def quantize_weight_per_channel(
        self,
        weight_name,
        weight_qType,
        channel_axis,
        reduce_range=True,
        keep_float_weight=False,
    ):
        # Find out whether this input is already quantized
        if weight_name in self.quantized_value_map:
            quantized_value = self.quantized_value_map[weight_name]
            return (
                quantized_value.q_name,
                quantized_value.zp_name,
                quantized_value.scale_name,
            )

        initializer = find_by_name(weight_name, self.model.initializer())
        if initializer is None:
            raise ValueError("{} is not an initializer".format(weight_name))

        weights = tensor_proto_to_array(initializer)
        channel_count = weights.shape[channel_axis]
        rmin_list = []
        rmax_list = []
        zero_point_list = []
        scale_list = []
        quantized_per_channel_data_list = []
        for i in range(channel_count):
            per_channel_data = weights.take(i, channel_axis)
            rmin, rmax, zero_point, scale, quantized_per_channel_data = quantize_data(
                per_channel_data.flatten().tolist(),
                weight_qType,
                self.is_weight_symmetric or weight_qType == onnx_proto.TensorProto.INT8,
                self.reduce_range and reduce_range,
            )
            rmin_list.append(rmin)
            rmax_list.append(rmax)
            zero_point_list.append(zero_point)
            scale_list.append(scale)
            quantized_per_channel_data_list.append(quantized_per_channel_data)

        # combine per_channel_data into one array
        reshape_dims = list(weights.shape)  # deep copy
        reshape_dims[channel_axis] = 1  # only one entry per channel for the reshape
        quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
        for i in range(1, len(quantized_per_channel_data_list)):
            channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims)
            quantized_weights = np.concatenate((quantized_weights, channel_weights), channel_axis)

        q_weight_name = weight_name + TENSOR_NAME_QUANT_SUFFIX
        zp_name = weight_name + "_zero_point"
        scale_name = weight_name + "_scale"

        quantized_value = QuantizedValue(
            weight_name,
            q_weight_name,
            scale_name,
            zp_name,
            QuantizedValueType.Initializer,
            None,
        )
        self.quantized_value_map[weight_name] = quantized_value

        # Update packed weight, zero point, and scale initializers
        zero_scale_shape = [initializer.dims[channel_axis]]
        scale_initializer = onnx.helper.make_tensor(
            scale_name, onnx_proto.TensorProto.FLOAT, zero_scale_shape, scale_list
        )
        zero_initializer = onnx.helper.make_tensor(zp_name, weight_qType, zero_scale_shape, zero_point_list)
        self.model.initializer().extend([scale_initializer, zero_initializer])

        if not keep_float_weight:
            quantized_weights = np.asarray(
                quantized_weights,
                dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[weight_qType],
            ).reshape(initializer.dims)
            q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
            self.model.initializer().extend([q_weight_initializer])

        return q_weight_name, zp_name, scale_name

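    # Shape note for quantize_weight_per_channel above (editorial illustration, not part of the
    # original source). For a Conv weight of dims [64, 3, 3, 3] quantized with channel_axis = 0,
    # the scale and zero point initializers both get shape [64] (one entry per output channel),
    # while the packed weight keeps the original dims.
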
    def _dequantize_value(self, value_name):
        """
        Given a value (input/output) which is quantized, add a DequantizeLinear node to dequantize
        it back to float32.
            parameter value_name: value to dequantize
            return: None if there is already a DequantizeLinear node that dequantizes it;
                    a DequantizeLinear node otherwise
        """
        if (value_name in self.quantized_value_map) and (value_name not in self.generated_value_names):
            quantized_value = self.quantized_value_map[value_name]
            # Add a DequantizeLinear node for this value
            dqlinear_name = value_name + "_DequantizeLinear"
            dqlinear_node = self.model.find_node_by_name(dqlinear_name, self.new_nodes, self.model.graph())
            if dqlinear_node is None:
                dqlinear_inputs = [
                    quantized_value.q_name,
                    quantized_value.scale_name,
                    quantized_value.zp_name,
                ]
                dequantize_node = onnx.helper.make_node(
                    "DequantizeLinear", dqlinear_inputs, [value_name], dqlinear_name
                )
                return dequantize_node
            else:
                # DQ op is already present; assert its output matches the input of the current node
                assert value_name == dqlinear_node.output[0]
        return None

    def _dequantize_outputs(self):
        """
        Dequantize graph outputs that were quantized; new DequantizeLinear nodes are appended to self.new_nodes.
        """
        for output in self.model.graph().output:
            dequantize_node = self._dequantize_value(output.name)
            if dequantize_node is not None:
                self.new_nodes.append(dequantize_node)

    def calculate_quantization_params(self):
        if self.tensors_range is None:
            return

        # adjust tensor_ranges for inputs of Clip and Relu nodes
        for node in self.model.nodes():
            if node.op_type not in ["Clip", "Relu"]:
                continue
            if self.is_activation_symmetric:
                continue
            if not self.should_quantize_node(node):
                continue
            if len(self.model.input_name_to_nodes()[node.input[0]]) != 1:
                continue
            if node.input[0] not in self.tensors_range.keys() or node.output[0] not in self.tensors_range.keys():
                continue
            self.tensors_range[node.input[0]] = self.tensors_range[node.output[0]]

        quantization_params = {}
        for tensor_name in self.tensors_range.keys():
            rmin, rmax = self.tensors_range[tensor_name]
            qmin, qmax = get_qmin_qmax_for_qType(self.activation_qType, symmetric=self.is_activation_symmetric)

            quantization_params[tensor_name] = compute_scale_zp(rmin, rmax, qmin, qmax, self.is_activation_symmetric)

        return quantization_params
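
# Usage sketch (editorial illustration, not part of the original module). It only exercises the
# constructor signature defined in this file; the public quantize_static/quantize_dynamic helpers
# are the usual entry points, and the tensor ranges below are illustrative placeholders:
#
#     import numpy as np
#     import onnx
#     from onnxruntime.quantization.quant_utils import QuantizationMode, QuantType
#
#     model = onnx.load("model.onnx")
#     quantizer = ONNXQuantizer(
#         model,
#         per_channel=False,
#         reduce_range=False,
#         mode=QuantizationMode.QLinearOps,
#         static=True,
#         weight_qType=QuantType.QInt8,
#         activation_qType=QuantType.QUInt8,
#         tensors_range={"input": [np.float32(0.0), np.float32(1.0)]},
#         nodes_to_quantize=[],
#         nodes_to_exclude=[],
#         op_types_to_quantize=["Conv", "MatMul"],
#     )
#     onnx.save(quantizer.quantize_model(), "model-quant.onnx")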