import itertools

import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
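
# This module provides the MatMul quantization handlers for three quantization
# styles: MatMulInteger (QuantizationMode.IntegerOps), QLinearMatMul
# (QuantizationMode.QLinearOps), and QDQMatMul (QDQ format). QOpMatMul holds
# the should_quantize() gating logic shared by the first two.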


class QOpMatMul(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            return False

        if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
            not self.quantizer.is_float_tensor(self.node.input[0])
        ):
            return False

        # do not quantize non-constant B matrices for matmul
        if self.quantizer.q_matmul_const_b_only:
            if not self.quantizer.find_initializer_in_path(self.node.input[1]):
                print("Ignore MatMul due to non constant B: {}[{}]".format(self.quantizer.graph_scope, self.node.name))
                return False

        return True
  23. """
  24. Used when quantize mode is QuantizationMode.IntegerOps.
  25. """
  26. class MatMulInteger(QOpMatMul):
  27. def __init__(self, onnx_quantizer, onnx_node):
  28. super().__init__(onnx_quantizer, onnx_node)
  29. def quantize(self):
  30. node = self.node
  31. assert node.op_type == "MatMul"
  32. # Get Quantized from both activation(input[0]) and weight(input[1])
  33. (
  34. quantized_input_names,
  35. zero_point_names,
  36. scale_names,
  37. nodes,
  38. ) = self.quantizer.quantize_activation(node, [0])
  39. (
  40. quantized_input_names_weight,
  41. zero_point_names_weight,
  42. scale_names_weight,
  43. nodes_weight,
  44. ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
  45. quantized_input_names.extend(quantized_input_names_weight)
  46. zero_point_names.extend(zero_point_names_weight)
  47. scale_names.extend(scale_names_weight)
  48. nodes.extend(nodes_weight)
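
        # Reconstruct the float output from the int32 result:
        #     output ~= Cast(MatMulInteger(A_q, B_q, a_zp, b_zp), float) * (a_scale * b_scale)
        # The Cast and the two Mul nodes below implement this rescaling.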
        matmul_integer_output = node.output[0] + "_output_quantized"
        matmul_integer_name = node.name + "_quant" if node.name != "" else ""
        matmul_integer_node = onnx.helper.make_node(
            "MatMulInteger",
            quantized_input_names + zero_point_names,
            [matmul_integer_output],
            matmul_integer_name,
        )
        nodes.append(matmul_integer_node)

        # Add a Cast to bring the MatMulInteger output back to float.
        cast_op_output = matmul_integer_output + "_cast_output"
        cast_node = onnx.helper.make_node(
            "Cast",
            [matmul_integer_output],
            [cast_op_output],
            matmul_integer_output + "_cast",
            to=onnx_proto.TensorProto.FLOAT,
        )
        nodes.append(cast_node)

        # Add a Mul to multiply the scales of the two inputs.
        assert len(scale_names) == 2
        scales_mul_op = (
            matmul_integer_name + "_scales_mul"
            if matmul_integer_name != ""
            else scale_names[0] + "_" + scale_names[1] + "_mul"
        )

        scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
        if scales_mul_node is None:
            scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
            nodes.append(scales_mul_node)

        scales_mul_op_output = scales_mul_node.output[0]

        # Add a Mul to multiply the scale product with the casted MatMulInteger output,
        # writing the result to the original MatMul node's output name.
        output_scale_mul_op = ""
        if matmul_integer_name != "":
            output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
        nodes.append(
            get_mul_node(
                [cast_op_output, scales_mul_op_output],
                node.output[0],
                output_scale_mul_op,
            )
        )

        self.quantizer.new_nodes += nodes
  93. """
  94. Used when quantize mode is QuantizationMode.QLinearOps
  95. """
  96. class QLinearMatMul(QOpMatMul):
  97. def __init__(self, onnx_quantizer, onnx_node):
  98. super().__init__(onnx_quantizer, onnx_node)
  99. def quantize(self):
  100. node = self.node
  101. assert node.op_type == "MatMul"
  102. # Get Quantized from both activation(input[0]) and weight(input[1])
  103. (
  104. quantized_input_names,
  105. zero_point_names,
  106. scale_names,
  107. nodes,
  108. ) = self.quantizer.quantize_activation(node, [0])
  109. (
  110. quantized_input_names_weight,
  111. zero_point_names_weight,
  112. scale_names_weight,
  113. nodes_weight,
  114. ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
  115. quantized_input_names.extend(quantized_input_names_weight)
  116. zero_point_names.extend(zero_point_names_weight)
  117. scale_names.extend(scale_names_weight)
  118. nodes.extend(nodes_weight)
  119. (
  120. data_found,
  121. output_scale_name,
  122. output_zp_name,
  123. _,
  124. _,
  125. ) = self.quantizer._get_quantization_params(node.output[0])
  126. if not data_found or quantized_input_names is None:
  127. return super().quantize()
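
        # All scales and zero-points are known, so emit a single fused
        # QLinearMatMul node: its eight inputs carry the quantized operands plus
        # the scale/zero-point for each input and for the output, so no separate
        # Cast/Mul requantization subgraph is needed (contrast MatMulInteger above).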
        qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_matmul_name = node.name + "_quant" if node.name != "" else ""

        qlinear_matmul_inputs = []
        # Input 0
        qlinear_matmul_inputs.append(quantized_input_names[0])
        qlinear_matmul_inputs.append(scale_names[0])
        qlinear_matmul_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_matmul_inputs.append(quantized_input_names[1])
        qlinear_matmul_inputs.append(scale_names[1])
        qlinear_matmul_inputs.append(zero_point_names[1])
        # Output quantization parameters
        qlinear_matmul_inputs.append(output_scale_name)
        qlinear_matmul_inputs.append(output_zp_name)

        qlinear_matmul_node = onnx.helper.make_node(
            "QLinearMatMul",
            qlinear_matmul_inputs,
            [qlinear_matmul_output],
            qlinear_matmul_name,
        )
        nodes.append(qlinear_matmul_node)

        # Create an entry for this quantized value.
        q_output = QuantizedValue(
            node.output[0],
            qlinear_matmul_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQMatMul(QDQOperatorBase):
    """Used when quantizing to the QDQ format (explicit QuantizeLinear/DequantizeLinear pairs)."""

    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"

        if self.disable_qdq_for_node_output:
            nodes_to_iterate = node.input
        else:
            nodes_to_iterate = itertools.chain(node.input, node.output)
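
        # Mark each tensor for Q/DQ insertion: initializers are treated as
        # weights (per-channel when enabled), everything else as activations.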
        for tensor_name in nodes_to_iterate:
            # only support per-channel quantization on weight
            if self.quantizer.is_per_channel() and find_by_name(tensor_name, self.quantizer.model.initializer()):
                channel_axis = self.quantizer.qdq_op_type_per_channel_support_to_axis.get(node.op_type, 1)
                self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
            else:
                self.quantizer.quantize_activation_tensor(tensor_name)
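

# Note: these handler classes are not invoked directly. The quantizer looks them
# up by op type from an operator registry (in this package layout, presumably a
# registry module mapping "MatMul" to MatMulInteger, QLinearMatMul, or QDQMatMul
# depending on the selected quantization mode).
#
# A minimal sketch of an entry point that ends up exercising these handlers is
# the dynamic quantization API (the file names here are placeholders):
#
#     from onnxruntime.quantization import QuantType, quantize_dynamic
#
#     quantize_dynamic("model.onnx", "model.quant.onnx", weight_type=QuantType.QInt8)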