m2m model translation
#!/usr/bin/env python
# coding: utf-8
# --------------------------------------------------------------------------
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import abc
import itertools
import uuid
from enum import Enum
from pathlib import Path
from typing import Optional, Sequence

import numpy as np
import onnx
from onnx import ModelProto, TensorProto, helper, numpy_helper

import onnxruntime

from .quant_utils import apply_plot, clone_model_with_shape_infer, load_model, smooth_distribution

class CalibrationMethod(Enum):
    MinMax = 0
    Entropy = 1
    Percentile = 2


class CalibrationDataReader(metaclass=abc.ABCMeta):
    @classmethod
    def __subclasshook__(cls, subclass):
        return (hasattr(subclass, "get_next") and callable(subclass.get_next)) or NotImplemented

    @abc.abstractmethod
    def get_next(self) -> dict:
        """generate the input data dict for the ONNX InferenceSession run"""
        raise NotImplementedError

    def __iter__(self):
        return self

    def __next__(self):
        result = self.get_next()
        if result is None:
            raise StopIteration
        return result
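

# A minimal sketch (not part of the original module) of a concrete data reader,
# showing the contract that collect_data() relies on: get_next() returns a
# {input name: numpy array} feed dict, and None once the data is exhausted.
# The class name and the input name "input" below are hypothetical.
#
#     class RandomDataReader(CalibrationDataReader):
#         def __init__(self, num_batches=8, shape=(1, 3, 224, 224)):
#             # Pre-generate the calibration batches; a real reader would load
#             # preprocessed samples from disk instead.
#             self.data = iter(
#                 [{"input": np.random.rand(*shape).astype(np.float32)} for _ in range(num_batches)]
#             )
#
#         def get_next(self) -> dict:
#             # Next feed dict, or None to signal the end of the dataset.
#             return next(self.data, None)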


class CalibraterBase:
    def __init__(
        self,
        model,
        op_types_to_calibrate: Optional[Sequence[str]] = None,
        augmented_model_path="augmented_model.onnx",
        symmetric=False,
        use_external_data_format=False,
    ):
        """
        :param model: ONNX model to calibrate. It can be a ModelProto or a model path.
        :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
        :param augmented_model_path: save augmented model to this path.
        :param symmetric: make range of tensor symmetric (central point is 0).
        :param use_external_data_format: use external data format to store a model whose size is >= 2GB.
        """
        if isinstance(model, str):
            self.model = load_model(Path(model), False)
        elif isinstance(model, Path):
            self.model = load_model(model, False)
        elif isinstance(model, ModelProto):
            self.model = model
        else:
            raise ValueError("model should be either a model path or an onnx.ModelProto.")

        self.op_types_to_calibrate = op_types_to_calibrate
        self.augmented_model_path = augmented_model_path
        self.symmetric = symmetric
        self.use_external_data_format = use_external_data_format

        self.augment_model = None
        self.infer_session = None
        self.execution_providers = ["CPUExecutionProvider"]

    def set_execution_providers(self, execution_providers=["CPUExecutionProvider"]):
        """
        Reset the execution providers used by collect_data. This triggers re-creation of the inference session.
        """
        self.execution_providers = execution_providers
        self.create_inference_session()

    def create_inference_session(self):
        """
        Create an ONNX Runtime InferenceSession.
        """
        sess_options = onnxruntime.SessionOptions()
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
        self.infer_session = onnxruntime.InferenceSession(
            self.augmented_model_path,
            sess_options=sess_options,
            providers=self.execution_providers,
        )

    def select_tensors_to_calibrate(self, model):
        """
        Select the input/output tensors of all quantization-candidate op type nodes.
        returns:
            tensors (set): set of tensor names.
            value_infos (dict): tensor name to value info.
        """
        value_infos = {vi.name: vi for vi in model.graph.value_info}
        value_infos.update({ot.name: ot for ot in model.graph.output})
        value_infos.update({it.name: it for it in model.graph.input})
        initializer = {init.name for init in model.graph.initializer}

        tensors_to_calibrate = set()
        tensor_type_to_calibrate = {TensorProto.FLOAT, TensorProto.FLOAT16}

        for node in model.graph.node:
            if not self.op_types_to_calibrate or node.op_type in self.op_types_to_calibrate:
                for tensor_name in itertools.chain(node.input, node.output):
                    if tensor_name in value_infos:
                        vi = value_infos[tensor_name]
                        if (
                            vi.type.HasField("tensor_type")
                            and (vi.type.tensor_type.elem_type in tensor_type_to_calibrate)
                            and (tensor_name not in initializer)
                        ):
                            tensors_to_calibrate.add(tensor_name)

        return tensors_to_calibrate, value_infos

    def get_augment_model(self):
        """
        return: augmented onnx model
        """
        return self.augment_model

    def augment_graph(self):
        """
        Abstract method: augment the input model to prepare for collecting data. It will:
            1. save the augmented model to augmented_model_path.
            2. set self.augment_model.
        """
        raise NotImplementedError

    def collect_data(self, data_reader: CalibrationDataReader):
        """
        Abstract method: collect the tensors that will be used for range computation. It can be called multiple times.
        """
        raise NotImplementedError

    def compute_range(self, data_reader: CalibrationDataReader):
        """
        Abstract method: compute the [min, max] range for the tensors to calibrate based on the collected data.
        """
        raise NotImplementedError


class MinMaxCalibrater(CalibraterBase):
    def __init__(
        self,
        model,
        op_types_to_calibrate: Optional[Sequence[str]] = None,
        augmented_model_path="augmented_model.onnx",
        symmetric=False,
        use_external_data_format=False,
        moving_average=False,
        averaging_constant=0.01,
    ):
        """
        :param model: ONNX model to calibrate. It can be a ModelProto or a model path.
        :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
        :param augmented_model_path: save augmented model to this path.
        :param symmetric: make range of tensor symmetric (central point is 0).
        :param use_external_data_format: use external data format to store a model whose size is >= 2GB.
        :param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum.
        :param averaging_constant: constant smoothing factor to use when computing the moving average.
        """
        super().__init__(
            model,
            op_types_to_calibrate=op_types_to_calibrate,
            augmented_model_path=augmented_model_path,
            symmetric=symmetric,
            use_external_data_format=use_external_data_format,
        )
        self.intermediate_outputs = []
        self.calibrate_tensors_range = None
        self.num_model_outputs = len(self.model.graph.output)
        self.model_original_outputs = {output.name for output in self.model.graph.output}
        self.moving_average = moving_average
        if moving_average and (averaging_constant < 0 or averaging_constant > 1):
            raise ValueError("Invalid averaging constant, which should be between 0 and 1.")
        self.averaging_constant = averaging_constant

    def augment_graph(self):
        """
        Adds ReduceMin and ReduceMax nodes to all quantization-candidate op type nodes in the
        model and ensures their outputs are stored as part of the graph output.
        :return: augmented ONNX model
        """
        model = clone_model_with_shape_infer(self.model)

        tensors, _ = self.select_tensors_to_calibrate(model)
        reshape_shape_name = str(uuid.uuid4())
        reshape_shape = numpy_helper.from_array(np.array([1], dtype=np.int64), reshape_shape_name)
        model.graph.initializer.append(reshape_shape)

        def add_reduce_min_max(tensor_name, reduce_op_name):
            # When doing ReduceMax/ReduceMin, ORT can't reduce on a dim with value of 0 if 'keepdims' is false.
            # To keep the code simple, we always set keepdims to 1.
            keepdims = 1

            # Adding ReduceMin/ReduceMax nodes: ReduceMin/ReduceMax -> Reshape -> (output)
            reduce_output = tensor_name + "_" + reduce_op_name
            intermediate_output = reduce_output + "_Reshape"
            reduce_node = onnx.helper.make_node(
                reduce_op_name, [tensor_name], [intermediate_output], keepdims=keepdims, name=reduce_output
            )

            reshape_node = onnx.helper.make_node(
                "Reshape",
                inputs=[intermediate_output, reshape_shape_name],
                outputs=[reduce_output],
                name=intermediate_output,
            )

            model.graph.node.extend([reduce_node, reshape_node])
            model.graph.output.append(helper.make_tensor_value_info(reduce_output, TensorProto.FLOAT, [1]))

        for tensor in tensors:
            add_reduce_min_max(tensor, "ReduceMin")
            add_reduce_min_max(tensor, "ReduceMax")

        onnx.save(
            model,
            self.augmented_model_path,
            save_as_external_data=self.use_external_data_format,
        )
        self.augment_model = model

    def clear_collected_data(self):
        self.intermediate_outputs = []

    def collect_data(self, data_reader: CalibrationDataReader):
        while True:
            inputs = data_reader.get_next()
            if not inputs:
                break
            self.intermediate_outputs.append(self.infer_session.run(None, inputs))

        if len(self.intermediate_outputs) == 0:
            raise ValueError("No data is collected.")

        self.compute_range()
        self.clear_collected_data()

    def merge_range(self, old_range, new_range):
        if not old_range:
            return new_range

        for key, value in old_range.items():
            if self.moving_average:
                min_value = value[0] + self.averaging_constant * (new_range[key][0] - value[0])
                max_value = value[1] + self.averaging_constant * (new_range[key][1] - value[1])
            else:
                min_value = min(value[0], new_range[key][0])
                max_value = max(value[1], new_range[key][1])
            new_range[key] = (min_value, max_value)

        return new_range
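
    # Worked example (illustrative, not from the original source): with
    # averaging_constant=0.01, an old range (-1.0, 1.0) and a new batch range
    # (-2.0, 3.0) merge to
    #     min = -1.0 + 0.01 * (-2.0 - (-1.0)) = -1.01
    #     max =  1.0 + 0.01 * ( 3.0 -   1.0 ) =  1.02
    # whereas the plain min/max mode would widen the range to (-2.0, 3.0).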

    def compute_range(self):
        """
        Compute the min-max range of each tensor to calibrate.
        :return: dictionary mapping {tensor name: (min value, max value)}
        """
        if len(self.intermediate_outputs) == 0:
            return self.calibrate_tensors_range

        output_names = [self.infer_session.get_outputs()[i].name for i in range(len(self.intermediate_outputs[0]))]
        output_dicts_list = [
            dict(zip(output_names, intermediate_output)) for intermediate_output in self.intermediate_outputs
        ]

        merged_output_dict = {}
        for d in output_dicts_list:
            for k, v in d.items():
                merged_output_dict.setdefault(k, []).append(v)
        added_output_names = output_names[self.num_model_outputs :]
        calibrate_tensor_names = [
            added_output_names[i].rpartition("_")[0] for i in range(0, len(added_output_names), 2)
        ]  # output names

        merged_added_output_dict = {
            i: merged_output_dict[i] for i in merged_output_dict if i not in self.model_original_outputs
        }

        pairs = []
        for i in range(0, len(added_output_names), 2):
            min_value = 0
            max_value = 0
            if self.moving_average:
                min_value_array = np.mean(merged_added_output_dict[added_output_names[i]], axis=0)
                max_value_array = np.mean(merged_added_output_dict[added_output_names[i + 1]], axis=0)
            else:
                min_value_array = min(merged_added_output_dict[added_output_names[i]])
                max_value_array = max(merged_added_output_dict[added_output_names[i + 1]])
            if isinstance(min_value_array, int) or min_value_array.size > 0:
                min_value = float(min_value_array)
            if isinstance(max_value_array, int) or max_value_array.size > 0:
                max_value = float(max_value_array)

            if self.symmetric:
                max_absolute_value = max(abs(min_value), abs(max_value))
                pairs.append((-max_absolute_value, max_absolute_value))
            else:
                pairs.append((min_value, max_value))

        new_calibrate_tensors_range = dict(zip(calibrate_tensor_names, pairs))
        if self.calibrate_tensors_range:
            self.calibrate_tensors_range = self.merge_range(self.calibrate_tensors_range, new_calibrate_tensors_range)
        else:
            self.calibrate_tensors_range = new_calibrate_tensors_range

        return self.calibrate_tensors_range


class HistogramCalibrater(CalibraterBase):
    def __init__(
        self,
        model,
        op_types_to_calibrate: Optional[Sequence[str]] = None,
        augmented_model_path="augmented_model.onnx",
        use_external_data_format=False,
        method="percentile",
        symmetric=False,
        num_bins=128,
        num_quantized_bins=2048,
        percentile=99.999,
    ):
        """
        :param model: ONNX model to calibrate. It can be a ModelProto or a model path.
        :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
        :param augmented_model_path: save augmented model to this path.
        :param use_external_data_format: use external data format to store a model whose size is >= 2GB.
        :param method: A string. One of ['entropy', 'percentile'].
        :param symmetric: make range of tensor symmetric (central point is 0).
        :param num_bins: number of bins to create a new histogram for collecting tensor values. Default 128.
        :param num_quantized_bins: number of quantized bins. Default 2048.
        :param percentile: A float number between [0, 100]. Default 99.999.
        """
        super().__init__(
            model,
            op_types_to_calibrate=op_types_to_calibrate,
            augmented_model_path=augmented_model_path,
            symmetric=symmetric,
            use_external_data_format=use_external_data_format,
        )
        self.intermediate_outputs = []
        self.calibrate_tensors_range = None
        self.num_model_outputs = len(self.model.graph.output)
        self.model_original_outputs = {output.name for output in self.model.graph.output}
        self.collector = None
        self.method = method
        self.num_bins = num_bins
        self.num_quantized_bins = num_quantized_bins
        self.percentile = percentile
        self.tensors_to_calibrate = None

    def augment_graph(self):
        """
        Make all quantization-candidate op type nodes part of the graph output.
        :return: augmented ONNX model
        """
        model = clone_model_with_shape_infer(self.model)

        self.tensors_to_calibrate, value_infos = self.select_tensors_to_calibrate(model)
        for tensor in self.tensors_to_calibrate:
            if tensor not in self.model_original_outputs:
                model.graph.output.append(value_infos[tensor])

        onnx.save(
            model,
            self.augmented_model_path,
            save_as_external_data=self.use_external_data_format,
        )
        self.augment_model = model

    def clear_collected_data(self):
        self.intermediate_outputs = []

    def collect_data(self, data_reader: CalibrationDataReader):
        """
        Entropy Calibrator collects operators' tensors as well as generates a tensor histogram for each operator.
        """
        while True:
            inputs = data_reader.get_next()
            if not inputs:
                break
            self.intermediate_outputs.append(self.infer_session.run(None, inputs))

        if len(self.intermediate_outputs) == 0:
            raise ValueError("No data is collected.")

        output_names = [self.infer_session.get_outputs()[i].name for i in range(len(self.intermediate_outputs[0]))]
        output_dicts_list = [
            dict(zip(output_names, intermediate_output)) for intermediate_output in self.intermediate_outputs
        ]

        merged_dict = {}
        for d in output_dicts_list:
            for k, v in d.items():
                merged_dict.setdefault(k, []).append(v)

        clean_merged_dict = {i: merged_dict[i] for i in merged_dict if i in self.tensors_to_calibrate}

        if not self.collector:
            self.collector = HistogramCollector(
                method=self.method,
                symmetric=self.symmetric,
                num_bins=self.num_bins,
                num_quantized_bins=self.num_quantized_bins,
                percentile=self.percentile,
            )
        self.collector.collect(clean_merged_dict)

        self.clear_collected_data()

    def compute_range(self):
        """
        Compute the min-max range of each tensor to calibrate.
        :return: dictionary mapping {tensor name: (min value, max value)}
        """
        if not self.collector:
            raise ValueError("No collector created; can't generate calibration data.")

        return self.collector.compute_collection_result()


class EntropyCalibrater(HistogramCalibrater):
    def __init__(
        self,
        model,
        op_types_to_calibrate: Optional[Sequence[str]] = None,
        augmented_model_path="augmented_model.onnx",
        use_external_data_format=False,
        method="entropy",
        symmetric=False,
        num_bins=128,
        num_quantized_bins=128,
    ):
        """
        :param model: ONNX model to calibrate. It can be a ModelProto or a model path.
        :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
        :param augmented_model_path: save augmented model to this path.
        :param use_external_data_format: use external data format to store a model whose size is >= 2GB.
        :param method: A string. One of ['entropy', 'percentile'].
        :param symmetric: make range of tensor symmetric (central point is 0).
        :param num_bins: number of bins to create a new histogram for collecting tensor values. Default 128.
        :param num_quantized_bins: number of quantized bins. Default 128.
        """
        super().__init__(
            model,
            op_types_to_calibrate,
            augmented_model_path,
            use_external_data_format,
            method=method,
            symmetric=symmetric,
            num_bins=num_bins,
            num_quantized_bins=num_quantized_bins,
        )


class PercentileCalibrater(HistogramCalibrater):
    def __init__(
        self,
        model,
        op_types_to_calibrate: Optional[Sequence[str]] = None,
        augmented_model_path="augmented_model.onnx",
        use_external_data_format=False,
        method="percentile",
        symmetric=False,
        num_bins=2048,
        percentile=99.999,
    ):
        """
        :param model: ONNX model to calibrate. It can be a ModelProto or a model path.
        :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
        :param augmented_model_path: save augmented model to this path.
        :param use_external_data_format: use external data format to store a model whose size is >= 2GB.
        :param method: A string. One of ['entropy', 'percentile'].
        :param symmetric: make range of tensor symmetric (central point is 0).
        :param num_bins: number of bins to create a new histogram for collecting tensor values. Default 2048.
        :param percentile: A float number between [0, 100]. Default 99.999.
        """
        super().__init__(
            model,
            op_types_to_calibrate,
            augmented_model_path,
            use_external_data_format,
            method=method,
            symmetric=symmetric,
            num_bins=num_bins,
            percentile=percentile,
        )


class CalibrationDataCollector(metaclass=abc.ABCMeta):
    """
    Base class for collecting data for calibration-based quantization.
    """

    @abc.abstractmethod
    def collect(self, name_to_arr):
        """
        Generate informative data based on given data.
        name_to_arr : dict
            tensor name to NDArray data
        """
        raise NotImplementedError

    @abc.abstractmethod
    def compute_collection_result(self):
        """
        Get the optimal result among collected data.
        """
        raise NotImplementedError


class HistogramCollector(CalibrationDataCollector):
    """
    Collect a histogram for each tensor. Percentile and Entropy methods are supported.
    ref: https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py
    ref: https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/_modules/
         pytorch_quantization/calib/histogram.html
    """

    def __init__(self, method, symmetric, num_bins, num_quantized_bins, percentile):
        self.histogram_dict = {}
        self.method = method
        self.symmetric = symmetric
        self.num_bins = num_bins
        self.num_quantized_bins = num_quantized_bins
        self.percentile = percentile

    def get_histogram_dict(self):
        return self.histogram_dict

    def collect(self, name_to_arr):
        print("Collecting tensor data and making histogram ...")

        # TODO: Currently we have different collect() for entropy and percentile method respectively.
        #       Need unified collect in the future.
        if self.method == "entropy":
            return self.collect_value(name_to_arr)
        elif self.method == "percentile":
            if self.symmetric:
                return self.collect_absolute_value(name_to_arr)
            else:
                return self.collect_value(name_to_arr)
        else:
            raise ValueError("Only 'entropy' or 'percentile' methods are supported")

    def collect_absolute_value(self, name_to_arr):
        """
        Collect histogram on absolute value
        """
        for tensor, data_arr in name_to_arr.items():
            data_arr = np.asarray(data_arr)
            data_arr = data_arr.flatten()
            if data_arr.size > 0:
                min_value = np.min(data_arr)
                max_value = np.max(data_arr)
            else:
                min_value = 0
                max_value = 0
            data_arr = np.absolute(data_arr)  # only consider absolute value
            if tensor not in self.histogram_dict:
                # first time, use num_bins to compute the histogram.
                hist, hist_edges = np.histogram(data_arr, bins=self.num_bins)
                self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value)
            else:
                old_histogram = self.histogram_dict[tensor]
                old_min = old_histogram[2]
                old_max = old_histogram[3]
                old_hist = old_histogram[0]
                old_hist_edges = old_histogram[1]
                temp_amax = np.max(data_arr)
                if temp_amax > old_hist_edges[-1]:
                    # increase the number of bins
                    width = old_hist_edges[1] - old_hist_edges[0]
                    # NOTE: np.arange may create an extra bin after the one containing temp_amax
                    new_bin_edges = np.arange(old_hist_edges[-1] + width, temp_amax + width, width)
                    old_hist_edges = np.hstack((old_hist_edges, new_bin_edges))
                hist, hist_edges = np.histogram(data_arr, bins=old_hist_edges)
                hist[: len(old_hist)] += old_hist
                self.histogram_dict[tensor] = (hist, hist_edges, min(old_min, min_value), max(old_max, max_value))

    def collect_value(self, name_to_arr):
        """
        Collect histogram on real value
        """
        for tensor, data_arr in name_to_arr.items():
            data_arr = np.asarray(data_arr)
            data_arr = data_arr.flatten()

            if data_arr.size > 0:
                min_value = np.min(data_arr)
                max_value = np.max(data_arr)
            else:
                min_value = 0
                max_value = 0

            threshold = max(abs(min_value), abs(max_value))

            if tensor in self.histogram_dict:
                old_histogram = self.histogram_dict[tensor]
                self.histogram_dict[tensor] = self.merge_histogram(
                    old_histogram, data_arr, min_value, max_value, threshold
                )
            else:
                hist, hist_edges = np.histogram(data_arr, self.num_bins, range=(-threshold, threshold))
                self.histogram_dict[tensor] = (
                    hist,
                    hist_edges,
                    min_value,
                    max_value,
                    threshold,
                )

    def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_threshold):
        (old_hist, old_hist_edges, old_min, old_max, old_threshold) = old_histogram

        if new_threshold <= old_threshold:
            new_hist, _ = np.histogram(data_arr, len(old_hist), range=(-old_threshold, old_threshold))
            return (
                new_hist + old_hist,
                old_hist_edges,
                min(old_min, new_min),
                max(old_max, new_max),
                old_threshold,
            )
        else:
            if old_threshold == 0:
                hist, hist_edges = np.histogram(data_arr, len(old_hist), range=(-new_threshold, new_threshold))
                hist += old_hist
            else:
                old_num_bins = len(old_hist)
                old_stride = 2 * old_threshold / old_num_bins
                half_increased_bins = int((new_threshold - old_threshold) // old_stride + 1)
                new_num_bins = old_num_bins + 2 * half_increased_bins
                new_threshold = half_increased_bins * old_stride + old_threshold
                hist, hist_edges = np.histogram(data_arr, new_num_bins, range=(-new_threshold, new_threshold))
                hist[half_increased_bins : new_num_bins - half_increased_bins] += old_hist
            return (
                hist,
                hist_edges,
                min(old_min, new_min),
                max(old_max, new_max),
                new_threshold,
            )
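
    # Worked example (illustrative, not from the original source): with
    # old_threshold=1.0 over 4 bins, old_stride = 2 * 1.0 / 4 = 0.5. A batch
    # with new_threshold=1.6 gives half_increased_bins = int(0.6 // 0.5 + 1) = 2,
    # new_num_bins = 4 + 2 * 2 = 8, and the threshold snaps to
    # 2 * 0.5 + 1.0 = 2.0, so the old histogram is added into bins [2:6] of
    # the re-binned one and the bin width stays 0.5.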

    def compute_collection_result(self):
        if not self.histogram_dict:
            raise ValueError("Histogram has not been collected. Please run collect() first.")
        print("Finding optimal threshold for each tensor using {} algorithm ...".format(self.method))

        if self.method == "entropy":
            return self.compute_entropy()
        elif self.method == "percentile":
            return self.compute_percentile()
        else:
            raise ValueError("Only 'entropy' or 'percentile' methods are supported")

    def compute_percentile(self):
        if self.percentile < 0 or self.percentile > 100:
            raise ValueError("Invalid percentile. Must be in range 0 <= percentile <= 100.")

        histogram_dict = self.histogram_dict
        percentile = self.percentile

        thresholds_dict = {}  # per tensor thresholds

        print("Number of tensors : {}".format(len(histogram_dict)))
        print("Number of histogram bins : {}".format(self.num_bins))
        print("Percentile : ({},{})".format(100.0 - percentile, percentile))

        for tensor, histogram in histogram_dict.items():
            hist = histogram[0]
            hist_edges = histogram[1]
            total = hist.sum()
            cdf = np.cumsum(hist / total)
            if self.symmetric:
                idx_right = np.searchsorted(cdf, percentile / 100.0)
                thresholds_dict[tensor] = (
                    -float(hist_edges[idx_right]),
                    float(hist_edges[idx_right]),
                )
            else:
                percent_to_cut_one_side = (100.0 - percentile) / 200.0
                idx_right = np.searchsorted(cdf, 1.0 - percent_to_cut_one_side)
                idx_left = np.searchsorted(cdf, percent_to_cut_one_side)
                thresholds_dict[tensor] = (
                    float(hist_edges[idx_left]),
                    float(hist_edges[idx_right]),
                )
            min_value = histogram[2]
            max_value = histogram[3]
            if thresholds_dict[tensor][0] < min_value:
                thresholds_dict[tensor] = (min_value, thresholds_dict[tensor][1])
            if thresholds_dict[tensor][1] > max_value:
                thresholds_dict[tensor] = (thresholds_dict[tensor][0], max_value)
            # Plot histogram for debug only
            if False:
                apply_plot(hist, hist_edges)

        return thresholds_dict

    def compute_entropy(self):
        histogram_dict = self.histogram_dict
        num_quantized_bins = self.num_quantized_bins

        thresholds_dict = {}  # per tensor thresholds

        print("Number of tensors : {}".format(len(histogram_dict)))
        print(
            "Number of histogram bins : {} (The number may increase depending on the data it collects)".format(
                self.num_bins
            )
        )
        print("Number of quantized bins : {}".format(self.num_quantized_bins))

        for tensor, histogram in histogram_dict.items():
            optimal_threshold = self.get_entropy_threshold(histogram, num_quantized_bins)
            thresholds_dict[tensor] = optimal_threshold

            # Plot histogram for debug only
            if False:
                apply_plot(histogram[0], histogram[1])

        return thresholds_dict

    def get_entropy_threshold(self, histogram, num_quantized_bins):
        """Given a dataset, find the optimal threshold for quantizing it.
        The reference distribution is `q`, and the candidate distribution is `p`.
        `q` is a truncated version of the original distribution.
        Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
        """
        import copy

        from scipy.stats import entropy

        hist = histogram[0]
        hist_edges = histogram[1]
        num_bins = hist.size
        zero_bin_index = num_bins // 2
        num_half_quantized_bin = num_quantized_bins // 2

        kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1)
        thresholds = [(0, 0) for _ in range(kl_divergence.size)]

        # <------------ num bins ---------------->
        #        <--- quantized bins ---->
        # |======|===========|===========|=======|
        #                    zero bin index
        #        ^                       ^
        #        |                       |
        #   start index               end index       (start of iteration)
        #      ^                             ^
        #      |                             |
        #  start index                   end index    ...
        # ^                                        ^
        # |                                        |
        # start index                       end index (end of iteration)

        for i in range(num_half_quantized_bin, zero_bin_index + 1, 1):
            start_index = zero_bin_index - i
            end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins

            thresholds[i - num_half_quantized_bin] = (
                float(hist_edges[start_index]),
                float(hist_edges[end_index]),
            )

            sliced_distribution = copy.deepcopy(hist[start_index:end_index])

            # reference distribution p
            p = sliced_distribution.copy()  # a copy of np array
            left_outliers_count = sum(hist[:start_index])
            right_outliers_count = sum(hist[end_index:])
            p[0] += left_outliers_count
            p[-1] += right_outliers_count

            # nonzeros[i] indicates whether p[i] is non-zero
            nonzeros = (p != 0).astype(np.int64)

            # quantize p.size bins into quantized bins (default 128 bins)
            quantized_bins = np.zeros(num_quantized_bins, dtype=np.int64)
            num_merged_bins = sliced_distribution.size // num_quantized_bins

            # merge bins into quantized bins
            for index in range(num_quantized_bins):
                start = index * num_merged_bins
                end = start + num_merged_bins
                quantized_bins[index] = sum(sliced_distribution[start:end])
            quantized_bins[-1] += sum(sliced_distribution[num_quantized_bins * num_merged_bins :])

            # in order to compare p and q, we need to make the length of q equal to the length of p
            # expand quantized bins into p.size bins
            q = np.zeros(p.size, dtype=np.int64)
            for index in range(num_quantized_bins):
                start = index * num_merged_bins
                end = start + num_merged_bins
                norm = sum(nonzeros[start:end])
                if norm != 0:
                    q[start:end] = float(quantized_bins[index]) / float(norm)

            p = smooth_distribution(p)
            q = smooth_distribution(q)

            if isinstance(q, np.ndarray):
                kl_divergence[i - num_half_quantized_bin] = entropy(p, q)
            else:
                kl_divergence[i - num_half_quantized_bin] = float("inf")

        min_kl_divergence_idx = np.argmin(kl_divergence)
        optimal_threshold = thresholds[min_kl_divergence_idx]
        min_value = histogram[2]
        max_value = histogram[3]
        if optimal_threshold[0] < min_value:
            optimal_threshold = (min_value, optimal_threshold[1])
        if optimal_threshold[1] > max_value:
            optimal_threshold = (optimal_threshold[0], max_value)
        return optimal_threshold


def create_calibrator(
    model,
    op_types_to_calibrate: Optional[Sequence[str]] = None,
    augmented_model_path="augmented_model.onnx",
    calibrate_method=CalibrationMethod.MinMax,
    use_external_data_format=False,
    extra_options={},
):
    calibrator = None
    if calibrate_method == CalibrationMethod.MinMax:
        # default settings for min-max algorithm
        symmetric = extra_options.get("symmetric", False)
        moving_average = extra_options.get("moving_average", False)
        averaging_constant = extra_options.get("averaging_constant", 0.01)
        calibrator = MinMaxCalibrater(
            model,
            op_types_to_calibrate,
            augmented_model_path,
            use_external_data_format=use_external_data_format,
            symmetric=symmetric,
            moving_average=moving_average,
            averaging_constant=averaging_constant,
        )
    elif calibrate_method == CalibrationMethod.Entropy:
        # default settings for entropy algorithm
        num_bins = extra_options.get("num_bins", 128)
        num_quantized_bins = extra_options.get("num_quantized_bins", 128)
        symmetric = extra_options.get("symmetric", False)
        calibrator = EntropyCalibrater(
            model,
            op_types_to_calibrate,
            augmented_model_path,
            use_external_data_format=use_external_data_format,
            symmetric=symmetric,
            num_bins=num_bins,
            num_quantized_bins=num_quantized_bins,
        )
    elif calibrate_method == CalibrationMethod.Percentile:
        # default settings for percentile algorithm
        num_bins = extra_options.get("num_bins", 2048)
        percentile = extra_options.get("percentile", 99.999)
        symmetric = extra_options.get("symmetric", True)
        calibrator = PercentileCalibrater(
            model,
            op_types_to_calibrate,
            augmented_model_path,
            use_external_data_format=use_external_data_format,
            symmetric=symmetric,
            num_bins=num_bins,
            percentile=percentile,
        )

    if calibrator:
        calibrator.augment_graph()
        calibrator.create_inference_session()
        return calibrator

    raise ValueError("Unsupported calibration method {}".format(calibrate_method))
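

# A hedged end-to-end sketch (not part of the original module): wire
# create_calibrator() to a CalibrationDataReader implementation such as the
# RandomDataReader sketched above. "model.onnx" is a placeholder path.
#
#     calibrator = create_calibrator(
#         "model.onnx",
#         op_types_to_calibrate=["Conv", "MatMul"],
#         calibrate_method=CalibrationMethod.MinMax,
#     )
#     calibrator.collect_data(RandomDataReader())
#     tensor_ranges = calibrator.compute_range()  # {tensor name: (min, max)}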