# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

# Convert a BERT ONNX model (converted from TensorFlow or exported from PyTorch) to use Attention, Gelu,
# SkipLayerNormalization and EmbedLayerNormalization ops to optimize
# performance on NVidia GPU and CPU.
#
# For a BERT model exported from PyTorch, OnnxRuntime has BERT model optimization support internally.
# You can use the option --use_onnxruntime to check optimizations from OnnxRuntime.
# For a BERT model file like name.onnx, the optimized model for GPU or CPU from OnnxRuntime will be output as
# name_ort_gpu.onnx or name_ort_cpu.onnx in the same directory.
#
# This script is retained for experimental purposes. It is useful in scenarios like the following:
# (1) Change a model from fp32 to fp16 for mixed precision inference on a GPU with Tensor Cores.
# (2) Change the input data type from int64 to int32.
# (3) Some models cannot be handled by OnnxRuntime, and you can modify this script to get an optimized model.
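#
# Illustrative command line (a sketch: "bert.onnx", 12 heads and hidden size 768 are placeholder values,
# and the file is assumed to be saved as optimizer.py). All flags used here are defined in _parse_arguments() below:
#   python optimizer.py --input bert.onnx --output bert_fp16.onnx \
#       --model_type bert --num_heads 12 --hidden_size 768 --float16 --use_gpu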
import argparse
import logging
import os
from typing import Dict, Optional

import coloredlogs
from fusion_options import FusionOptions
from onnx import ModelProto, load_model
from onnx_model_bart import BartOnnxModel
from onnx_model_bert import BertOnnxModel
from onnx_model_bert_keras import BertOnnxModelKeras
from onnx_model_bert_tf import BertOnnxModelTF
from onnx_model_gpt2 import Gpt2OnnxModel
from onnx_model_tnlr import TnlrOnnxModel
from onnx_model_unet import UnetOnnxModel

logger = logging.getLogger(__name__)
# Map model type to tuple: optimizer class, export tool (pytorch, tf2onnx, keras2onnx), and default opt_level
MODEL_TYPES = {
    "bart": (BartOnnxModel, "pytorch", 1),
    "bert": (BertOnnxModel, "pytorch", 1),
    "bert_tf": (BertOnnxModelTF, "tf2onnx", 0),
    "bert_keras": (BertOnnxModelKeras, "keras2onnx", 0),
    "gpt2": (Gpt2OnnxModel, "pytorch", 1),
    "gpt2_tf": (
        Gpt2OnnxModel,
        "tf2onnx",
        0,
    ),  # might add a class for GPT2OnnxModel for TF later.
    "tnlr": (TnlrOnnxModel, "pytorch", 1),
    "unet": (UnetOnnxModel, "pytorch", 1),
}
def optimize_by_onnxruntime(
    onnx_model_path: str,
    use_gpu: bool = False,
    optimized_model_path: Optional[str] = None,
    opt_level: Optional[int] = 99,
    disabled_optimizers=[],
) -> str:
    """
    Use onnxruntime to optimize model.

    Args:
        onnx_model_path (str): the path of the input onnx model.
        use_gpu (bool): whether the optimized model is targeted to run on GPU.
        optimized_model_path (str or None): the path of the optimized model.
        opt_level (int): graph optimization level.
        disabled_optimizers (List[str]): a list of names of disabled optimizers.

    Returns:
        optimized_model_path (str): the path of the optimized model.
    """
    assert opt_level in [1, 2, 99]
    from torch import version as torch_version

    import onnxruntime

    if use_gpu and set(onnxruntime.get_available_providers()).isdisjoint(
        ["CUDAExecutionProvider", "ROCMExecutionProvider", "MIGraphXExecutionProvider"]
    ):
        logger.error("There is no gpu for onnxruntime to do optimization.")
        return onnx_model_path

    sess_options = onnxruntime.SessionOptions()
    if opt_level == 1:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
    elif opt_level == 2:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    else:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    if optimized_model_path is None:
        path_prefix = onnx_model_path[:-5]  # remove ".onnx" suffix
        optimized_model_path = "{}_o{}_{}.onnx".format(path_prefix, opt_level, "gpu" if use_gpu else "cpu")

    sess_options.optimized_model_filepath = optimized_model_path

    kwargs = {}
    if disabled_optimizers:
        kwargs["disabled_optimizers"] = disabled_optimizers

    # Creating the InferenceSession triggers graph optimization; since optimized_model_filepath is set in
    # sess_options, the optimized graph is serialized to optimized_model_path as a side effect.
    if not use_gpu:
        session = onnxruntime.InferenceSession(
            onnx_model_path, sess_options, providers=["CPUExecutionProvider"], **kwargs
        )
    else:
        gpu_ep = []

        if torch_version.cuda:
            gpu_ep.append("CUDAExecutionProvider")
        elif torch_version.hip:
            gpu_ep.append("MIGraphXExecutionProvider")
            gpu_ep.append("ROCMExecutionProvider")

        session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=gpu_ep, **kwargs)
        assert not set(onnxruntime.get_available_providers()).isdisjoint(
            ["CUDAExecutionProvider", "ROCMExecutionProvider", "MIGraphXExecutionProvider"]
        )

    assert os.path.exists(optimized_model_path) and os.path.isfile(optimized_model_path)
    logger.debug("Save optimized model by onnxruntime to %s", optimized_model_path)

    return optimized_model_path
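# Example (a sketch, kept as a comment so it does not run on import): calling optimize_by_onnxruntime
# directly writes the ONNX Runtime-optimized graph next to the input model and returns its path.
# "model.onnx" is a placeholder file name.
#
#   ort_optimized_path = optimize_by_onnxruntime("model.onnx", use_gpu=False, opt_level=1)
#   print(ort_optimized_path)  # the default naming pattern above gives "model_o1_cpu.onnx"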
def optimize_by_fusion(
    model: ModelProto,
    model_type: str = "bert",
    num_heads: int = 0,
    hidden_size: int = 0,
    optimization_options: Optional[FusionOptions] = None,
):
    """Optimize Model by graph fusion logic.

    Note that ONNXRuntime graph optimizations (like constant folding) will not be applied. So it is better to enable
    constant folding when exporting the ONNX model, or to run optimize_by_onnxruntime on the model first, as
    optimize_model does.

    For BERT models, num_heads and hidden_size are optional. For other model types, you need to specify these parameters.

    Args:
        model (ModelProto): model object
        model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'.
        num_heads (int, optional): number of attention heads. Defaults to 0.
                                   0 allows the parameter to be detected from the graph automatically.
        hidden_size (int, optional): hidden size. Defaults to 0.
                                     0 allows the parameter to be detected from the graph automatically.
        optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions.
                                                        Defaults to None.

    Returns:
        object of an optimizer class.
    """
    if model_type not in ["bert", "unet"] and (num_heads == 0 or hidden_size == 0):
        logger.warning(f"Please specify parameters of num_heads and hidden_size for model_type {model_type}")

    (optimizer_class, producer, _) = MODEL_TYPES[model_type]

    if model.producer_name and producer != model.producer_name:
        logger.warning(
            f'Model producer not matched: Expected "{producer}", Got "{model.producer_name}". '
            "Please specify correct --model_type parameter."
        )

    if optimization_options is None:
        optimization_options = FusionOptions(model_type)

    optimizer = optimizer_class(model, num_heads, hidden_size)

    optimizer.optimize(optimization_options)

    optimizer.topological_sort()

    optimizer.model.producer_name = "onnxruntime.transformers"
    from onnxruntime import __version__ as onnxruntime_version

    optimizer.model.producer_version = onnxruntime_version

    return optimizer
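# Example (a sketch, kept as a comment): running only the Python fusion pass on an already loaded ModelProto,
# using the load_model imported above. "model.onnx" is a placeholder file name.
#
#   bert_model = load_model("model.onnx")
#   fused = optimize_by_fusion(bert_model, model_type="bert", num_heads=0, hidden_size=0)
#   print(fused.get_fused_operator_statistics())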
def optimize_model(
    input: str,
    model_type: str = "bert",
    num_heads: int = 0,
    hidden_size: int = 0,
    optimization_options: Optional[FusionOptions] = None,
    opt_level: Optional[int] = None,
    use_gpu: bool = False,
    only_onnxruntime: bool = False,
):
    """Optimize Model by OnnxRuntime and/or python fusion logic.

    ONNX Runtime has graph optimizations (https://onnxruntime.ai/docs/resources/graph-optimizations.html).
    However, the coverage is limited. We also have graph fusions implemented in Python to improve the coverage.
    They can be combined: ONNX Runtime will run first when opt_level > 0, then the graph fusions in Python will be applied.

    To use ONNX Runtime only, without the Python fusion logic, use the only_onnxruntime flag and a positive opt_level like
        optimize_model(input, opt_level=1, use_gpu=False, only_onnxruntime=True)

    When opt_level is None, a default optimization level is chosen according to the model type.
    When opt_level is 0 and only_onnxruntime is False, only the Python fusion logic is used and onnxruntime is disabled.

    When opt_level > 1, use_gpu shall be set properly,
    since the optimized graph might contain operators for GPU or CPU only.

    If your model is intended for GPU inference only (especially a float16 or mixed precision model), it is recommended to
    set use_gpu to True; otherwise the model is not optimized for GPU inference.

    For BERT models, num_heads and hidden_size are optional. For other model types, you need to specify these parameters.

    Args:
        input (str): input model path.
        model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'.
        num_heads (int, optional): number of attention heads. Defaults to 0.
                                   0 allows the parameter to be detected from the graph automatically.
        hidden_size (int, optional): hidden size. Defaults to 0.
                                     0 allows the parameter to be detected from the graph automatically.
        optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions.
                                                        Defaults to None.
        opt_level (int, optional): onnxruntime graph optimization level (0, 1, 2 or 99) or None. Defaults to None.
                                   When the value is None, the default value (1 for bert and gpt2, 0 for other model types)
                                   will be used.
                                   When the level > 0, onnxruntime will be used to optimize the model first.
        use_gpu (bool, optional): use gpu or not for onnxruntime. Defaults to False.
        only_onnxruntime (bool, optional): only use onnxruntime to optimize the model, with no Python fusion.
                                           Defaults to False.

    Returns:
        object of an optimizer class.
    """
    assert opt_level is None or opt_level in [0, 1, 2, 99]

    (optimizer_class, _producer, default_opt_level) = MODEL_TYPES[model_type]

    if opt_level is None:
        opt_level = default_opt_level

    # Disable constant sharing to avoid model proto str mismatch in test. Ideally the optimizer should not
    # affect other fusions. We can update the expected model proto once the ConstantSharing optimizer logic becomes
    # stable.
    disabled_optimizers = ["ConstantSharing"]
    temp_model_path = None
    if opt_level > 1:
        # Disable some optimizers that might cause failure in symbolic shape inference or attention fusion.
        disabled_optimizers += (
            []
            if only_onnxruntime
            else [
                "MatMulScaleFusion",
                "MatMulAddFusion",
                "SimplifiedLayerNormFusion",
                "GemmActivationFusion",
                "BiasSoftmaxFusion",
            ]
        )
        temp_model_path = optimize_by_onnxruntime(
            input,
            use_gpu=use_gpu,
            opt_level=opt_level,
            disabled_optimizers=disabled_optimizers,
        )
    elif opt_level == 1:
        # Basic optimizations (like constant folding and cast elimination) are not specific to an execution provider.
        # The CPU provider is used here so that there is no extra node for GPU memory copy.
        temp_model_path = optimize_by_onnxruntime(
            input,
            use_gpu=False,
            opt_level=1,
            disabled_optimizers=disabled_optimizers,
        )

    if only_onnxruntime and not temp_model_path:
        logger.warning("Please specify a positive value for opt_level when only_onnxruntime is True")

    model = load_model(temp_model_path or input)

    if only_onnxruntime:
        optimizer = optimizer_class(model, num_heads, hidden_size)
    else:
        optimizer = optimize_by_fusion(model, model_type, num_heads, hidden_size, optimization_options)

    # Remove the temporary model.
    if temp_model_path:
        os.remove(temp_model_path)
        logger.debug("Remove temporary model: {}".format(temp_model_path))

    return optimizer
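# Example (a sketch, kept as a comment): the typical end-to-end path combines ONNX Runtime optimization with
# the Python fusions, optionally converts to float16 for GPU, then saves the result.
# "bert.onnx" and "bert_fp16.onnx" are placeholder file names.
#
#   optimizer = optimize_model("bert.onnx", model_type="bert", num_heads=12, hidden_size=768, opt_level=1)
#   optimizer.convert_float_to_float16(keep_io_types=True)  # optional, for GPU mixed precision inference
#   optimizer.save_model_to_file("bert_fp16.onnx", False)   # second argument: use external data format (>2GB models)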
def get_fusion_statistics(optimized_model_path: str) -> Dict[str, int]:
    """
    Get counter of fused operators in optimized model.

    Args:
        optimized_model_path (str): the path of onnx model.

    Returns:
        A dictionary with operator type as key, and count as value.
    """
    model = load_model(optimized_model_path, format=None, load_external_data=True)
    optimizer = BertOnnxModel(model)
    return optimizer.get_fused_operator_statistics()
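# Example (a sketch, kept as a comment): inspecting which fused operators ended up in an optimized model.
# "bert_fp16.onnx" is a placeholder file name.
#
#   for op_type, count in get_fusion_statistics("bert_fp16.onnx").items():
#       print(op_type, count)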
def _parse_arguments():
    parser = argparse.ArgumentParser(
        description="Graph optimization tool for ONNX Runtime. "
        "It transforms an ONNX graph to use optimized operators for Transformer models."
    )
    parser.add_argument("--input", required=True, type=str, help="input onnx model path")

    parser.add_argument("--output", required=True, type=str, help="optimized onnx model path")

    parser.add_argument(
        "--model_type",
        required=False,
        type=str.lower,
        default="bert",
        choices=list(MODEL_TYPES.keys()),
        help="Model type selected in the list: " + ", ".join(MODEL_TYPES.keys()),
    )

    parser.add_argument(
        "--num_heads",
        required=False,
        type=int,
        default=0,
        help="number of attention heads, like 12 for bert-base and 16 for bert-large. "
        "Default is 0 to detect automatically for BERT. "
        "For other model types, this parameter needs to be specified correctly.",
    )

    parser.add_argument(
        "--hidden_size",
        required=False,
        type=int,
        default=0,
        help="hidden size, like 768 for bert-base and 1024 for bert-large. "
        "Default is 0 to detect automatically for BERT. "
        "For other model types, this parameter needs to be specified correctly.",
    )

    parser.add_argument(
        "--input_int32",
        required=False,
        action="store_true",
        help="Use int32 (instead of int64) inputs. "
        "It could avoid unnecessary data casts when EmbedLayerNormalization is fused for BERT.",
    )
    parser.set_defaults(input_int32=False)

    parser.add_argument(
        "--float16",
        required=False,
        action="store_true",
        help="Convert all weights and nodes in float32 to float16. "
        "It has potential loss in precision compared to mixed precision conversion.",
    )
    parser.set_defaults(float16=False)

    FusionOptions.add_arguments(parser)

    parser.add_argument("--verbose", required=False, action="store_true", help="show debug information.")
    parser.set_defaults(verbose=False)

    parser.add_argument(
        "--use_gpu",
        required=False,
        action="store_true",
        help="Use GPU for inference. Set this flag if your model is intended for GPU when opt_level > 1.",
    )
    parser.set_defaults(use_gpu=False)

    parser.add_argument(
        "--only_onnxruntime",
        required=False,
        action="store_true",
        help="optimized by onnxruntime only, with no graph fusion in Python",
    )
    parser.set_defaults(only_onnxruntime=False)

    parser.add_argument(
        "--opt_level",
        required=False,
        type=int,
        choices=[0, 1, 2, 99],
        default=None,
        help="onnxruntime optimization level. 0 will disable onnxruntime graph optimization. "
        "The recommended value is 1. When opt_level > 1 is used, the optimized model for GPU might not run on CPU. "
        "Levels 2 and 99 are intended for --only_onnxruntime.",
    )

    parser.add_argument(
        "--use_external_data_format",
        required=False,
        action="store_true",
        help="use external data format to store large models (>2GB)",
    )
    parser.set_defaults(use_external_data_format=False)

    args = parser.parse_args()

    return args
def _setup_logger(verbose):
    if verbose:
        coloredlogs.install(
            level="DEBUG",
            fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s",
        )
    else:
        coloredlogs.install(fmt="%(funcName)20s: %(message)s")
def main():
    args = _parse_arguments()

    _setup_logger(args.verbose)

    logger.debug(f"arguments:{args}")

    if os.path.realpath(args.input) == os.path.realpath(args.output):
        logger.warning("Specified the same input and output path. Note that this may overwrite the original model")

    optimization_options = FusionOptions.parse(args)

    optimizer = optimize_model(
        args.input,
        args.model_type,
        args.num_heads,
        args.hidden_size,
        opt_level=args.opt_level,
        optimization_options=optimization_options,
        use_gpu=args.use_gpu,
        only_onnxruntime=args.only_onnxruntime,
    )

    if args.float16:
        optimizer.convert_float_to_float16(keep_io_types=True)

    if args.input_int32:
        optimizer.change_graph_inputs_to_int32()

    optimizer.save_model_to_file(args.output, args.use_external_data_format)

    if optimizer.is_fully_optimized():
        logger.info("The model has been fully optimized.")
    else:
        logger.info("The model has been optimized.")


if __name__ == "__main__":
    main()