m2m model translation

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

# This tool measures the inference performance of the onnxruntime or onnxruntime-gpu
# python package on a BERT model. The input model shall have exactly three inputs.
# The model is either fully optimized (with an EmbedLayerNormalization node), or has
# reasonable input names (one input name contains the 'mask' substring, another a
# 'token' or 'segment' substring). See the get_bert_inputs function in
# bert_test_data.py for more information.

# Example command to run the test on batch sizes 1 and 2 for a model on GPU:
#   python bert_perf_test.py --model bert.onnx --batch_size 1 2 --sequence_length 128 --use_gpu --samples 1000 --test_times 1
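
# Omitting --use_gpu and --intra_op_num_threads runs on CPU and sweeps several
# intra-op thread counts (see run_perf_tests below), one result row per setting.
# Example (the model path here is illustrative):
#   python bert_perf_test.py --model bert.onnx --batch_size 1 --sequence_length 128 --samples 100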

import argparse
import csv
import multiprocessing
import os
import random
import statistics
import timeit
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

import numpy as np
import psutil
import torch
from bert_test_data import generate_test_data, get_bert_inputs


@dataclass
class TestSetting:
    batch_size: int
    sequence_length: int
    test_cases: int
    test_times: int
    use_gpu: bool
    use_io_binding: bool
    provider: str
    intra_op_num_threads: int
    seed: int
    verbose: bool
    log_severity: int


@dataclass
class ModelSetting:
    model_path: str
    input_ids_name: str
    segment_ids_name: str
    input_mask_name: str
    opt_level: int


def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_optimization_level=None, log_severity=2):
    import onnxruntime

    onnxruntime.set_default_logger_severity(log_severity)

    if use_gpu and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers()):
        print(
            "Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )

    if use_gpu:
        if provider == "dml":
            execution_providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
        elif provider == "rocm":
            execution_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
        elif provider == "migraphx":
            execution_providers = [
                "MIGraphXExecutionProvider",
                "ROCMExecutionProvider",
                "CPUExecutionProvider",
            ]
        elif provider == "cuda":
            execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif provider == "tensorrt":
            execution_providers = [
                "TensorrtExecutionProvider",
                "CUDAExecutionProvider",
                "CPUExecutionProvider",
            ]
        else:
            execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    else:
        execution_providers = ["CPUExecutionProvider"]

    sess_options = onnxruntime.SessionOptions()
    sess_options.log_severity_level = log_severity
    sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL

    if graph_optimization_level is None:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    elif graph_optimization_level == 0:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
    elif graph_optimization_level == 1:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
    elif graph_optimization_level == 2:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    elif graph_optimization_level == 99:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    else:
        sess_options.graph_optimization_level = graph_optimization_level

    if intra_op_num_threads is not None:
        sess_options.intra_op_num_threads = intra_op_num_threads

    session = onnxruntime.InferenceSession(model_path, sess_options, providers=execution_providers)

    if use_gpu:
        if provider == "dml":
            assert "DmlExecutionProvider" in session.get_providers()
        elif provider == "rocm":
            assert "ROCMExecutionProvider" in session.get_providers()
        elif provider == "migraphx":
            assert "MIGraphXExecutionProvider" in session.get_providers()
            assert "ROCMExecutionProvider" in session.get_providers()
        elif provider == "cuda":
            assert "CUDAExecutionProvider" in session.get_providers()
        elif provider == "tensorrt":
            assert "TensorrtExecutionProvider" in session.get_providers()
            assert "CUDAExecutionProvider" in session.get_providers()
        else:
            assert "CUDAExecutionProvider" in session.get_providers()
    else:
        assert "CPUExecutionProvider" in session.get_providers()

    return session
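
# Minimal sketch of calling create_session directly (the model path and thread
# count are illustrative, not taken from this script's CLI):
#   session = create_session("bert.onnx", use_gpu=False, provider=None, intra_op_num_threads=1)
#   print(session.get_providers())  # expect ["CPUExecutionProvider"]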


def numpy_type(torch_type):
    type_map = {
        torch.float32: np.float32,
        torch.float16: np.float16,
        torch.int32: np.int32,
        torch.int64: np.longlong,
    }
    return type_map[torch_type]


def create_input_output_tensors(inputs, outputs, device):
    input_tensors = {name: torch.from_numpy(array).to(device) for name, array in inputs.items()}
    output_tensors = {name: torch.from_numpy(array).to(device) for name, array in outputs.items()}
    return input_tensors, output_tensors


def create_io_binding(sess, input_tensors, output_tensors):
    io_binding = sess.io_binding()
    for name, tensor in input_tensors.items():
        io_binding.bind_input(
            name,
            tensor.device.type,
            0,
            numpy_type(tensor.dtype),
            tensor.shape,
            tensor.data_ptr(),
        )
    for name, tensor in output_tensors.items():
        io_binding.bind_output(
            name,
            tensor.device.type,
            0,
            numpy_type(tensor.dtype),
            tensor.shape,
            tensor.data_ptr(),
        )
    return io_binding
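
# Sketch of the intended flow (hypothetical inputs/outputs dicts of numpy arrays;
# the real flow is in onnxruntime_inference_with_io_binding below):
#   input_tensors, output_tensors = create_input_output_tensors(inputs, outputs, "cpu")
#   io_binding = create_io_binding(session, input_tensors, output_tensors)
#   session.run_with_iobinding(io_binding)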


def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting):
    results = []
    latency_list = []
    device = "cuda" if test_setting.use_gpu else "cpu"
    for test_case_id, inputs in enumerate(all_inputs):
        result = session.run(output_names, inputs)
        results.append(result)

        outputs = {}
        for i in range(len(output_names)):
            outputs[output_names[i]] = result[i]

        input_tensors, output_tensors = create_input_output_tensors(inputs, outputs, device)
        io_binding = create_io_binding(session, input_tensors, output_tensors)

        # warm up once
        session.run_with_iobinding(io_binding)

        start_time = timeit.default_timer()
        session.run_with_iobinding(io_binding)
        latency = timeit.default_timer() - start_time
        latency_list.append(latency)

    return results, latency_list


def onnxruntime_inference(session, all_inputs, output_names):
    if len(all_inputs) > 0:
        # Use a random input as warm up.
        session.run(output_names, random.choice(all_inputs))

    results = []
    latency_list = []
    for test_case_id, inputs in enumerate(all_inputs):
        start_time = timeit.default_timer()
        result = session.run(output_names, inputs)
        latency = timeit.default_timer() - start_time
        results.append(result)
        latency_list.append(latency)

    return results, latency_list
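
# Note: in both inference paths above, each latency sample times exactly one run of
# the session, and the warm-up run is excluded from latency_list.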


def to_string(model_path, session, test_setting):
    sess_options = session.get_session_options()
    option = "model={},".format(os.path.basename(model_path))
    option += "graph_optimization_level={},intra_op_num_threads={},".format(
        sess_options.graph_optimization_level, sess_options.intra_op_num_threads
    ).replace("GraphOptimizationLevel.ORT_", "")
    option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}"
    return option


def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
    session = create_session(
        model_setting.model_path,
        test_setting.use_gpu,
        test_setting.provider,
        intra_op_num_threads,
        model_setting.opt_level,
        log_severity=test_setting.log_severity,
    )
    output_names = [output.name for output in session.get_outputs()]

    key = to_string(model_setting.model_path, session, test_setting)
    if key in perf_results:
        print("skip duplicated test:", key)
        return

    print("Running test:", key)
    all_latency_list = []
    if test_setting.use_io_binding:
        for _ in range(test_setting.test_times):
            results, latency_list = onnxruntime_inference_with_io_binding(
                session, all_inputs, output_names, test_setting
            )
            all_latency_list.extend(latency_list)
    else:
        for _ in range(test_setting.test_times):
            results, latency_list = onnxruntime_inference(session, all_inputs, output_names)
            all_latency_list.extend(latency_list)

    # latency in milliseconds
    latency_ms = np.array(all_latency_list) * 1000

    average_latency = statistics.mean(latency_ms)
    latency_50 = np.percentile(latency_ms, 50)
    latency_75 = np.percentile(latency_ms, 75)
    latency_90 = np.percentile(latency_ms, 90)
    latency_95 = np.percentile(latency_ms, 95)
    latency_99 = np.percentile(latency_ms, 99)
    throughput = test_setting.batch_size * (1000.0 / average_latency)

    perf_results[key] = (
        average_latency,
        latency_50,
        latency_75,
        latency_90,
        latency_95,
        latency_99,
        throughput,
    )

    print(
        "Average latency = {} ms, Throughput = {} QPS".format(format(average_latency, ".2f"), format(throughput, ".2f"))
    )


def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
    process = multiprocessing.Process(
        target=run_one_test,
        args=(
            model_setting,
            test_setting,
            perf_results,
            all_inputs,
            intra_op_num_threads,
        ),
    )
    process.start()
    process.join()
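
# Each configuration runs in its own child process, so onnxruntime state (thread
# pools, allocators) cannot carry over between tests; results flow back through
# the shared perf_results dict created by multiprocessing.Manager in main().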


def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
    if test_setting.intra_op_num_threads is not None:
        launch_test(
            model_setting,
            test_setting,
            perf_results,
            all_inputs,
            test_setting.intra_op_num_threads,
        )
        return

    cpu_count = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)

    candidate_threads = list(set([logical_cores, cpu_count]))
    for i in range(1, min(16, logical_cores)):
        if i not in candidate_threads:
            candidate_threads.append(i)
    candidate_threads.sort(reverse=True)

    for intra_op_num_threads in candidate_threads:
        launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads)
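
# Worked example of the sweep above, assuming 4 physical / 8 logical cores:
# candidate_threads starts as {8, 4}, the loop adds 1-3 and 5-7, and after the
# reverse sort the tests run with intra_op_num_threads = 8, 7, 6, 5, 4, 3, 2, 1.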


def run_performance(model_setting, test_setting, perf_results):
    input_ids, segment_ids, input_mask = get_bert_inputs(
        model_setting.model_path,
        model_setting.input_ids_name,
        model_setting.segment_ids_name,
        model_setting.input_mask_name,
    )

    # Do not generate random mask for performance test.
    print(
        f"Generating {test_setting.test_cases} samples for batch_size={test_setting.batch_size} sequence_length={test_setting.sequence_length}"
    )
    all_inputs = generate_test_data(
        test_setting.batch_size,
        test_setting.sequence_length,
        test_setting.test_cases,
        test_setting.seed,
        test_setting.verbose,
        input_ids,
        segment_ids,
        input_mask,
        random_mask_length=False,
    )

    run_perf_tests(model_setting, test_setting, perf_results, all_inputs)


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True, type=str, help="bert onnx model path")

    parser.add_argument(
        "-b",
        "--batch_size",
        required=True,
        type=int,
        nargs="+",
        help="batch size of input. Allow one or multiple values in the range of [1, 128].",
    )

    parser.add_argument(
        "-s",
        "--sequence_length",
        required=True,
        type=int,
        help="maximum sequence length of input",
    )

    parser.add_argument(
        "--samples",
        required=False,
        type=int,
        default=10,
        help="number of samples to be generated",
    )

    parser.add_argument(
        "-t",
        "--test_times",
        required=False,
        type=int,
        default=0,
        help="number of times to run per sample. By default, the value is 1000 / samples",
    )

    parser.add_argument(
        "--opt_level",
        required=False,
        type=int,
        choices=[0, 1, 2, 99],
        default=99,
        help="onnxruntime optimization level: 0 - disable all, 1 - basic, 2 - extended, 99 - enable all.",
    )

    parser.add_argument(
        "--seed",
        required=False,
        type=int,
        default=3,
        help="random seed. Use the same seed to make sure test data is same in multiple tests.",
    )

    parser.add_argument(
        "--verbose",
        required=False,
        action="store_true",
        help="print verbose information",
    )
    parser.set_defaults(verbose=False)

    parser.add_argument(
        "--log_severity",
        required=False,
        type=int,
        default=2,
        choices=[0, 1, 2, 3, 4],
        help="0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal",
    )

    parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU")
    parser.set_defaults(use_gpu=False)

    parser.add_argument("--use_io_binding", required=False, action="store_true", help="use io_binding")
    parser.set_defaults(use_io_binding=False)

    parser.add_argument(
        "--provider",
        required=False,
        type=str,
        default=None,
        help="Execution provider to use",
    )

    parser.add_argument(
        "-n",
        "--intra_op_num_threads",
        required=False,
        type=int,
        default=None,
        help=">=0, set intra_op_num_threads",
    )

    parser.add_argument(
        "--input_ids_name",
        required=False,
        type=str,
        default=None,
        help="input name for input ids",
    )

    parser.add_argument(
        "--segment_ids_name",
        required=False,
        type=str,
        default=None,
        help="input name for segment ids",
    )

    parser.add_argument(
        "--input_mask_name",
        required=False,
        type=str,
        default=None,
        help="input name for attention mask",
    )

    args = parser.parse_args()
    return args


def main():
    args = parse_arguments()

    if args.test_times == 0:
        args.test_times = max(1, int(1000 / args.samples))

    manager = multiprocessing.Manager()
    perf_results = manager.dict()

    batch_size_set = set(args.batch_size)
    if not (min(batch_size_set) >= 1 and max(batch_size_set) <= 128):
        raise Exception("batch_size not in range [1, 128]")

    model_setting = ModelSetting(
        args.model,
        args.input_ids_name,
        args.segment_ids_name,
        args.input_mask_name,
        args.opt_level,
    )

    for batch_size in batch_size_set:
        test_setting = TestSetting(
            batch_size,
            args.sequence_length,
            args.samples,
            args.test_times,
            args.use_gpu,
            args.use_io_binding,
            args.provider,
            args.intra_op_num_threads,
            args.seed,
            args.verbose,
            args.log_severity,
        )

        print("test setting", test_setting)
        run_performance(model_setting, test_setting, perf_results)

    # Sort the results so that the first one has smallest latency.
    sorted_results = sorted(perf_results.items(), reverse=False, key=lambda x: x[1])

    summary_file = os.path.join(
        Path(args.model).parent,
        "perf_results_{}_B{}_S{}_{}.txt".format(
            "GPU" if args.use_gpu else "CPU",
            "-".join([str(x) for x in sorted(list(batch_size_set))]),
            args.sequence_length,
            datetime.now().strftime("%Y%m%d-%H%M%S"),
        ),
    )
    with open(summary_file, "w+", newline="") as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n")
        headers = None
        for key, perf_result in sorted_results:
            params = key.split(",")
            if headers is None:
                headers = [
                    "Latency(ms)",
                    "Latency_P50",
                    "Latency_P75",
                    "Latency_P90",
                    "Latency_P95",
                    "Latency_P99",
                    "Throughput(QPS)",
                ]
                headers.extend([x.split("=")[0] for x in params])
                tsv_writer.writerow(headers)

            values = [format(x, ".2f") for x in perf_result]
            values.extend([x.split("=")[1] for x in params])
            tsv_writer.writerow(values)

    print("Test summary is saved to", summary_file)
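
# For example, a GPU run with --batch_size 1 2 --sequence_length 128 writes a file
# named like perf_results_GPU_B1-2_S128_20240101-123456.txt next to the model
# (the trailing timestamp varies).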


if __name__ == "__main__":
    # work around for AnaConda Jupyter. See https://stackoverflow.com/questions/45720153/python-multiprocessing-error-attributeerror-module-main-has-no-attribute
    __spec__ = None
    main()