m2m model translation
# Copyright (c) Microsoft Corporation. All rights reserved.
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Benchmarking the inference of pretrained transformer models.

The PyTorch/TorchScript benchmark is based on
https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
One difference is that random input_ids are generated in this benchmark.

For onnxruntime, this script converts a pretrained model to ONNX and optimizes it when the -o parameter is used.

Example commands:
    Export all models to ONNX, optimize and validate them:
        python benchmark.py -b 0 -o -v -i 1 2 3
    Run OnnxRuntime on GPU for all models:
        python benchmark.py -g
    Run OnnxRuntime on GPU for all models with fp32 optimization:
        python benchmark.py -g -o
    Run OnnxRuntime on GPU with fp16 optimization:
        python benchmark.py -g -o -p "fp16"
    Run TorchScript on GPU for all models:
        python benchmark.py -e torchscript -g
    Run TorchScript on GPU for all models with fp16:
        python benchmark.py -e torchscript -g -p "fp16"
    Run ONNXRuntime and TorchScript on CPU for all models with quantization:
        python benchmark.py -e torchscript onnxruntime -p "int8" -o
    Run OnnxRuntime with the ROCM provider and graph optimization script:
        python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm

It is recommended to use run_benchmark.sh to launch the benchmark.
"""
import argparse
import logging
import os
import timeit
from datetime import datetime
from enum import Enum

import numpy
import onnx
import psutil
from benchmark_helper import (
    ConfigModifier,
    OptimizerInfo,
    Precision,
    allocateOutputBuffers,
    create_onnxruntime_session,
    get_latency_result,
    inference_ort,
    inference_ort_with_io_binding,
    output_details,
    output_fusion_statistics,
    output_summary,
    setup_logger,
)
from fusion_options import FusionOptions
from onnx_exporter import (
    create_onnxruntime_input,
    export_onnx_model_from_pt,
    export_onnx_model_from_tf,
    load_pretrained_model,
)
from packaging import version
from quantize_helper import QuantizeHelper

logger = logging.getLogger("")

from huggingface_models import MODEL_CLASSES, MODELS

cpu_count = psutil.cpu_count(logical=False)

# Set OMP environment variable before importing onnxruntime or torch.
if "OMP_NUM_THREADS" not in os.environ:
    os.environ["OMP_NUM_THREADS"] = str(cpu_count)

import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer, GPT2Model, LxmertConfig
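
# Benchmark ONNX Runtime: export each model to ONNX (optionally optimized), create an
# inference session, and measure latency with or without I/O binding.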
def run_onnxruntime(
    use_gpu,
    provider,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    input_counts,
    optimizer_info,
    validate_onnx,
    cache_dir,
    onnx_dir,
    verbose,
    overwrite,
    disable_ort_io_binding,
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
    args,
):
    import onnxruntime

    results = []
    if (
        use_gpu
        and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
        and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())
        and ("DmlExecutionProvider" not in onnxruntime.get_available_providers())
    ):
        logger.error(
            "Please install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    warm_up_repeat = 0
    if provider == "tensorrt":
        optimizer_info = OptimizerInfo.NOOPT
        warm_up_repeat = 5
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )
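
    # For each model and input count: export to ONNX, build an ORT session, then benchmark
    # every requested (batch_size, sequence_length) combination.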
    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]
            args.model_type = MODELS[model_name][3]
            fusion_options = FusionOptions.parse(args)

            if "pt" in model_source:
                with torch.no_grad():
                    (
                        onnx_model_file,
                        is_valid_onnx_model,
                        vocab_size,
                        max_sequence_length,
                    ) = export_onnx_model_from_pt(
                        model_name,
                        MODELS[model_name][1],
                        MODELS[model_name][2],
                        MODELS[model_name][3],
                        model_class,
                        config_modifier,
                        cache_dir,
                        onnx_dir,
                        input_names,
                        use_gpu,
                        precision,
                        optimizer_info,
                        validate_onnx,
                        use_raw_attention_mask,
                        overwrite,
                        model_fusion_statistics,
                        fusion_options,
                    )
            if "tf" in model_source:
                (
                    onnx_model_file,
                    is_valid_onnx_model,
                    vocab_size,
                    max_sequence_length,
                ) = export_onnx_model_from_tf(
                    model_name,
                    MODELS[model_name][1],
                    MODELS[model_name][2],
                    MODELS[model_name][3],
                    model_class,
                    config_modifier,
                    cache_dir,
                    onnx_dir,
                    input_names,
                    use_gpu,
                    precision,
                    optimizer_info,
                    validate_onnx,
                    use_raw_attention_mask,
                    overwrite,
                    model_fusion_statistics,
                    fusion_options,
                )

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
            )
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            output_buffers = []
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
            max_last_state_size = numpy.prod(
                [
                    max(batch_sizes),
                    max(sequence_lengths),
                    max(vocab_size, config.hidden_size),
                ]
            )
            max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])
            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue

                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size,
                        batch_size,
                        sequence_length,
                        input_names,
                        config,
                        input_value_type,
                    )
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    logger.info(
                        "Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, sequence_length])
                    )
                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            batch_size,
                            warm_up_repeat,
                        )
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)
                        output_buffer_max_sizes = [max_last_state_size]
                        for i in range(len(ort_outputs)):
                            if i == 2 and MODELS[model_name][3] == "gpt":
                                # past state output max size
                                output_buffer_max_sizes.append(max_pooler_size)
                            else:
                                output_buffer_max_sizes.append(max_last_state_size)

                        data_type = numpy.longlong if "pt" in model_source else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            ort_output_names,
                            ort_outputs,
                            output_buffers,
                            output_buffer_max_sizes,
                            batch_size,
                            device,
                            data_type,
                            warm_up_repeat,
                        )

                    logger.info(result)
                    results.append(result)

    return results
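
# Benchmark PyTorch eager mode, torch.compile (torch2), or TorchScript on random input_ids.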
def run_pytorch(
    use_gpu,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    torchscript,
    torch2,
    cache_dir,
    verbose,
):
    results = []
    if use_gpu and not torch.cuda.is_available():
        logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
        return results

    torch.set_grad_enabled(False)

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir)
        config_modifier.modify(config)

        model = load_pretrained_model(
            model_name,
            config=config,
            cache_dir=cache_dir,
            custom_model_class=model_class,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

        max_input_size = (
            tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024
        )

        logger.debug(f"Model {model}")
        logger.debug(f"Number of parameters {model.num_parameters()}")

        if precision == Precision.FLOAT16:
            model.half()

        device = torch.device("cuda:0" if use_gpu else "cpu")
        model.to(device)

        if precision == Precision.INT8:
            model = QuantizeHelper.quantize_torch_model(model)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length]))
                input_ids = torch.randint(
                    low=0,
                    high=config.vocab_size - 1,
                    size=(batch_size, sequence_length),
                    dtype=torch.long,
                    device=device,
                )
                try:
                    inference = (
                        torch.jit.trace(model, input_ids) if torchscript else torch.compile(model) if torch2 else model
                    )
                    inference(input_ids)

                    runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1)

                    result = {
                        "engine": "torchscript" if torchscript else "torch2" if torch2 else "torch",
                        "version": torch.__version__,
                        "providers": "NA",
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    torch.cuda.empty_cache()

    return results
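
# Decorator factory that runs the wrapped forward function either eagerly or as a tf.function
# (optionally XLA-compiled); the TensorFlow benchmark below uses graph mode without XLA.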
def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
    from functools import wraps

    import tensorflow as tf

    def run_func(func):
        @wraps(func)
        def run_in_eager_mode(*args, **kwargs):
            return func(*args, **kwargs)

        @wraps(func)
        @tf.function(experimental_compile=use_xla)
        def run_in_graph_mode(*args, **kwargs):
            return func(*args, **kwargs)

        if do_eager_mode is True:
            assert (
                use_xla is False
            ), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
            return run_in_eager_mode
        else:
            return run_in_graph_mode

    return run_func
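
# Benchmark TensorFlow models on random input_ids, selecting the forward signature based on
# whether the model is an encoder, an encoder-decoder, or LXMERT.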
def run_tensorflow(
    use_gpu,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    cache_dir,
    verbose,
):
    results = []

    import tensorflow as tf

    tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if not use_gpu:
        tf.config.set_visible_devices([], "GPU")

    if use_gpu and not tf.test.is_built_with_cuda():
        logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
        return results

    if use_gpu:  # Restrict TensorFlow to only use the first GPU
        physical_devices = tf.config.list_physical_devices("GPU")
        try:
            tf.config.set_visible_devices(physical_devices[0], "GPU")
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            tf.distribute.OneDeviceStrategy(device="/gpu:0")
        except RuntimeError as e:
            logger.exception(e)

    if precision == Precision.FLOAT16 or precision == Precision.INT8:
        raise NotImplementedError("Mixed precision is currently not supported.")

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
        config_modifier.modify(config)

        model = load_pretrained_model(
            model_name,
            config=config,
            cache_dir=cache_dir,
            custom_model_class=model_class,
            is_tf_model=True,
        )

        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

        max_input_size = (
            tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024
        )

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info(
                    "Run Tensorflow on {} with input shape {}".format(model_name, [batch_size, sequence_length])
                )

                import random

                rng = random.Random()
                values = [rng.randint(0, config.vocab_size - 1) for i in range(batch_size * sequence_length)]
                input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)

                try:
                    # Disable both for better inference perf
                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_forward():
                        return model(input_ids, training=False)

                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_decoder_forward():
                        return model(input_ids, decoder_input_ids=input_ids, training=False)

                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def lxmert_forward():
                        feats = tf.random.normal([1, 1, config.visual_feat_dim])
                        pos = tf.random.normal([1, 1, config.visual_pos_dim])
                        return model(
                            input_ids,
                            visual_feats=feats,
                            visual_pos=pos,
                            training=False,
                        )

                    inference = encoder_forward
                    if config.is_encoder_decoder:
                        inference = encoder_decoder_forward
                    elif isinstance(config, LxmertConfig):
                        inference = lxmert_forward

                    inference()
                    runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1)

                    result = {
                        "engine": "tensorflow",
                        "version": tf.__version__,
                        "providers": "NA",
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    from numba import cuda

                    device = cuda.get_current_device()
                    device.reset()

    return results
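
# Command-line arguments for model selection, engines, precision, input shapes and output CSV files.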
def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-m",
        "--models",
        required=False,
        nargs="+",
        type=str,
        default=["bert-base-cased", "roberta-base", "gpt2"],
        choices=list(MODELS.keys()),
        help="Pre-trained models in the list: " + ", ".join(MODELS.keys()),
    )

    parser.add_argument(
        "--model_source",
        required=False,
        nargs=1,
        type=str,
        default="pt",
        choices=["pt", "tf"],
        help="Export onnx from pt or tf",
    )

    parser.add_argument(
        "--model_class",
        required=False,
        type=str,
        default=None,
        choices=list(MODEL_CLASSES),
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
    )

    parser.add_argument(
        "-e",
        "--engines",
        required=False,
        nargs="+",
        type=str,
        default=["onnxruntime"],
        choices=["onnxruntime", "torch", "torch2", "torchscript", "tensorflow"],
        help="Engines to benchmark",
    )

    parser.add_argument(
        "-c",
        "--cache_dir",
        required=False,
        type=str,
        default=os.path.join(".", "cache_models"),
        help="Directory to cache pre-trained models",
    )

    parser.add_argument(
        "--onnx_dir",
        required=False,
        type=str,
        default=os.path.join(".", "onnx_models"),
        help="Directory to store onnx models",
    )

    parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")

    parser.add_argument(
        "--provider",
        required=False,
        type=str,
        default=None,
        help="Execution provider to use",
    )

    parser.add_argument(
        "-p",
        "--precision",
        type=Precision,
        default=Precision.FLOAT32,
        choices=list(Precision),
        help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
    )

    parser.add_argument("--verbose", required=False, action="store_true", help="Print more information")

    parser.add_argument(
        "--overwrite",
        required=False,
        action="store_true",
        help="Overwrite existing models",
    )

    parser.add_argument(
        "-o",
        "--optimizer_info",
        type=OptimizerInfo,
        default=OptimizerInfo.BYSCRIPT,
        choices=list(OptimizerInfo),
        help="Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt",
    )

    parser.add_argument(
        "-v",
        "--validate_onnx",
        required=False,
        action="store_true",
        help="Validate ONNX model",
    )

    parser.add_argument(
        "-f",
        "--fusion_csv",
        required=False,
        default=None,
        help="CSV file for saving summary results of graph optimization.",
    )

    parser.add_argument(
        "-d",
        "--detail_csv",
        required=False,
        default=None,
        help="CSV file for saving detail results.",
    )

    parser.add_argument(
        "-r",
        "--result_csv",
        required=False,
        default=None,
        help="CSV file for saving summary results.",
    )

    parser.add_argument(
        "-i",
        "--input_counts",
        required=False,
        nargs="+",
        default=[1],
        type=int,
        choices=[1, 2, 3],
        help="Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.",
    )

    parser.add_argument(
        "-t",
        "--test_times",
        required=False,
        default=100,
        type=int,
        help="Number of repeat times to get average inference latency.",
    )

    parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1])

    parser.add_argument(
        "-s",
        "--sequence_lengths",
        nargs="+",
        type=int,
        default=[4, 8, 16, 32, 64, 128, 256],
    )

    parser.add_argument(
        "--disable_ort_io_binding",
        required=False,
        action="store_true",
        help="Disable running ONNX Runtime with bound inputs and outputs.",
    )
    parser.set_defaults(disable_ort_io_binding=False)

    parser.add_argument(
        "-n",
        "--num_threads",
        required=False,
        nargs="+",
        type=int,
        default=[0],
        help="Threads to use",
    )

    parser.add_argument(
        "--force_num_layers",
        required=False,
        type=int,
        default=None,
        help="Manually set the model's layer number",
    )

    FusionOptions.add_arguments(parser)

    args = parser.parse_args()
    return args
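
# Entry point: validate arguments, run the selected engines, then write CSV results.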
def main():
    args = parse_arguments()

    setup_logger(args.verbose)

    if args.precision == Precision.FLOAT16 and not args.use_gpu:
        logger.error("fp16 is for GPU only")
        return

    if args.precision == Precision.INT8 and args.use_gpu:
        logger.error("int8 is for CPU only")
        return

    args.num_threads = sorted(set(cpu_count if x <= 0 else x for x in args.num_threads))

    logger.info(f"Arguments: {args}")

    if not os.path.exists(args.cache_dir):
        try:
            os.mkdir(args.cache_dir)
        except OSError:
            logger.error("Creation of the directory %s failed" % args.cache_dir)

    enable_torch = "torch" in args.engines
    enable_torch2 = "torch2" in args.engines
    enable_torchscript = "torchscript" in args.engines
    enable_onnxruntime = "onnxruntime" in args.engines
    enable_tensorflow = "tensorflow" in args.engines

    if enable_torch2 and version.parse(torch.__version__) < version.parse("2.0.0"):
        logger.error(f"PyTorch version must be >=2.0.0 and you are using {torch.__version__}")
        return

    config_modifier = ConfigModifier(args.force_num_layers)

    results = []
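
    # Benchmark each selected engine once per requested thread count.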
    for num_threads in args.num_threads:
        torch.set_num_threads(num_threads)
        logger.debug(torch.__config__.parallel_info())
        if enable_torch or enable_torch2 or enable_torchscript:
            if args.input_counts != [1]:
                logger.warning("--input_counts is not implemented for torch or torchscript engine.")

            if enable_torchscript:
                results += run_pytorch(
                    args.use_gpu,
                    args.models,
                    args.model_class,
                    config_modifier,
                    args.precision,
                    num_threads,
                    args.batch_sizes,
                    args.sequence_lengths,
                    args.test_times,
                    True,
                    False,
                    args.cache_dir,
                    args.verbose,
                )

            if enable_torch:
                results += run_pytorch(
                    args.use_gpu,
                    args.models,
                    args.model_class,
                    config_modifier,
                    args.precision,
                    num_threads,
                    args.batch_sizes,
                    args.sequence_lengths,
                    args.test_times,
                    False,
                    False,
                    args.cache_dir,
                    args.verbose,
                )

            if enable_torch2:
                results += run_pytorch(
                    args.use_gpu,
                    args.models,
                    args.model_class,
                    config_modifier,
                    args.precision,
                    num_threads,
                    args.batch_sizes,
                    args.sequence_lengths,
                    args.test_times,
                    False,
                    True,
                    args.cache_dir,
                    args.verbose,
                )

        if enable_tensorflow:
            results += run_tensorflow(
                args.use_gpu,
                args.models,
                args.model_class,
                config_modifier,
                args.precision,
                num_threads,
                args.batch_sizes,
                args.sequence_lengths,
                args.test_times,
                args.cache_dir,
                args.verbose,
            )

        model_fusion_statistics = {}
        if enable_onnxruntime:
            try:
                use_raw_attention_mask = not args.use_mask_index
                results += run_onnxruntime(
                    args.use_gpu,
                    args.provider,
                    args.models,
                    args.model_class,
                    config_modifier,
                    args.precision,
                    num_threads,
                    args.batch_sizes,
                    args.sequence_lengths,
                    args.test_times,
                    args.input_counts,
                    args.optimizer_info,
                    args.validate_onnx,
                    args.cache_dir,
                    args.onnx_dir,
                    args.verbose,
                    args.overwrite,
                    args.disable_ort_io_binding,
                    use_raw_attention_mask,
                    model_fusion_statistics,
                    args.model_source,
                    args,
                )
            except Exception:
                logger.error("Exception", exc_info=True)

    time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    if model_fusion_statistics:
        csv_filename = args.fusion_csv or f"benchmark_fusion_{time_stamp}.csv"
        output_fusion_statistics(model_fusion_statistics, csv_filename)

    if len(results) == 0:
        if args.batch_sizes != [0]:
            logger.warning("No result is available.")
        return

    csv_filename = args.detail_csv or f"benchmark_detail_{time_stamp}.csv"
    output_details(results, csv_filename)

    csv_filename = args.result_csv or f"benchmark_summary_{time_stamp}.csv"
    output_summary(results, csv_filename, args)


if __name__ == "__main__":
    main()