import argparse
import json
import os

import numpy
import psutil
from onnx import TensorProto

"""
This profiler tool can run a transformer model and print out the kernel time spent on each Node of the model.

Example of profiling a longformer model:
    python profiler.py --model longformer-base-4096_fp32.onnx --batch_size 1 --sequence_length 4096 --global_length 8 --samples 1000 --thread_num 8 --dummy_inputs longformer --use_gpu

Example of importing a profile result file from onnxruntime_perf_test:
    python profiler.py --input profile_2021-10-25_12-02-41.json
"""

NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
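# The op types above ("Scan", "Loop", "If") run nested subgraphs whose child nodes are
# profiled as separate entries, so the parsers below skip the parent entries to avoid
# counting the same time twice (see also the TODO in group_node_results).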


def parse_arguments(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i",
        "--input",
        required=False,
        type=str,
        help="Set the input file for reading the profile results",
    )
    parser.add_argument(
        "-m",
        "--model",
        required=False,
        type=str,
        help="onnx model path to run profiling. Required when --input is not specified.",
    )
    parser.add_argument(
        "-b",
        "--batch_size",
        required=False,
        type=int,
        default=1,
        help="batch size of input",
    )
    parser.add_argument(
        "-s",
        "--sequence_length",
        required=False,
        type=int,
        default=32,
        help="sequence length of input",
    )
    parser.add_argument(
        "--past_sequence_length",
        required=False,
        type=int,
        default=1,
        help="past sequence length for gpt2",
    )
    parser.add_argument(
        "--global_length",
        required=False,
        type=int,
        default=1,
        help="number of global tokens for longformer",
    )
    parser.add_argument(
        "--samples",
        required=False,
        type=int,
        default=1000,
        help="number of samples to test. Set it large enough to reduce the variance of performance result.",
    )
    parser.add_argument(
        "--threshold",
        required=False,
        type=float,
        default=0.01,
        help="Threshold of run time ratio among all nodes. Nodes with a larger ratio will show in the top expensive nodes.",
    )
    parser.add_argument(
        "--thread_num",
        required=False,
        type=int,
        default=-1,
        help="number of threads to use",
    )
    parser.add_argument(
        "--input_ids_name",
        required=False,
        type=str,
        default=None,
        help="input name for input IDs, for bert",
    )
    parser.add_argument(
        "--segment_ids_name",
        required=False,
        type=str,
        default=None,
        help="input name for segment IDs, for bert",
    )
    parser.add_argument(
        "--input_mask_name",
        required=False,
        type=str,
        default=None,
        help="input name for attention mask, for bert",
    )
    parser.add_argument(
        "--dummy_inputs",
        required=False,
        default="default",
        choices=["bert", "gpt2", "longformer", "default"],
        help="Type of model inputs. The default will create dummy inputs with ones.",
    )
    parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="use GPU")
    parser.set_defaults(use_gpu=False)
    parser.add_argument(
        "--provider",
        required=False,
        type=str,
        default="cuda",
        help="Execution provider to use",
    )
    parser.add_argument(
        "--basic_optimization",
        required=False,
        action="store_true",
        help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
    )
    parser.set_defaults(basic_optimization=False)
    parser.add_argument(
        "--kernel_time_only",
        required=False,
        action="store_true",
        help="Only include the kernel time and no fence time",
    )
    parser.set_defaults(kernel_time_only=False)
    parser.add_argument("-v", "--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)

    return parser.parse_args(argv)


def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_num, all_inputs):
    from benchmark_helper import create_onnxruntime_session

    session = create_onnxruntime_session(
        onnx_model_path,
        use_gpu,
        provider,
        enable_all_optimization=not basic_optimization,
        num_threads=thread_num,
        enable_profiling=True,
    )

    for inputs in all_inputs:
        _ = session.run(None, inputs)

    profile_file = session.end_profiling()
    return profile_file
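

# A minimal usage sketch (assumes a local model.onnx and the helper functions defined
# later in this file; the file name and argument values are illustrative, not required):
#
#   from onnx import load
#   from onnx_model import OnnxModel
#
#   model = OnnxModel(load("model.onnx"))
#   inputs = create_dummy_inputs(model, batch_size=1, sequence_length=32, samples=10)
#   profile_file = run_profile(
#       "model.onnx", use_gpu=False, provider="cuda", basic_optimization=False,
#       thread_num=-1, all_inputs=inputs,
#   )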


def load_profile_json(profile_file):
    print(f"loading profile output {profile_file} ...")

    with open(profile_file, "r") as opened_file:
        sess_time = json.load(opened_file)

    assert isinstance(sess_time, list)
    return sess_time
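

# The profile JSON is a flat list of trace events. The parsers below only read a few
# fields of each entry; a typical "Node" record looks roughly like this (values illustrative):
#   {"cat": "Node", "name": "attention_1_kernel_time", "dur": 123,
#    "args": {"op_name": "Attention", "provider": "CUDAExecutionProvider"}}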


def parse_kernel_results(sess_time, threshold=0):
    """Parse profile data and output kernel results in two sections - top expensive kernels, and kernel time grouped by operator.

    Args:
        sess_time (List[Dict]): profile data
        threshold (float, optional): Minimum ratio of duration among all. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    """
    kernel_name_to_op_name = {}
    kernel_time = {}
    kernel_freq = {}
    total = 0
    session_init = False
    for item in sess_time:
        # Skip all MemcpyHostToDevice before session_initialization
        if item["cat"] == "Session" and item["name"] == "session_initialization":
            session_init = True
        if not session_init:
            continue

        if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
            kernel_name = item["name"]

            op_name = item["args"]["op_name"]
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
            if not op_name:
                op_name = f"({kernel_name})"

            if kernel_name in kernel_time:
                kernel_time[kernel_name] += item["dur"]
                kernel_freq[kernel_name] += 1
            else:
                kernel_time[kernel_name] = item["dur"]
                kernel_freq[kernel_name] = 1

            kernel_name_to_op_name[kernel_name] = op_name

            total += item["dur"]

    if not kernel_time:
        return ["No kernel record found!"]

    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
    lines = []
    lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
    for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        if ratio < threshold:
            continue

        calls = kernel_freq[kernel_name]
        avg_time = duration / float(calls)
        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")

    # Group by operator
    op_time = {}
    for kernel_name, op_name in kernel_name_to_op_name.items():
        duration = kernel_time[kernel_name]
        if op_name in op_time:
            op_time[op_name] += duration
        else:
            op_time[op_name] = duration

    lines.append("\nGroup kernel time by operator:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tOperator")
    for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")

    return lines


def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
        threshold (float, optional): Minimum ratio of duration among all. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    """
    node_name_list = []
    node_time = {}
    node_freq = {}
    node_provider = {}
    total = 0
    for item in sess_time:
        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
            node_name = (
                item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
            )

            if "provider" in item["args"]:
                if item["args"]["provider"] == "CPUExecutionProvider":
                    device = "CPU"
                elif item["args"]["provider"] == "CUDAExecutionProvider":
                    device = "CUDA"
                elif item["args"]["provider"] == "DmlExecutionProvider":
                    device = "DML"
                else:
                    # Fall back to the raw provider name so `device` is always defined for other providers.
                    device = item["args"]["provider"].replace("ExecutionProvider", "")

                if node_name not in node_provider:
                    node_provider[node_name] = device
                else:
                    assert node_provider[node_name] == device
            elif kernel_time_only:
                continue

            op_name = item["args"]["op_name"]
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            if node_name in node_time:
                node_time[node_name] += item["dur"]
                node_freq[node_name] += 1
            else:
                node_time[node_name] = item["dur"]
                node_freq[node_name] = 1
                node_name_list.append(node_name)

            total += item["dur"]

    # Output items in the original order.
    lines = [
        "\nNodes in the original order:",
        "-" * 64,
        "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
    ]
    before_percentage = 0.0
    for node_name in node_name_list:
        duration = node_time[node_name]
        calls = node_freq[node_name]
        avg_time = duration / float(calls)
        percentage = (duration / total) * 100.0
        provider = node_provider[node_name] if node_name in node_provider else ""
        before_percentage += percentage
        lines.append(
            f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
        )

    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
    lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
    for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        if ratio < threshold:
            continue

        calls = node_freq[node_name]
        avg_time = duration / float(calls)
        percentage = (duration / total) * 100.0
        provider = node_provider[node_name] if node_name in node_provider else ""
        lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")

    return lines
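

# Example row from the "Nodes in the original order" section (all numbers illustrative):
#   Total(μs)  Time%  Acc %  Avg(μs)  Calls  Provider  Node
#        1234   3.21  45.67    123.4     10  CUDA      Attention_1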


def group_node_results(sess_time, kernel_time_only, use_gpu):
    """Group results by operator name.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool): Only include items for kernel time.
        use_gpu (bool): GPU is used in profiling or not.

    Returns:
        List[str]: lines of string for output.
    """
    op_kernel_time = {}
    op_kernel_records = {}
    total_kernel_time = 0

    provider_op_kernel_time = {}
    provider_op_kernel_records = {}
    provider_kernel_time = {}

    op_fence_time = {}
    total_fence_time = 0

    provider_counter = {}
    for item in sess_time:
        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
            op_name = item["args"]["op_name"]

            # TODO: shall we have a separated group for nodes with subgraph?
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            if "provider" not in item["args"]:
                if "fence" in item["name"]:
                    if op_name in op_fence_time:
                        op_fence_time[op_name] += item["dur"]
                    else:
                        op_fence_time[op_name] = item["dur"]
                    total_fence_time += item["dur"]
                continue

            provider = item["args"]["provider"] if "provider" in item["args"] else ""
            if provider in provider_counter:
                provider_counter[provider] += 1
            else:
                provider_counter[provider] = 1

            key = f"{provider}:{op_name}"
            if key in provider_op_kernel_time:
                provider_op_kernel_time[key] += item["dur"]
                provider_op_kernel_records[key] += 1
            else:
                provider_op_kernel_time[key] = item["dur"]
                provider_op_kernel_records[key] = 1

            if provider in provider_kernel_time:
                provider_kernel_time[provider] += item["dur"]
            else:
                provider_kernel_time[provider] = item["dur"]

            if op_name in op_kernel_time:
                op_kernel_time[op_name] += item["dur"]
                op_kernel_records[op_name] += 1
            else:
                op_kernel_time[op_name] = item["dur"]
                op_kernel_records[op_name] = 1

            total_kernel_time += item["dur"]

    lines = ["", "Grouped by operator"]
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
    for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
        fence_time = op_fence_time[op_name] if op_name in op_fence_time else 0
        kernel_time_ratio = kernel_time / total_kernel_time
        total_time = kernel_time + fence_time
        time_ratio = total_time / (total_kernel_time + total_fence_time)
        kernel_calls = op_kernel_records[op_name]
        avg_kernel_time = kernel_time / kernel_calls
        lines.append(
            f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
        )

    lines += ["", "Grouped by provider + operator"]
    lines.append("-" * 64)
    lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
    for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
        parts = key.split(":")
        provider = parts[0]
        op_name = parts[1]
        short_ep = provider.replace("ExecutionProvider", "")
        calls = provider_op_kernel_records[key]
        avg_kernel_time = kernel_time / calls
        provider_time_ratio = kernel_time / provider_kernel_time[provider]
        lines.append(
            f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
        )

    return lines


def get_dim_from_type_proto(dim):
    return getattr(dim, dim.WhichOneof("value")) if isinstance(dim.WhichOneof("value"), str) else None


def get_shape_from_type_proto(type_proto):
    return [get_dim_from_type_proto(d) for d in type_proto.tensor_type.shape.dim]
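

# For a tensor input typed as float32[batch_size, sequence_length, 768] (a hypothetical
# example), the helpers above return ["batch_size", "sequence_length", 768]: named symbolic
# dimensions come back as strings, fixed dimensions as ints, and unset dimensions as None.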


def create_dummy_inputs(onnx_model, batch_size, sequence_length, samples):
    """Create dummy inputs for ONNX model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples

    Returns:
        List[Dict]: list of inputs
    """
    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        symbol_dims = []
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                symbol_dims.append(i)

        # allowed symbolic dimensions: batch_size and sequence_length
        if len(symbol_dims) > 2:
            return None

        if len(symbol_dims) > 0:
            shape[symbol_dims[0]] = batch_size
        if len(symbol_dims) > 1:
            shape[symbol_dims[1]] = sequence_length

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )
        data = numpy.ones(shape, dtype=data_type)
        dummy_inputs[graph_input.name] = data

    all_inputs = [dummy_inputs for _ in range(samples)]
    return all_inputs
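

# Sketch of the result for a model whose only input is input_ids:int64[batch_size, sequence_length]
# (a hypothetical input name): each of the `samples` entries is the same dict, e.g.
#   [{"input_ids": numpy.ones((batch_size, sequence_length), dtype=numpy.int64)}, ...]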


def create_bert_inputs(
    onnx_model,
    batch_size,
    sequence_length,
    samples,
    input_ids_name=None,
    segment_ids_name=None,
    input_mask_name=None,
):
    """Create dummy inputs for BERT model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Returns:
        List[Dict]: list of inputs
    """
    from bert_test_data import find_bert_inputs, generate_test_data

    input_ids, segment_ids, input_mask = find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases=samples,
        seed=123,
        verbose=False,
        input_ids=input_ids,
        segment_ids=segment_ids,
        input_mask=input_mask,
        random_mask_length=False,
    )

    return all_inputs


def create_gpt2_inputs(onnx_model, batch_size, sequence_length, past_sequence_length, samples):
    """Create dummy inputs for GPT-2 model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        past_sequence_length (int): past sequence length
        samples (int): number of samples

    Raises:
        RuntimeError: a symbolic dimension is not supported. Use the tool convert_to_onnx.py to export the ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    """
    # The symbolic names shall be the same as those used in the Gpt2Helper.export_onnx(...) function.
    symbols = {
        "batch_size": batch_size,
        "seq_len": sequence_length,
        "past_seq_len": past_sequence_length,
        "total_seq_len": sequence_length + past_sequence_length,
    }

    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                if dim not in symbols.keys():
                    raise RuntimeError(f"symbol is not supported: {dim}")
                else:
                    shape[i] = symbols[dim]

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )
        data = numpy.ones(shape, dtype=data_type)
        dummy_inputs[graph_input.name] = data

    all_inputs = [dummy_inputs for _ in range(samples)]
    return all_inputs


def create_longformer_inputs(onnx_model, batch_size, sequence_length, global_length, samples):
    """Create dummy inputs for Longformer model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        global_length (int): number of global tokens
        samples (int): number of samples

    Raises:
        RuntimeError: a symbolic dimension is not supported. Use the tool convert_longformer_to_onnx.py to export the ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    """
    symbols = {"batch_size": batch_size, "sequence_length": sequence_length}

    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                if dim not in symbols.keys():
                    raise RuntimeError(f"symbol is not supported: {dim}")
                else:
                    shape[i] = symbols[dim]

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )
        if "global" in graph_input.name:
            data = numpy.zeros(shape, dtype=data_type)
            data[:, :global_length] = 1
        else:
            data = numpy.ones(shape, dtype=data_type)
        dummy_inputs[graph_input.name] = data

    all_inputs = [dummy_inputs for _ in range(samples)]
    return all_inputs


def process_results(profile_file, args):
    profile_records = load_profile_json(profile_file)

    lines = parse_kernel_results(profile_records, args.threshold)

    lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)

    lines += group_node_results(profile_records, args.kernel_time_only, args.use_gpu)

    return lines


def run(args):
    num_threads = args.thread_num if args.thread_num > 0 else psutil.cpu_count(logical=False)

    # Set the OMP environment variable before importing onnxruntime. Needed for CPU only; no impact for the onnxruntime-gpu package.
    if "OMP_NUM_THREADS" not in os.environ:
        os.environ["OMP_NUM_THREADS"] = str(num_threads)

    from onnx import load
    from onnx_model import OnnxModel

    onnx_model = OnnxModel(load(args.model))

    all_inputs = None
    if args.dummy_inputs == "bert":
        all_inputs = create_bert_inputs(
            onnx_model,
            args.batch_size,
            args.sequence_length,
            args.samples,
            args.input_ids_name,
            args.segment_ids_name,
            args.input_mask_name,
        )
    elif args.dummy_inputs == "gpt2":
        all_inputs = create_gpt2_inputs(
            onnx_model,
            args.batch_size,
            args.sequence_length,
            args.past_sequence_length,
            args.samples,
        )
    elif args.dummy_inputs == "longformer":
        all_inputs = create_longformer_inputs(
            onnx_model,
            args.batch_size,
            args.sequence_length,
            args.global_length,
            args.samples,
        )
    else:  # default
        all_inputs = create_dummy_inputs(onnx_model, args.batch_size, args.sequence_length, args.samples)

    profile_file = run_profile(
        args.model,
        args.use_gpu,
        args.provider,
        args.basic_optimization,
        args.thread_num,
        all_inputs,
    )

    return profile_file
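

# run() returns the JSON file written by session.end_profiling(); that file can later be
# passed back through --input to reprint the reports without rerunning the model (see the
# second example in the module docstring).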


if __name__ == "__main__":
    arguments = parse_arguments()
    print("Arguments", arguments)

    from benchmark_helper import setup_logger

    setup_logger(arguments.verbose)

    if not arguments.input:
        assert arguments.model, "requires either --model to run profiling or --input to read profiling results"
        profile_file = run(arguments)
    else:
        profile_file = arguments.input

    results = process_results(profile_file, arguments)

    for line in results:
        print(line)