import argparse
import json
import os

import numpy
import psutil
from onnx import TensorProto

"""
|
|
This profiler tool could run a transformer model and print out the kernel time spent on each Node of the model.
|
|
Example of profiling of longformer model:
|
|
python profiler.py --model longformer-base-4096_fp32.onnx --batch_size 1 --sequence_length 4096 --global_length 8 --samples 1000 --thread_num 8 --dummy_inputs longformer --use_gpu
|
|
Example of importing profile result file from onnxruntime_perf_test:
|
|
python profiler.py --input profile_2021-10-25_12-02-41.json
|
|
"""
|
|
|
|
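# Control-flow ops whose reported duration also covers the nodes executed inside their subgraphs;
# the parsers below skip them, presumably to avoid double counting that time.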
NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]


def parse_arguments(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i",
        "--input",
        required=False,
        type=str,
        help="Set the input file for reading the profile results",
    )

    parser.add_argument(
        "-m",
        "--model",
        required=False,
        type=str,
        help="onnx model path to run profiling. Required when --input is not specified.",
    )

    parser.add_argument(
        "-b",
        "--batch_size",
        required=False,
        type=int,
        default=1,
        help="batch size of input",
    )

    parser.add_argument(
        "-s",
        "--sequence_length",
        required=False,
        type=int,
        default=32,
        help="sequence length of input",
    )

    parser.add_argument(
        "--past_sequence_length",
        required=False,
        type=int,
        default=1,
        help="past sequence length for gpt2",
    )

    parser.add_argument(
        "--global_length",
        required=False,
        type=int,
        default=1,
        help="number of global tokens for longformer",
    )

    parser.add_argument(
        "--samples",
        required=False,
        type=int,
        default=1000,
        help="number of samples to test. Set it large enough to reduce the variance of performance result.",
    )

    parser.add_argument(
        "--threshold",
        required=False,
        type=float,
        default=0.01,
        help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
    )

    parser.add_argument(
        "--thread_num",
        required=False,
        type=int,
        default=-1,
        help="number of threads to use",
    )

    parser.add_argument(
        "--input_ids_name",
        required=False,
        type=str,
        default=None,
        help="input name for input IDs, for bert",
    )
    parser.add_argument(
        "--segment_ids_name",
        required=False,
        type=str,
        default=None,
        help="input name for segment IDs, for bert",
    )
    parser.add_argument(
        "--input_mask_name",
        required=False,
        type=str,
        default=None,
        help="input name for attention mask, for bert",
    )

    parser.add_argument(
        "--dummy_inputs",
        required=False,
        default="default",
        choices=["bert", "gpt2", "longformer", "default"],
        help="Type of model inputs. The default will create dummy inputs with ones.",
    )

    parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="use GPU")
    parser.set_defaults(use_gpu=False)

    parser.add_argument(
        "--provider",
        required=False,
        type=str,
        default="cuda",
        help="Execution provider to use",
    )

    parser.add_argument(
        "--basic_optimization",
        required=False,
        action="store_true",
        help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
    )
    parser.set_defaults(basic_optimization=False)

    parser.add_argument(
        "--kernel_time_only",
        required=False,
        action="store_true",
        help="Only include the kernel time and no fence time",
    )
    parser.set_defaults(kernel_time_only=False)

    parser.add_argument("-v", "--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)

    return parser.parse_args(argv)


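# Usage sketch for run_profile (values are illustrative; this mirrors how run() calls it below):
#   profile_file = run_profile("model.onnx", use_gpu=False, provider="cuda",
#                              basic_optimization=False, thread_num=-1, all_inputs=all_inputs)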
def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_num, all_inputs):
    from benchmark_helper import create_onnxruntime_session

    session = create_onnxruntime_session(
        onnx_model_path,
        use_gpu,
        provider,
        enable_all_optimization=not basic_optimization,
        num_threads=thread_num,
        enable_profiling=True,
    )

    for inputs in all_inputs:
        _ = session.run(None, inputs)

    profile_file = session.end_profiling()
    return profile_file


def load_profile_json(profile_file):
    print(f"loading profile output {profile_file} ...")

    with open(profile_file, "r") as opened_file:
        sess_time = json.load(opened_file)

    assert isinstance(sess_time, list)
    return sess_time


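# Each record in the profile is a chrome-trace style event. The parsers below rely on fields
# like "cat", "name", "dur" (duration in microseconds) and "args" (e.g. "op_name", "provider").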
def parse_kernel_results(sess_time, threshold=0):
    """Parse profile data and output kernels in two sections - top expensive kernels, and kernel time grouped by operator.

    Args:
        sess_time (List[Dict]): profile data
        threshold (float, optional): Minimum ratio of duration among all kernels. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    """
    kernel_name_to_op_name = {}
    kernel_time = {}
    kernel_freq = {}
    total = 0
    session_init = False
    for item in sess_time:
        # Skip all MemcpyHostToDevice before session_initialization
        if item["cat"] == "Session" and item["name"] == "session_initialization":
            session_init = True
        if not session_init:
            continue

        if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
            kernel_name = item["name"]

            op_name = item["args"]["op_name"]
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
            if not op_name:
                op_name = f"({kernel_name})"

            if kernel_name in kernel_time:
                kernel_time[kernel_name] += item["dur"]
                kernel_freq[kernel_name] += 1
            else:
                kernel_time[kernel_name] = item["dur"]
                kernel_freq[kernel_name] = 1
                kernel_name_to_op_name[kernel_name] = op_name

            total += item["dur"]

    if not kernel_time:
        return ["No kernel record found!"]

    # Output items with run time ratio > threshold, sorted by duration in descending order.
    lines = []
    lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
    for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        if ratio < threshold:
            continue

        calls = kernel_freq[kernel_name]
        avg_time = duration / float(calls)
        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")

    # Group by operator
    op_time = {}
    for kernel_name, op_name in kernel_name_to_op_name.items():
        duration = kernel_time[kernel_name]
        if op_name in op_time:
            op_time[op_name] += duration
        else:
            op_time[op_name] = duration

    lines.append("\nGroup kernel time by operator:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tOperator")
    for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")

    return lines


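# Note: parse_kernel_results() above aggregates device kernel events ("cat" == "Kernel"),
# while parse_node_results() below works on per-node events ("cat" == "Node").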
def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
        threshold (float, optional): Minimum ratio of duration among all. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    """
    node_name_list = []
    node_time = {}
    node_freq = {}
    node_provider = {}
    total = 0
    for item in sess_time:
        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
            node_name = (
                item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
            )

            if "provider" in item["args"]:
                if item["args"]["provider"] == "CPUExecutionProvider":
                    device = "CPU"
                elif item["args"]["provider"] == "CUDAExecutionProvider":
                    device = "CUDA"
                elif item["args"]["provider"] == "DmlExecutionProvider":
                    device = "DML"
                else:
                    # Fall back to a shortened provider name so that device is always defined.
                    device = item["args"]["provider"].replace("ExecutionProvider", "")

                if node_name not in node_provider:
                    node_provider[node_name] = device
                else:
                    assert node_provider[node_name] == device
            elif kernel_time_only:
                continue

            op_name = item["args"]["op_name"]
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            if node_name in node_time:
                node_time[node_name] += item["dur"]
                node_freq[node_name] += 1
            else:
                node_time[node_name] = item["dur"]
                node_freq[node_name] = 1
                node_name_list.append(node_name)

            total += item["dur"]

    # Output items in the original order.
    lines = [
        "\nNodes in the original order:",
        "-" * 64,
        "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
    ]
    before_percentage = 0.0
    for node_name in node_name_list:
        duration = node_time[node_name]
        calls = node_freq[node_name]
        avg_time = duration / float(calls)
        percentage = (duration / total) * 100.0
        provider = node_provider[node_name] if node_name in node_provider else ""
        before_percentage += percentage
        lines.append(
            f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
        )

    # Output items with run time ratio > threshold, sorted by duration in descending order.
    lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
    for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        if ratio < threshold:
            continue

        calls = node_freq[node_name]
        avg_time = duration / float(calls)
        percentage = (duration / total) * 100.0
        provider = node_provider[node_name] if node_name in node_provider else ""
        lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")

    return lines


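# In the "Node" events, entries without a "provider" field whose names contain "fence" appear to
# be synchronization (fence) records emitted before/after a node; group_node_results() below
# accumulates them separately from kernel time.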
def group_node_results(sess_time, kernel_time_only, use_gpu):
    """Group results by operator name.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool): Only include items for kernel time.
        use_gpu (bool): GPU is used in profiling or not.

    Returns:
        List[str]: lines of string for output.
    """
    op_kernel_time = {}
    op_kernel_records = {}
    total_kernel_time = 0

    provider_op_kernel_time = {}
    provider_op_kernel_records = {}
    provider_kernel_time = {}

    op_fence_time = {}
    total_fence_time = 0

    provider_counter = {}
    for item in sess_time:
        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
            op_name = item["args"]["op_name"]

            # TODO: shall we have a separate group for nodes with subgraph?
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            if "provider" not in item["args"]:
                if "fence" in item["name"]:
                    if op_name in op_fence_time:
                        op_fence_time[op_name] += item["dur"]
                    else:
                        op_fence_time[op_name] = item["dur"]
                    total_fence_time += item["dur"]
                continue

            provider = item["args"]["provider"] if "provider" in item["args"] else ""
            if provider in provider_counter:
                provider_counter[provider] += 1
            else:
                provider_counter[provider] = 1

            key = f"{provider}:{op_name}"
            if key in provider_op_kernel_time:
                provider_op_kernel_time[key] += item["dur"]
                provider_op_kernel_records[key] += 1
            else:
                provider_op_kernel_time[key] = item["dur"]
                provider_op_kernel_records[key] = 1

            if provider in provider_kernel_time:
                provider_kernel_time[provider] += item["dur"]
            else:
                provider_kernel_time[provider] = item["dur"]

            if op_name in op_kernel_time:
                op_kernel_time[op_name] += item["dur"]
                op_kernel_records[op_name] += 1
            else:
                op_kernel_time[op_name] = item["dur"]
                op_kernel_records[op_name] = 1

            total_kernel_time += item["dur"]

    lines = ["", "Grouped by operator"]
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
    for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
        fence_time = op_fence_time[op_name] if op_name in op_fence_time else 0
        kernel_time_ratio = kernel_time / total_kernel_time
        total_time = kernel_time + fence_time
        time_ratio = total_time / (total_kernel_time + total_fence_time)
        kernel_calls = op_kernel_records[op_name]
        avg_kernel_time = kernel_time / kernel_calls
        lines.append(
            f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
        )

    lines += ["", "Grouped by provider + operator"]
    lines.append("-" * 64)
    lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
    for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
        parts = key.split(":")
        provider = parts[0]
        op_name = parts[1]
        short_ep = provider.replace("ExecutionProvider", "")
        calls = provider_op_kernel_records[key]
        avg_kernel_time = kernel_time / calls
        provider_time_ratio = kernel_time / provider_kernel_time[provider]
        lines.append(
            f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
        )

    return lines


def get_dim_from_type_proto(dim):
    return getattr(dim, dim.WhichOneof("value")) if isinstance(dim.WhichOneof("value"), str) else None


def get_shape_from_type_proto(type_proto):
    return [get_dim_from_type_proto(d) for d in type_proto.tensor_type.shape.dim]


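# For example, an input typed as float32["batch_size", "sequence_length", 768] yields the shape
# ["batch_size", "sequence_length", 768]: symbolic dimensions come back as strings (dim_param) and
# concrete ones as integers (dim_value); the create_*_inputs helpers below resolve the strings.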
def create_dummy_inputs(onnx_model, batch_size, sequence_length, samples):
    """Create dummy inputs for ONNX model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples

    Returns:
        List[Dict]: list of inputs
    """
    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        symbol_dims = []
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                symbol_dims.append(i)

        # allowed symbolic dimensions: batch_size and sequence_length
        if len(symbol_dims) > 2:
            return None
        if len(symbol_dims) > 0:
            shape[symbol_dims[0]] = batch_size
        if len(symbol_dims) > 1:
            shape[symbol_dims[1]] = sequence_length

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )
        data = numpy.ones(shape, dtype=data_type)
        dummy_inputs[graph_input.name] = data

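    # Note: the same dict of all-ones arrays is reused for every sample, so the returned list
    # contains `samples` references to one {input_name: numpy.ndarray} mapping.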
    all_inputs = [dummy_inputs for _ in range(samples)]
    return all_inputs


def create_bert_inputs(
    onnx_model,
    batch_size,
    sequence_length,
    samples,
    input_ids_name=None,
    segment_ids_name=None,
    input_mask_name=None,
):
    """Create dummy inputs for BERT model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Returns:
        List[Dict]: list of inputs
    """
    from bert_test_data import find_bert_inputs, generate_test_data

    input_ids, segment_ids, input_mask = find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases=samples,
        seed=123,
        verbose=False,
        input_ids=input_ids,
        segment_ids=segment_ids,
        input_mask=input_mask,
        random_mask_length=False,
    )

    return all_inputs


def create_gpt2_inputs(onnx_model, batch_size, sequence_length, past_sequence_length, samples):
    """Create dummy inputs for GPT-2 model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        past_sequence_length (int): past sequence length
        samples (int): number of samples

    Raises:
        RuntimeError: a symbolic dimension is not supported. Use the tool convert_to_onnx.py to export the ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    """
    # The symbolic names shall be the same as those used in the Gpt2Helper.export_onnx(...) function.
    symbols = {
        "batch_size": batch_size,
        "seq_len": sequence_length,
        "past_seq_len": past_sequence_length,
        "total_seq_len": sequence_length + past_sequence_length,
    }

    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                if dim not in symbols:
                    raise RuntimeError(f"symbol is not supported: {dim}")
                else:
                    shape[i] = symbols[dim]

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )
        data = numpy.ones(shape, dtype=data_type)
        dummy_inputs[graph_input.name] = data

    all_inputs = [dummy_inputs for _ in range(samples)]
    return all_inputs


def create_longformer_inputs(onnx_model, batch_size, sequence_length, global_length, samples):
    """Create dummy inputs for Longformer model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        global_length (int): number of global tokens
        samples (int): number of samples

    Raises:
        RuntimeError: a symbolic dimension is not supported. Use the tool convert_longformer_to_onnx.py to export the ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    """
    symbols = {"batch_size": batch_size, "sequence_length": sequence_length}

    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                if dim not in symbols:
                    raise RuntimeError(f"symbol is not supported: {dim}")
                else:
                    shape[i] = symbols[dim]

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )

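        # Inputs whose names contain "global" are treated as the global attention mask:
        # mark the first global_length tokens as global (1) and leave the rest local (0).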
if "global" in graph_input.name:
|
|
data = numpy.zeros(shape, dtype=data_type)
|
|
data[:, :global_length] = 1
|
|
else:
|
|
data = numpy.ones(shape, dtype=data_type)
|
|
dummy_inputs[graph_input.name] = data
|
|
|
|
all_inputs = [dummy_inputs for _ in range(samples)]
|
|
return all_inputs
|
|
|
|
|
|
def process_results(profile_file, args):
    profile_records = load_profile_json(profile_file)

    lines = parse_kernel_results(profile_records, args.threshold)

    lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)

    lines += group_node_results(profile_records, args.kernel_time_only, args.use_gpu)

    return lines


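# run() is the profiling path used when --model is given; when --input is given instead, the
# __main__ block below skips profiling and only parses an existing profile JSON file.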
def run(args):
    num_threads = args.thread_num if args.thread_num > 0 else psutil.cpu_count(logical=False)

    # Set the OMP environment variable before importing onnxruntime. Needed for CPU only; it has no impact on the onnxruntime-gpu package.
    if "OMP_NUM_THREADS" not in os.environ:
        os.environ["OMP_NUM_THREADS"] = str(num_threads)

    from onnx import load
    from onnx_model import OnnxModel

    onnx_model = OnnxModel(load(args.model))

    all_inputs = None
    if args.dummy_inputs == "bert":
        all_inputs = create_bert_inputs(
            onnx_model,
            args.batch_size,
            args.sequence_length,
            args.samples,
            args.input_ids_name,
            args.segment_ids_name,
            args.input_mask_name,
        )
    elif args.dummy_inputs == "gpt2":
        all_inputs = create_gpt2_inputs(
            onnx_model,
            args.batch_size,
            args.sequence_length,
            args.past_sequence_length,
            args.samples,
        )
    elif args.dummy_inputs == "longformer":
        all_inputs = create_longformer_inputs(
            onnx_model,
            args.batch_size,
            args.sequence_length,
            args.global_length,
            args.samples,
        )
    else:  # default
        all_inputs = create_dummy_inputs(onnx_model, args.batch_size, args.sequence_length, args.samples)

    profile_file = run_profile(
        args.model,
        args.use_gpu,
        args.provider,
        args.basic_optimization,
        args.thread_num,
        all_inputs,
    )

    return profile_file


if __name__ == "__main__":
    arguments = parse_arguments()
    print("Arguments", arguments)

    from benchmark_helper import setup_logger

    setup_logger(arguments.verbose)

    if not arguments.input:
        assert arguments.model, "requires either --model to run profiling or --input to read profiling results"
        profile_file = run(arguments)
    else:
        profile_file = arguments.input

    results = process_results(profile_file, arguments)

    for line in results:
        print(line)