pytorch
4784 lines · 168.3 KB
1#!/usr/bin/env python3
2
3from __future__ import annotations4
5import abc6import argparse7import collections8import contextlib9import copy10import csv11import dataclasses12import functools13import importlib14import itertools15import logging16import os17import shutil18import signal19import subprocess20import sys21import time22import weakref23from contextlib import contextmanager24from pathlib import Path25from typing import (26Any,27Callable,28Generator,29List,30Mapping,31NamedTuple,32Optional,33Sequence,34Tuple,35Type,36TYPE_CHECKING,37)
38from typing_extensions import Self39from unittest.mock import MagicMock40
41import numpy as np42import pandas as pd43import psutil44import yaml45from scipy.stats import gmean, ttest_ind46from tqdm.auto import tqdm, trange47
48import torch49import torch._dynamo50import torch._dynamo.utils51import torch._export52import torch.distributed53import torch.multiprocessing as mp54from torch._C import _has_cuda as HAS_CUDA, _has_xpu as HAS_XPU55from torch._dynamo.profiler import fx_insert_profiling, Profiler56from torch._dynamo.testing import (57dummy_fx_compile,58format_speedup,59reset_rng_state,60same,61)
62
63
64try:65from torch._dynamo.utils import (66clone_inputs,67graph_break_reasons,68maybe_enable_compiled_autograd,69)70from torch._inductor.utils import fresh_inductor_cache71except ImportError:72from _dynamo.utils import (73clone_inputs,74graph_break_reasons,75maybe_enable_compiled_autograd,76)77
78import torch._functorch.config79from torch._functorch.aot_autograd import set_model_name80from torch._inductor import config as inductor_config, metrics81from torch._subclasses.fake_tensor import FakeTensorMode82from torch.utils import _pytree as pytree83from torch.utils._pytree import tree_map, tree_map_only84
85
86try:87import torch_xla88import torch_xla.core.xla_model as xm89
90# This is to woraround the backward issue https://github.com/pytorch/xla/issues/417491torch_xla._XLAC._init_computation_client()92except ImportError:93# ignore the error if torch_xla is not installed94pass95
96
97if TYPE_CHECKING:98from torch.onnx._internal.fx import diagnostics99
100
# Module-level logger for the benchmark harness.
log = logging.getLogger(__name__)

# We are primarily interested in TF32
torch.backends.cuda.matmul.allow_tf32 = True

# Suppress torch.profiler spam
os.environ["KINETO_LOG_LEVEL"] = "5"

# Mutable module-level state describing the run in progress. These are
# written by the driver and read by the experiment/reporting helpers below.
current_name = ""  # name of the model currently being benchmarked
current_device = ""  # device string for the current run ("cuda", "cpu", ...)
current_onnx_compiler = ""  # ONNX backend in use, if any
current_batch_size = None  # batch size of the current run
output_filename = None  # CSV file that results are appended to
disable_output = False  # when True, output_csv() becomes a no-op

# Maximum number of attempts when downloading model artifacts.
MAX_DOWNLOAD_ATTEMPTS = 5
118
class CI(NamedTuple):
    """One CI benchmark configuration (backend/mode/device combination)."""

    backend: str  # aot_eager or inductor
    training: bool
    dynamic: bool = False
    device: str = "cuda"
125
# Models skipped in CI when an optimizer is enabled; the reason is noted
# next to each entry.
CI_SKIP_OPTIMIZER = {
    # TIMM
    "convmixer_768_32",  # accuracy
    "hrnet_w18",  # Stack issue in fx
    # HF
    "pnasnet5large",  # Stack issue in fx
    "MobileBertForMaskedLM",  # Stack issue in fx
    "MobileBertForQuestionAnswering",  # Stack issue in fx
    "PegasusForConditionalGeneration",  # OOM
}
136
try:
    # Internal-only additions to the dynamic-batch skip list; absent in the
    # open-source checkout, hence the empty-set fallback.
    from .fb.common import INTERNAL_CI_SKIP_DYNAMIC_BATCH_ONLY
except ImportError:
    INTERNAL_CI_SKIP_DYNAMIC_BATCH_ONLY = set()

# Models that are skipped in CI only when dynamic batch size is enabled.
CI_SKIP_DYNAMIC_BATCH_ONLY = {
    "sam",
    # See https://github.com/mindee/doctr/blob/f2114758d529ed8d3d0030581638f0520b6b98d8/doctr/models/detection/core.py#L89
    # It iterates over the batch, which is dynamic, and dynamo chokes
    # We should be able to graphbreak there.
    "doctr_det_predictor",
    "dlrm",
    "pyhpc_isoneutral_mixing",
    "pyhpc_equation_of_state",
    "pyhpc_turbulent_kinetic_energy",
    "detectron2_fcos_r_50_fpn",
    "detectron2_fasterrcnn_r_101_c4",
    "detectron2_fasterrcnn_r_101_dc5",
    "detectron2_fasterrcnn_r_101_fpn",
    "detectron2_fasterrcnn_r_50_c4",
    "detectron2_fasterrcnn_r_50_dc5",
    "detectron2_fasterrcnn_r_50_fpn",
    "hf_T5_generate",
    "Reformer",
}.union(INTERNAL_CI_SKIP_DYNAMIC_BATCH_ONLY)
# These models currently fail accuracy with eager Adam optimizer
# so we use SGD when running the full benchmarks
# https://github.com/pytorch/pytorch/issues/115966
BENCHMARK_USE_SGD = {
    # TorchBench
    "BERT_pytorch",
    "LearningToPaint",
    "alexnet",
    "dcgan",
    "demucs",
    "densenet121",
    "dlrm",
    "fastNLP_Bert",
    "mobilenet_v2",
    "phlippe_densenet",
    "phlippe_resnet",
    "pytorch_stargan",
    "resnet18",
    "shufflenet_v2_x1_0",
    "speech_transformer",
    "squeezenet1_1",
    "stable_diffusion_text_encoder",
    "timm_efficientdet",
    "timm_nfnet",
    "timm_regnet",
    "timm_vision_transformer",
    "timm_vovnet",
    "vgg16",
    "hf_T5",  # Fails dynamic https://github.com/pytorch/pytorch/issues/115968
    # HF
    "AlbertForMaskedLM",
    "BartForCausalLM",
    "BartForConditionalGeneration",
    "BlenderbotSmallForCausalLM",
    "BlenderbotSmallForConditionalGeneration",
    "DebertaV2ForQuestionAnswering",  # eager OOM
    "ElectraForCausalLM",
    "M2M100ForConditionalGeneration",
    "MBartForCausalLM",
    "MBartForConditionalGeneration",
    "OPTForCausalLM",
    "PLBartForCausalLM",
    "PLBartForConditionalGeneration",
    "PegasusForCausalLM",
    "Speech2Text2ForCausalLM",
    "TrOCRForCausalLM",
    "XGLMForCausalLM",
    # TIMM
    "adv_inception_v3",
    "botnet26t_256",
    "cait_m36_384",  # OOM
    "coat_lite_mini",
    "convit_base",
    "dpn107",
    "fbnetv3_b",
    "gernet_l",
    "lcnet_050",
    "mixnet_l",
    "res2net101_26w_4s",
    "res2net50_14w_8s",
    "res2next50",
    "resnest101e",
    "sebotnet33ts_256",
    "swsl_resnext101_32x16d",
    "tf_efficientnet_b0",
    "ghostnet_100",
    "gmixer_24_224",
    "tinynet_a",
}
232
# These models OOM in CI
# due to the extra memory of Adam optimizer states,
# so we fall back to SGD in CI
CI_USE_SGD = {
    "torchrec_dlrm",
    "demucs",
    "detectron2_fasterrcnn_r_101_c4",
    "detectron2_fasterrcnn_r_101_dc5",
    "detectron2_fasterrcnn_r_101_fpn",
    "detectron2_fasterrcnn_r_50_c4",
    "detectron2_fasterrcnn_r_50_dc5",
    "detectron2_fasterrcnn_r_50_fpn",
    "detectron2_maskrcnn_r_101_c4",
    "detectron2_maskrcnn_r_101_fpn",
    "detectron2_maskrcnn_r_50_c4",
    "detectron2_maskrcnn_r_50_fpn",
    "hf_T5_base",
    "hf_clip",
    "llama_v2_7b_16h",
    "mobilenet_v2_quantized_qat",
    # BUG FIX: these two names were fused into the single string
    # "phi_1_5 resnet50_quantized_qat", which matches no model name.
    "phi_1_5",
    "resnet50_quantized_qat",
    "BlenderbotForCausalLM",
    "cait_m36_384",
    "DALLE2_pytorch",
    "moco",
    "timm_efficientdet",
    "ghostnet_100",
    "regnety_002",
    "poolformer_m36",
    "inception_v3",
    "tinynet_a",
    "selecsls42b",
    "mobilevit_s",
    "pytorch_CycleGAN_and_pix2pix",
    "vision_maskrcnn",
    "resmlp_12_224",
    "dlrm",
    "resnet50",
    "dm_nfnet_f0",
    "pit_b_224",
    "tf_mixnet_l",
}
275
276
# Model names whose example inputs must never be dtype-cast by the harness.
DO_NOT_CAST_INPUTS = {"stable_diffusion"}
279
# Maps a benchmark model name to a list of status codes. For any listed entry, we'll
# capture TORCH_COMPILE_DEBUG logs in CI runs and preserve them (i.e., for upload) if
# the result status matches one listed.
CI_PRESERVE_COMPILE_DEBUG = {
    # For example:
    # "mnasnet1_0": ["fail_accuracy"],
}
287
288
@functools.lru_cache(maxsize=1)
def load_yaml_file(filename):
    """Load a YAML config that lives next to this module, with internal overlay.

    If an internal ("fb") copy of the file exists it is merged over the
    open-source one at the top level. Lists in the document (possibly
    nested) are flattened into sets; dicts are converted recursively.
    Cached, so the file is read at most once per process.
    """
    base_dir = os.path.dirname(__file__)
    with open(os.path.join(base_dir, filename)) as fh:
        data = yaml.safe_load(fh)

    internal_path = os.path.join(base_dir, "fb", filename)
    if os.path.exists(internal_path):
        with open(internal_path) as fh:
            data.update(yaml.safe_load(fh))

    def _flatten(seq):
        # Yield scalar items from arbitrarily nested lists.
        for element in seq:
            if isinstance(element, list):
                yield from _flatten(element)
            else:
                yield element

    def _normalize(node):
        # dicts recurse, lists become flattened sets, scalars pass through.
        if isinstance(node, dict):
            return {key: _normalize(value) for key, value in node.items()}
        if isinstance(node, list):
            return set(_flatten(node))
        return node

    return _normalize(data)
318
def model_specified_by_path(path_and_class_str):
    """Return True when --only was given a "path:...,class:..." spec."""
    return path_and_class_str.find(":") >= 0
322
def load_model_from_path(path_and_class_str):
    """Load a model from a "path:<abs_file>,class:<name>" specification.

    The file at ``path`` is imported as a fresh module and ``class`` is
    looked up inside it. The class must subclass ``torch.nn.Module`` and
    expose ``get_example_inputs()``.

    Returns:
        Tuple of (model instance, example inputs).

    Raises:
        RuntimeError: if "path" or "class" is missing, or the path is not
            absolute (dynamo may change the CWD, so relative paths are
            unreliable).
    """
    import importlib.util  # ensure the submodule is loaded; `import importlib` alone is not enough

    configs = {}
    for kvstr in path_and_class_str.split(","):
        # maxsplit=1 so values that themselves contain ":" don't break parsing
        k, v = kvstr.split(":", 1)
        configs[k] = v

    for name in ["path", "class"]:
        if name not in configs:
            raise RuntimeError(
                "Invalid --only arguments. Check help message for the correct format"
            )

    path = configs["path"]
    class_name = configs["class"]

    if path[:1] != "/":
        raise RuntimeError(
            "Use absolute path since dynamo may change the current working directory which makes using relative path tricky"
        )

    spec = importlib.util.spec_from_file_location("module_name", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    model_class = getattr(module, class_name)
    assert issubclass(model_class, torch.nn.Module)
    model = model_class()
    assert hasattr(model, "get_example_inputs")
    inputs = model.get_example_inputs()
    return model, inputs
354
def output_csv(filename, headers, row):
    """Append one result row to a CSV file, reconciling the header row.

    If the file exists, its current header is kept unless the new
    ``headers`` is longer (a prior failed run may have written a short
    header). Every row is right-padded with "0" to the header width.
    Respects the module-level ``disable_output`` kill switch.
    """
    global disable_output
    if disable_output:
        return

    if not os.path.exists(filename):
        rows = [headers]
    else:
        with open(filename) as fh:
            rows = list(csv.reader(fh)) or [[]]
        if headers and len(headers) > len(rows[0]):
            # if prior results failed the header might not be filled in yet
            rows[0] = headers
        else:
            headers = rows[0]

    formatted = [f"{item:.6f}" if isinstance(item, float) else item for item in row]
    rows.append(formatted)

    with open(filename, "w") as fh:
        writer = csv.writer(fh, lineterminator="\n")
        pad_to = len(headers)
        for entry in rows:
            writer.writerow(list(entry) + ["0"] * (pad_to - len(entry)))
375
def nothing(f):
    """Identity decorator, used when an optional wrapper is disabled."""
    return f
379
380@functools.lru_cache(None)381def patch_torch_manual_seed():382"""Make torch manual seed deterministic. Helps with accuracy testing."""383
384def deterministic_torch_manual_seed(*args, **kwargs):385from torch._C import default_generator386
387seed = 1337388if HAS_CUDA:389import torch.cuda390
391if not torch.cuda._is_in_bad_fork():392torch.cuda.manual_seed_all(seed)393if HAS_XPU:394import torch.xpu395
396if not torch.xpu._is_in_bad_fork():397torch.xpu.manual_seed_all(seed)398return default_generator.manual_seed(seed)399
400torch.manual_seed = deterministic_torch_manual_seed401
402
def empty_gpu_cache(device):
    """
    Explicitly empty gpu cache to avoid OOM in subsequent run.

    Only "cuda" and "xpu" are supported; any other device is logged and
    ignored.
    """
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        torch.xpu.empty_cache()
    else:
        log.warning(
            "Trying to call the empty_gpu_cache for device: %s, which is not in list [cuda, xpu]",
            device,
        )
420
def synchronize():
    """Device-synchronization hook; a no-op by default.

    Rebound at runtime (see ``override_synchronize_with_onnx_iobinding``)
    when a backend needs real synchronization around timed regions.
    """
    return None
424
def summarize_graph_break(filename):
    """
    Sorts and de-dupes the graphs breaks on the reason string. Note that this
    function is just a best effort to reduce the logging information. We could
    miss some graph breaks because of de-duping. We can further refine this
    function as need arises.

    Reads ``<base>_graph_breaks.csv`` (derived from ``filename``) and writes
    ``<base>_graph_breaks_deduped.csv``.
    """

    def _strip_csv_suffix(name):
        # BUG FIX: the original used str.rstrip(".csv"), which strips any
        # trailing '.', 'c', 's', 'v' characters (e.g. "metrics.csv" ->
        # "metric"), not the ".csv" suffix.
        return name[: -len(".csv")] if name.endswith(".csv") else name

    log_file = f"{_strip_csv_suffix(filename)}_graph_breaks.csv"
    if os.path.exists(log_file):
        df = pd.read_csv(log_file)
        df = df.sort_values("reason").drop_duplicates(subset="reason")

        # Specialize for multi tensor sgd as reason is not identical
        multi_tensor_sgd_row = df.loc[df["reason"].str.contains("_multi_tensor_sgd")]
        if len(multi_tensor_sgd_row):
            df = df[
                ~df["reason"].str.contains("_multi_tensor_sgd")
            ]  # Drop all sgd rows
            df = pd.concat(
                [df, pd.DataFrame([multi_tensor_sgd_row.iloc[0]])], axis=0
            )  # Add back a single row
        df.to_csv(f"{_strip_csv_suffix(log_file)}_deduped.csv", index=False)
448
def print_summary(filename, print_dataframe=False):
    """Print per-tag summary tables for a results CSV, then summarize graph breaks."""
    if not (filename and os.path.exists(filename)):
        return
    data = pd.read_csv(filename)
    if "tag" not in data.columns:
        print_summary_table(data, print_dataframe=print_dataframe)
    else:
        for tag in data.tag.unique():
            if tag == "0.0000":
                continue  # This happens for failed runs
            print(f"\nSummary for tag={tag}:")
            print_summary_table(data[data.tag == tag], print_dataframe=print_dataframe)
    summarize_graph_break(filename)
463
def print_summary_table(data, print_dataframe=False):
    """Print aggregate statistics for each column of a benchmark results frame.

    Known metric columns get dedicated formatting; every other column is
    reported as gmean/mean. Per-column formatting failures are skipped so a
    malformed column cannot break the summary (best-effort reporting).
    """
    if print_dataframe:
        pd.options.display.max_rows = 1000
        pd.options.display.max_columns = 1000
        pd.options.display.width = 2000
        print(data)
    width = max(map(len, data.columns))
    for col in data.columns:
        try:
            if col in ("dev", "name", "batch_size", "tag"):
                continue
            elif col in ("pct_ops", "pct_time"):
                print(col.ljust(width), f"{data[col].mean():.3%}")
            elif col in ("graphs", "graph_calls", "captured_ops", "total_ops"):
                print(col.ljust(width), f"{data[col].mean():.3f}")
            # BUG FIX: the three checks below previously read `col in ("x")`,
            # i.e. substring membership in a plain string instead of tuple
            # membership — e.g. a column named "latency" wrongly matched
            # "compilation_latency".
            elif col in ("compilation_latency",):
                print(col.ljust(width), f"mean={data[col].mean():.3f} seconds")
            elif col in ("compression_ratio",):
                print(col.ljust(width), f"mean={data[col].mean():.3f}x")
            elif col in ("accuracy",):
                pass_rate = (data[col] == "pass").mean()
                print(col.ljust(width), f"pass_rate={100*pass_rate:.2f}%")
            else:
                cdata = data[col]
                print(
                    col.ljust(width),
                    f"gmean={gmean(cdata):.2f}x mean={cdata.mean():.3f}x",
                )
        except Exception:
            # Best effort: skip columns whose data cannot be summarized.
            pass
495
def tensor_is_on_xla(tensors):
    """Return True if any tensor in the (pytree of) inputs lives on an XLA device."""
    hits = []

    def _check(t: torch.Tensor):
        if t.device.type == "xla":
            hits.append(True)

    tree_map_only(torch.Tensor, _check, tensors)
    return bool(hits)
506
def timed(
    model,
    model_iter_fn,
    example_inputs,
    times=1,
    return_result=False,
    collect_outputs=False,
):
    """Run ``model_iter_fn(model, example_inputs)`` ``times`` times and time it.

    XLA inputs are handled specially: pending graphs are flushed with
    ``xm.mark_step()`` inside the timed region and drained with
    ``xm.wait_device_ops()`` outside it, and once more at the end (its cost
    is added to the total).

    Returns total elapsed seconds, or ``(seconds, last_result)`` when
    ``return_result`` is True.
    """
    use_xla = tensor_is_on_xla(example_inputs)
    synchronize()

    if use_xla:
        # Drain anything pending before the timed region starts.
        xm.mark_step()
        xm.wait_device_ops()

    time_total = 0
    # Dont collect outputs to correctly measure timing
    for _ in range(times):
        # Put this call inside the loop to reset the seed for each iteration.
        # Don't include reset_rng_state() to correctly measure timing
        reset_rng_state(use_xla)
        t_iter_begin = time.perf_counter()
        result = model_iter_fn(model, example_inputs, collect_outputs=collect_outputs)

        # instead of calling sync on result_list, we should call mark_step.
        # In training case, result_list may be empty, but we want to
        # send all the pending graphs for compilation.
        if use_xla:
            # For the model running on regular torchxla (baseline), we need the
            # mark step to send the accumulated graph for compilation.
            #
            # For the model running with dynamo/torchxla bridge, in training case,
            # we need the mark step to send the optimizer graph out for
            # compilation.
            xm.mark_step()
        t_iter_end = time.perf_counter()
        time_total += t_iter_end - t_iter_begin

    # Account for any work still in flight (device ops / sync) in the total.
    t_0 = time.perf_counter()
    if use_xla:
        xm.wait_device_ops()
    synchronize()
    t_1 = time.perf_counter()
    time_total += t_1 - t_0
    return (time_total, result) if return_result else time_total
553
554def _normalize_bench_inputs(example_inputs) -> Tuple[Tuple[Any], Mapping[str, Any]]:555# NOTE(bowbao): For huggingface benchmark, example_inputs are formatted as dictionary,556# and consumed like `model(**example_inputs)`.557# For other benchmarks, example_inputs are formatted as tuple and consumed558# like `model(*example_inputs)`.559if isinstance(example_inputs, dict):560return (), example_inputs561else:562return tuple(example_inputs), {}563
564
565def _register_dataclass_output_as_pytree(example_outputs) -> None:566# NOTE(angelayi): For huggingface benchmark, some example outputs are567# formatted as a dataclass which pytree cannot consume. So we want568# to register the pytree implementation here569example_outputs_flat = pytree.tree_leaves(example_outputs)570output_dataclass_types = [571type(out) for out in example_outputs_flat if dataclasses.is_dataclass(type(out))572]573for output_type in output_dataclass_types:574from torch._export.utils import register_dataclass_as_pytree_node575
576register_dataclass_as_pytree_node(577output_type,578serialized_type_name=f"{output_type.__module__}.{output_type.__name__}",579)580
581
class Stats:
    """Accumulates torch._dynamo counters across runs of many models."""

    totals = collections.defaultdict(collections.Counter)

    @classmethod
    def reset_counters(cls):
        """Fold current dynamo counters into totals, clear them, and return
        the (ok, total) frame counts observed since the previous reset."""
        counters = torch._dynamo.utils.counters
        for key, counter in counters.items():
            cls.totals[key].update(counter)
        ok, total = counters["frames"]["ok"], counters["frames"]["total"]
        counters.clear()
        return ok, total

    @classmethod
    def print_summary(cls):
        """Print the 50 most common entries of every accumulated counter."""
        for key, counter in sorted(cls.totals.items()):
            lines = "\n ".join(map(str, counter.most_common(50)))
            print(f"STATS {key}\n {lines}")

    @classmethod
    def aot_summary(cls):
        """Return [total, ok] aot_autograd counts accumulated so far."""
        aot = cls.totals["aot_autograd"]
        return [aot["total"], aot["ok"]]
604
def coverage_experiment(args, model_iter_fn, model, example_inputs):
    """
    Test operator/model coverage of TorchDynamo and record statistics
    taken from a profiler. This target is mainly intended to check
    correctness.

    Writes to ./coverage.csv
    """
    profiler = Profiler()
    # Run the already-compiled artifacts without triggering new compiles.
    run_fn = torch._dynamo.run(model_iter_fn)
    with profiler.prof:
        run_fn(model, example_inputs)
    coverage_result = profiler.results()

    headers = (
        "dev",
        "name",
        "batch_size",
        "graphs",
        "graph_calls",
        "captured_ops",
        "total_ops",
        "pct_ops",
        "pct_time",
    )
    row = [current_device, current_name, current_batch_size] + coverage_result.tocsv()
    output_csv(output_filename, headers, row)
    return coverage_result
640
def speedup_experiment_fx2trt(args, model_iter_fn, model, example_inputs):
    """
    Measure speedups over eager using the trt inference backend. TRT backend is based fx graph
    generated by torch._dynamo.
    Writes to ./speedups_fx2trt.csv

    Thin alias: the measurement path is identical to ``speedup_experiment``;
    only the configured backend differs.
    """
    return speedup_experiment(args, model_iter_fn, model, example_inputs)
649
def recompile_profiler_experiment(args, model_iter_fn, model, example_inputs):
    """Profile dynamo (re)compilation behavior for a single model.

    Writes the profiler report to the results CSV and returns the number of
    guard failures as a single-element list.
    """
    prof = torch._dynamo.utils.CompilerProfiler()
    opt_iter_fn = torch._dynamo.optimize(prof, nopython=args.nopython)(model_iter_fn)
    opt_iter_fn(model, example_inputs)
    output_csv(
        output_filename, ["model", "profiler report"], [current_name, prof.report()]
    )
    profiler_metrics = prof.get_metrics()
    return [len(profiler_metrics["guard_failures"])]
663
def randomize_input(inputs):
    """Return a randomized copy of ``inputs`` (nested lists/tuples of tensors).

    Float tensors are replaced with fresh ``randn_like`` draws; int64
    tensors are returned untouched because re-drawing indices could break
    invariants between tensors. Anything else raises.
    """
    if isinstance(inputs, (list, tuple)):
        return type(inputs)([randomize_input(item) for item in inputs])
    if not isinstance(inputs, torch.Tensor):
        raise RuntimeError(
            f"randomize_input can not handle input of type {type(inputs)}"
        )
    if inputs.dtype in (torch.float32, torch.float64):
        torch._dynamo.utils.counters["randomize_input"]["times"] += 1
        return torch.randn_like(inputs)
    if inputs.dtype == torch.int64:
        # Note: we can not simply tune integer tensors as follows
        # `return torch.randint_like(inputs, high=inputs.max().item())`
        # This may break some invariants between tensors.
        # E.g. in embedding lookup case, one tensor is the length
        # and another is an indices tensor.
        return inputs
    raise RuntimeError(
        f"randomize_input need support tensor of type {inputs.dtype}"
    )
687
def maybe_mark_step(args):
    """Flush pending XLA graphs when tracing on XLA; otherwise a no-op."""
    if not args.trace_on_xla:
        return
    xm.mark_step()
692
def latency_experiment(args, model_iter_fn, model, example_inputs, mark, **kwargs):
    """
    Measure latency on a specific backend.

    Runs ``model_iter_fn`` ``args.repeat`` times (each rep timing
    ``args.iterations_per_run`` iterations) and returns per-rep wall times
    as a float64 array of shape (args.repeat,). Each rep can optionally
    randomize the inputs, is wrapped in a profiler record_function region
    named ``mark`` when profiling, and a chrome trace is exported when
    ``args.export_profiler_trace`` is set.
    """

    timings = np.zeros((args.repeat,), np.float64)
    # if we randomize the input, we should also check the result is correct
    should_randomize_input = args.randomize_input

    import contextlib

    from torch._inductor.utils import maybe_profile

    @contextlib.contextmanager
    def maybe_mark_profile(*args, **kwargs):
        # Only emit a record_function region when a profiler was created.
        prof: torch.profiler.profile = kwargs.pop("p", None)
        mark = kwargs.pop("mark", None)
        if prof:
            with torch.profiler.record_function(mark):
                yield
        else:
            yield

    times = args.iterations_per_run

    with maybe_profile(args.export_profiler_trace) as p:
        for rep in trange(args.repeat, desc="running benchmark"):
            inputs = (
                randomize_input(copy.deepcopy(example_inputs))
                if should_randomize_input
                else example_inputs
            )
            # need call mark_step to perform the computation
            # on randomize_input. Otherwise the first call using the
            # inputs will incur high penalty then the next one.
            maybe_mark_step(args)

            with maybe_mark_profile(p=p, mark=mark), maybe_enable_compiled_autograd(
                args.compiled_autograd,
                fullgraph=args.nopython,
                dynamic=args.dynamic_shapes,
            ):
                timings[rep], actual_output = timed(
                    model,
                    model_iter_fn,
                    inputs,
                    return_result=True,
                    times=times,
                    collect_outputs=args.collect_outputs,
                )

    if args.export_profiler_trace:
        name = args.profiler_trace_name + "_" + model.name
        if hasattr(args, "rank"):
            name += f"_rank_{args.rank}"
        name += ".json"
        name = os.path.join(torch._dynamo.config.base_dir, name)
        p.export_chrome_trace(name)
    return timings
753
def latency_experiment_summary(args, model, timings, **kwargs):
    """Summarize timings into the results CSV and return a speedup message.

    Also writes a ``*_compilation_metrics.csv`` companion file with dynamo
    compile-time statistics. Extra kwargs (tag, compilation_latency,
    compression_ratio, peak-memory stats, dynamo_stats, ...) are appended
    as additional CSV columns.

    NOTE(review): this indexes ``median[0]``/``median[1]``, which assumes
    ``timings`` is 2-D with eager times in column 0 and backend times in
    column 1 — but ``latency_experiment`` above returns a 1-D array.
    Confirm the caller stacks eager/backend timings before calling this.
    """
    median = np.median(timings, axis=0)
    speedup = median[0] / median[1]
    if args.dump_raw_metrics:
        np.save(
            f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy",
            timings,
        )

    first_headers = ["dev", "name", "batch_size"]
    first_fields = [current_device, current_name, current_batch_size]
    if "tag" in kwargs:
        first_headers.append("tag")
        first_fields.append(kwargs["tag"])
    headers = first_headers + ["speedup", "abs_latency"]
    # abs_latency is reported in milliseconds.
    row = first_fields + [float(speedup), median[1] * 1000]
    msg = f"{speedup:.3f}x"
    if args.baseline:
        headers.extend(
            [
                "baseline",
                "speedup_vs_baseline",
            ]
        )
        df = pd.read_csv(args.baseline)
        try:
            baseline_speedup = df[df["name"] == current_name]["speedup"].item()
            row.extend([baseline_speedup, speedup / baseline_speedup])
            msg = f"{baseline_speedup:.3f}x -> {speedup:.3f}x [{speedup / baseline_speedup:.3f}x]"
        except (KeyError, ZeroDivisionError):
            # Baseline file lacks this model (or has zero speedup): record zeros.
            row.extend(
                [
                    0.0,
                    0.0,
                ]
            )
    if "compilation_latency" in kwargs:
        headers += [
            "compilation_latency",
            "compression_ratio",
            "eager_peak_mem",
            "dynamo_peak_mem",
        ]
        row.append(kwargs["compilation_latency"])
        row.append(kwargs["compression_ratio"])
        row.append(kwargs["eager_peak_mem"])
        row.append(kwargs["dynamo_peak_mem"])

    if "cache_lookup_latency" in kwargs:
        headers.append("cache_lookup_latency")
        row.append(kwargs["cache_lookup_latency"])

    if "dynamo_stats" in kwargs:
        for k, v in kwargs["dynamo_stats"].items():
            headers.append(k)
            row.append(v)
    output_csv(
        output_filename,
        headers,
        row,
    )
    headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True)
    assert (
        output_filename.find(".csv") > 0
    ), f"expected output_filename to be a .csv, but got {output_filename}"
    output_csv(
        output_filename[:-4] + "_compilation_metrics.csv",
        first_headers + headers,
        first_fields + data,
    )
    return msg
826
def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
    """
    Measure speedups over eager.

    Writes to ./speedups.csv

    Interleaves an eager run (column 0) and a compiled run (column 1) on
    every rep to reduce the impact of frequency scaling and background
    load, then reports the ratio of per-column medians. Also writes a
    ``*_compilation_metrics.csv`` companion file.
    """
    # if args.dynamic_shapes:
    #     return speedup_experiment_ds(args, model_iter_fn, model, example_inputs)

    timings = np.zeros((args.repeat, 2), np.float64)
    # if we randomize the input, we should also check the result is correct
    should_randomize_input = args.randomize_input

    import contextlib

    from torch._inductor.utils import maybe_profile

    @contextlib.contextmanager
    def maybe_mark_profile(*args, **kwargs):
        # Only emit a record_function region when a profiler was created.
        prof: torch.profiler.profile = kwargs.pop("p", None)
        mark = kwargs.pop("mark", None)
        if prof:
            with torch.profiler.record_function(mark):
                yield
        else:
            yield

    times = args.iterations_per_run

    # Use higher tolerance for XLA since XLA cause numerical unstability when
    # graph size changes
    tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4
    torch._dynamo.config.repro_tolerance = tolerance

    with maybe_profile(args.export_profiler_trace) as p:
        if args.export_aot_inductor:
            frozen_model_iter_fn = export_aot_inductor(
                model, example_inputs, args.devices[0]
            )
        else:
            frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)

        for rep in trange(args.repeat, desc="running benchmark"):
            inputs = (
                randomize_input(copy.deepcopy(example_inputs))
                if should_randomize_input
                else example_inputs
            )
            # need call mark_step to perform the computation
            # on randomize_input. Otherwise the first call using the
            # inputs will incur high penalty then the next one.
            maybe_mark_step(args)

            # interleave the runs to handle frequency scaling and load changes
            with maybe_mark_profile(p=p, mark="expected"):
                timings[rep, 0], expected_output = timed(
                    model,
                    model_iter_fn,
                    inputs,
                    return_result=True,
                    times=times,
                    collect_outputs=args.collect_outputs,
                )

            # call mark_step between the 2 calls to make the comparison fair.
            maybe_mark_step(args)

            with maybe_mark_profile(p=p, mark="actual"), maybe_enable_compiled_autograd(
                args.compiled_autograd,
                fullgraph=args.nopython,
                dynamic=args.dynamic_shapes,
            ):
                timings[rep, 1], actual_output = timed(
                    model,
                    frozen_model_iter_fn,
                    inputs,
                    return_result=True,
                    times=times,
                    collect_outputs=args.collect_outputs,
                )

    if args.export_profiler_trace:
        name = args.profiler_trace_name + "_" + model.name
        if hasattr(args, "rank"):
            name += f"_rank_{args.rank}"
        name += ".json"
        name = os.path.join(torch._dynamo.config.base_dir, name)
        p.export_chrome_trace(name)
    median = np.median(timings, axis=0)
    speedup = median[0] / median[1]
    if args.dump_raw_metrics:
        np.save(
            f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy",
            timings,
        )

    first_headers = ["dev", "name", "batch_size"]
    first_fields = [current_device, current_name, current_batch_size]
    if "tag" in kwargs:
        first_headers.append("tag")
        first_fields.append(kwargs["tag"])
    headers = first_headers + ["speedup", "abs_latency"]
    # abs_latency is reported in milliseconds.
    row = first_fields + [float(speedup), median[1] * 1000]
    msg = f"{speedup:.3f}x"
    if args.baseline:
        headers.extend(
            [
                "baseline",
                "speedup_vs_baseline",
            ]
        )
        df = pd.read_csv(args.baseline)
        try:
            baseline_speedup = df[df["name"] == current_name]["speedup"].item()
            row.extend([baseline_speedup, speedup / baseline_speedup])
            msg = f"{baseline_speedup:.3f}x -> {speedup:.3f}x [{speedup / baseline_speedup:.3f}x]"
        except (KeyError, ZeroDivisionError):
            # Baseline file lacks this model (or has zero speedup): record zeros.
            row.extend(
                [
                    0.0,
                    0.0,
                ]
            )
    if "compilation_latency" in kwargs:
        headers += [
            "compilation_latency",
            "compression_ratio",
            "eager_peak_mem",
            "dynamo_peak_mem",
        ]
        row.append(kwargs["compilation_latency"])
        row.append(kwargs["compression_ratio"])
        row.append(kwargs["eager_peak_mem"])
        row.append(kwargs["dynamo_peak_mem"])

    if "cache_lookup_latency" in kwargs:
        headers.append("cache_lookup_latency")
        row.append(kwargs["cache_lookup_latency"])

    if "dynamo_stats" in kwargs:
        for k, v in kwargs["dynamo_stats"].items():
            headers.append(k)
            row.append(v)
    output_csv(
        output_filename,
        headers,
        row,
    )
    headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True)
    assert (
        output_filename.find(".csv") > 0
    ), f"expected output_filename to be a .csv, but got {output_filename}"
    output_csv(
        output_filename[:-4] + "_compilation_metrics.csv",
        first_headers + headers,
        first_fields + data,
    )
    return msg
986
def speedup_experiment_ds(args, model_iter_fn, model, example_inputs):
    """
    Run dynamic shapes benchmarks.

    Requires dynamic shape compatible models, which provide a list of example inputs.

    Warms up using the first input example and then iterates the inputs,
    measuring (and expecting minimal) variance between the runtime for different examples.

    NOTE(review): ``optimize_ctx`` is not defined anywhere in this module's
    visible scope, so this function would raise NameError if called — its
    only call site (in speedup_experiment) is commented out. Confirm before
    re-enabling.
    """
    timings = np.zeros((args.repeat, len(example_inputs), 2), np.float64)

    if args.repeat > 5:
        print(
            f"\ndynamic shapes experiments are slow, consider setting --repeat less than {args.repeat}\n"
        )

    nwarmup = 4
    for rep in range(args.repeat):
        # Start each rep fresh, e.g. only warmup on example 0
        torch._dynamo.reset()
        optimized_model_iter_fn = optimize_ctx(model_iter_fn)
        for _ in range(nwarmup):
            optimized_model_iter_fn(model, example_inputs[0])

        for input_idx, inputs in enumerate(example_inputs):
            # interleave the runs to handle frequency scaling and load changes
            timings[rep, input_idx, 0] = timed(
                model, model_iter_fn, inputs, return_result=False
            )
            # different from regular speedup_experiment, we _DO_ want to allow recompilation
            timings[rep, input_idx, 1] = timed(
                model, optimized_model_iter_fn, inputs, return_result=False
            )
    medians = np.median(timings, axis=0)
    speedups = list(medians[:, 0] / medians[:, 1])
    speedups_mean = np.mean(speedups)
    speedups_median = np.median(speedups)
    speedups_var = np.var(speedups)

    # TODO this x[0] is not going to work in general but bert only has 1 input
    shapes = [x[0].shape for x in example_inputs]
    shape_keys = sorted(set(shapes))
    # Group per-example speedups by input shape for the report.
    shape_speedups = {
        shape: [
            it[1] for it in filter(lambda it: it[0] == shape, zip(shapes, speedups))
        ]
        for shape in shape_keys
    }
    output_str = (
        f"mean: {speedups_mean:.3f}, median: {speedups_median:.3f}, var: {speedups_var:.3f}"
        + "\nSpeedups by shape: "
        + "\n".join(
            [
                f"{shape}: "
                + ", ".join([f"{speedup: .3g}" for speedup in shape_speedups[shape]])
                for shape in shape_keys
            ]
        )
    )
    output_csv(
        output_filename,
        ("dev", "name", "batch_size", "speedup mean", "speedup median", "speedup var"),
        [
            current_device,
            current_name,
            current_batch_size,
            speedups_mean,
            speedups_median,
            speedups_var,
        ],
    )
    return output_str
1061
@contextlib.contextmanager
def override_synchronize_with_onnx_iobinding(iobinding):
    """Temporarily rebind the global ``synchronize`` hook to flush ORT iobinding.

    When ``iobinding`` is None the hook is left as-is. The previous hook is
    always restored on exit, even if the body raises.
    """
    global synchronize
    saved_synchronize = synchronize
    try:
        if iobinding is not None:

            def _sync_iobinding():
                iobinding.synchronize_inputs()
                iobinding.synchronize_outputs()

            synchronize = _sync_iobinding
        yield
    finally:
        synchronize = saved_synchronize
1078
1079def speedup_experiment_onnx(1080args,1081model_iter_fn,1082onnx_model: OnnxModel,1083model,1084example_inputs,1085**kwargs,1086):1087"""1088Measure speedups over eager.
1089
1090This function is responsible for the following:
10911. Creating iobinding with OnnxModel if device is CUDA, which is essential for perf measurement.
10922. Running ORT with OnnxModel.
1093
1094Writes to ./{output_filename}, which should be
1095`Path(self.output_dir) / f"{self.compiler}_{suite}_{self.dtype}_{self.mode}_{self.device}_{self.testing}.csv".
1096
1097TODO(bowbao): Record export time and export peak memory usage.
1098"""
1099timings = np.zeros((args.repeat, 2), np.float64)1100is_correct = True1101should_randomize_input = args.randomize_input1102times = args.iterations_per_run1103
1104def create_onnx_input_binded_fn(onnx_model: OnnxModel, pt_inputs, example_outputs):1105# Goal is to move the iobinding creation outside of the timer function.1106iobinding, outputs = onnx_model.create_iobinding(pt_inputs, example_outputs)1107
1108def onnxrt_model_iter_fn(model, inputs, collect_outputs=True):1109onnx_model.run_with_iobinding(iobinding, outputs)1110if collect_outputs:1111return outputs1112
1113return onnxrt_model_iter_fn, iobinding1114
1115def create_onnx_fn(onnx_model: OnnxModel, pt_inputs):1116# NOTE: Making perf comparison fair by moving out the i/o adapting part.1117# 1. Pre-adapt `pt_inputs` to `onnx_inputs` here.1118# 2. Drop `onnx_outputs` to `pt_outputs` adapting. Output comparison is not part of perf measurement.1119onnx_inputs = onnx_model.adapt_pt_inputs_to_onnx(pt_inputs)1120
1121def onnxrt_model_iter_fn(model, inputs, collect_outputs=True):1122return onnx_model.run_with_onnx_inputs(onnx_inputs)1123
1124return onnxrt_model_iter_fn1125
1126def timed_onnx(model, onnx_model: OnnxModel, inputs):1127if current_device == "cpu" or onnx_model.is_cpu():1128onnxrt_model_iter_fn = create_onnx_fn(onnx_model, inputs)1129iobinding = None1130else:1131onnxrt_model_iter_fn, iobinding = create_onnx_input_binded_fn(1132onnx_model, inputs, expected_output1133)1134with override_synchronize_with_onnx_iobinding(iobinding):1135return timed(1136model,1137onnxrt_model_iter_fn,1138inputs,1139return_result=True,1140times=times,1141collect_outputs=args.collect_outputs,1142)1143
1144# Insert ONNX warm-up1145inputs = (1146randomize_input(copy.deepcopy(example_inputs))1147if should_randomize_input1148else example_inputs1149)1150_, expected_output = timed(1151model,1152model_iter_fn,1153inputs,1154return_result=True,1155times=times,1156collect_outputs=args.collect_outputs,1157)1158for _ in range(2):1159timed_onnx(model, onnx_model, inputs)1160
1161for rep in range(args.repeat):1162inputs = (1163randomize_input(copy.deepcopy(example_inputs))1164if should_randomize_input1165else example_inputs1166)1167if torch.cuda.device_count() > 1:1168# Manually set correct torch.cuda.current_device to ensure torch.cuda.synchronize() works as intended.1169# When there are more than 1 cuda devices, the first one is used for pytorch eager.1170# The second one is used for onnx ort.1171torch.cuda.set_device(0)1172timings[rep, 0], expected_output = timed(1173model,1174model_iter_fn,1175inputs,1176return_result=True,1177times=times,1178collect_outputs=args.collect_outputs,1179)1180if torch.cuda.device_count() > 1:1181# Manually set correct torch.cuda.current_device to ensure torch.cuda.synchronize() works as intended.1182# When there are more than 1 cuda devices, the first one is used for pytorch eager.1183# The second one is used for onnx ort.1184torch.cuda.set_device(1)1185timings[rep, 1], actual_output = timed_onnx(model, onnx_model, inputs)1186
1187pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue1188median = np.median(timings, axis=0)1189speedup = median[0] / median[1]1190if args.dump_raw_metrics:1191np.save(1192f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy",1193timings,1194)1195
1196headers = ["dev", "name", "batch_size", "speedup", "abs_latency"]1197row = [1198current_device,1199current_name,1200current_batch_size,1201float(speedup),1202median[1] * 1000,1203]1204if "compilation_latency" in kwargs:1205headers = headers + ["compilation_latency", "compression_ratio"]1206row.append(kwargs["compilation_latency"])1207row.append(kwargs["compression_ratio"])1208
1209output_csv(1210output_filename,1211headers,1212row,1213)1214headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True)1215assert (1216output_filename.find(".csv") > 01217), f"expected output_filename to be a .csv, but got {output_filename}"1218output_csv(1219output_filename[:-4] + "_compilation_metrics.csv",1220["dev", "name", "batch_size"] + headers,1221[current_device, current_name, current_batch_size] + data,1222)1223return format_speedup(speedup, pvalue, is_correct=is_correct)1224
1225
def overhead_experiment(*args, model_iter_fn):
    """Measure TorchDynamo overhead by running with no real backend (only
    eager + FX passthrough) and reporting speedup/slowdown over plain eager.

    Writes to ./overheads.csv
    """
    # Delegate to the generic speedup harness; the no-op backend is selected
    # by the caller's configuration, so no extra work is needed here.
    result = speedup_experiment(*args, model_iter_fn)
    return result
1235
def print_fx(gm, example_inputs):
    """Debug backend: dump the FX graph of *gm* to stdout and return *gm* unchanged."""
    graph = gm.graph
    print(graph)
    return gm
1240
def print_aten_ops(gm, example_inputs):
    """Debug backend: print the ATen-level forward and backward graphs
    produced by AOTAutograd, compiling with identity compilers."""
    from functorch.compile import aot_module

    def _dump_graph(traced_gm, _):
        # Print the lowered graph and hand it back unmodified so execution
        # proceeds as usual.
        print(traced_gm.graph)
        return traced_gm

    return aot_module(gm, fw_compiler=_dump_graph, bw_compiler=_dump_graph)
1250
def baselines(models, model_iter_fn, example_inputs, args):
    """
    Common measurement code across all baseline experiments.

    `models` is a sequence of (name, model) pairs; the first entry is treated
    as the reference. Each other model is first checked for correctness
    against the reference output, then all models are timed `args.repeat`
    times. Per-model speedups (reference median / model median) are appended
    to the global `output_filename` CSV, and a formatted speedup string is
    returned.
    """
    models = list(models)
    for idx, (name, model) in enumerate(models):
        if idx == 0:
            # First model is the reference; its output defines "correct".
            result0 = model_iter_fn(model, example_inputs)
        elif model is not None:
            try:
                result = model_iter_fn(model, example_inputs)
                if same(result0, result):
                    continue
                print(name, "is INCORRECT")
            except Exception:
                log.exception("error checking %s", name)
            # Incorrect or crashing models are disabled (replaced with None)
            # so they are skipped during timing below.
            models[idx] = (name, None)
    # Pre-fill with a huge sentinel so failed runs don't look fast.
    timings = np.zeros((args.repeat, len(models)), np.float64)
    timings.fill(1.0e10)
    for rep in range(args.repeat):
        for idx, (name, model) in enumerate(models):
            if model is not None:
                try:
                    timings[rep, idx] = timed(model, model_iter_fn, example_inputs)
                except Exception:
                    pass
    # t-test of each candidate against the reference column.
    pvalue = [
        ttest_ind(timings[:, 0], timings[:, i]).pvalue
        for i in range(1, timings.shape[1])
    ]
    median = np.median(timings, axis=0)
    speedup = median[0] / median[1:]
    for idx, (name, model) in enumerate(models[1:]):
        if model is None:
            # Disabled models get a 0.0 speedup marker.
            speedup[idx] = 0.0
    result = " ".join(
        [
            format_speedup(s, p, m is not None)
            for s, p, m in zip(speedup, pvalue, [m for n, m in models[1:]])
        ]
    )
    output_csv(
        output_filename,
        ("dev", "name", "batch_size") + tuple(n for n, m in models[1:]),
        [current_device, current_name, current_batch_size]
        + [f"{x:.4f}" for x in speedup],
    )
    return result
1300
def xla(args, model_iter_fn, model, example_inputs):
    """Benchmark eager vs. torch_xla for the same model/inputs.

    A deep copy of the model and its tensor inputs is moved to an XLA device
    (via CPU to avoid device-to-device transfer issues), both variants are
    warmed up, then timed `args.repeat` times. Medians, speedup, and a t-test
    p-value are computed; a row is appended to the global `output_filename`
    CSV and a formatted speedup string is returned.
    """
    xla_dev = xm.xla_device(devkind=current_device)
    model_xla = copy.deepcopy(model).to("cpu").to(device=xla_dev)
    example_inputs_xla = tree_map_only(
        torch.Tensor, lambda x: x.to("cpu").to(device=xla_dev), example_inputs
    )
    for _ in range(3):  # warmup
        timed(model, model_iter_fn, example_inputs)
        timed(model_xla, model_iter_fn, example_inputs_xla)
    # Pre-fill with a huge sentinel so a missed measurement doesn't look fast.
    timings = np.zeros((args.repeat, 2), np.float64)
    timings.fill(1.0e10)
    for rep in range(args.repeat):
        timings[rep, 0] = timed(model, model_iter_fn, example_inputs)
        timings[rep, 1] = timed(model_xla, model_iter_fn, example_inputs_xla)

    pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue
    time_baseline, time_xla = np.median(timings, axis=0)
    speedup = time_baseline / time_xla
    output_csv(
        output_filename,
        ("dev", "name", "batch_size", "speedup", "time_baseline", "time_xla"),
        [
            current_device,
            current_name,
            current_batch_size,
            speedup,
            time_baseline,
            time_xla,
        ],
    )
    return format_speedup(speedup, pvalue)
1333
def try_script(model, example_inputs):
    """Best-effort TorchScript compilation.

    Returns the scripted module, or None when scripting fails for any reason.
    `example_inputs` is accepted for interface parity with other backends but
    is not used.
    """
    try:
        scripted = torch.jit.script(model)
    except Exception:
        return None
    return scripted
1340
class AOTInductorModelCache:
    """Process-wide cache of AOTInductor-compiled models, keyed by weak
    reference to the eager model so entries don't keep models alive."""

    cache = {}

    @classmethod
    def load(cls, model, example_inputs, device):
        """Return a compiled AOTInductor runner for `model`, compiling (and
        caching) it on first use.

        The model is exported with pre-dispatch, non-strict `torch.export`,
        AOT-compiled by Inductor to a shared object, and loaded back with
        `torch._export.aot_load` for `device`.
        """
        import torch._inductor
        import torch.export._trace

        key = weakref.ref(model)
        if key not in cls.cache:
            # Register the output dataclass to pytree
            example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
            with torch.no_grad():
                # copy.deepcopy is required to prevent any surprising side-effect,
                # see https://github.com/pytorch/pytorch/issues/113029
                example_outputs = copy.deepcopy(model)(*example_args, **example_kwargs)

            if pytree._is_namedtuple_instance(example_outputs):
                # Namedtuple outputs need explicit pytree registration for export.
                typ = type(example_outputs)
                pytree._register_namedtuple(
                    typ,
                    serialized_type_name=f"{typ.__module__}.{typ.__name__}",
                )
            else:
                _register_dataclass_output_as_pytree(example_outputs)

            gm = torch.export._trace._export(
                model,
                example_args,
                example_kwargs,
                pre_dispatch=True,
                strict=False,
            ).module()
            with torch.no_grad():
                so_path = torch._inductor.aot_compile(
                    gm, example_args, example_kwargs
                )  # type: ignore[arg-type]

            cls.cache[key] = torch._export.aot_load(so_path, device)

        return cls.cache[key]
1383
def export(model, example_inputs):
    """torch.export backend: export the model once up front and return a
    callable that replays the exported program on fresh inputs."""
    args, kwargs = _normalize_bench_inputs(example_inputs)
    # Run the eager model once so its (possibly dataclass) output type is
    # registered with pytree before export.
    outputs = model(*args, **kwargs)
    _register_dataclass_output_as_pytree(outputs)

    exported_program = torch.export.export(model, args, kwargs)

    def opt_export(_, example_inputs):
        # Re-normalize per call: the benchmark may feed fresh inputs each iteration.
        call_args, call_kwargs = _normalize_bench_inputs(example_inputs)
        return exported_program.module()(*call_args, **call_kwargs)

    return opt_export
1397
def export_aot_inductor(model, example_inputs, device):
    """AOTInductor backend: compile (or fetch from the cache) and return a
    runner with the standard benchmark iteration signature."""
    compiled = AOTInductorModelCache.load(model, example_inputs, device)

    def opt_aot_inductor(_, example_inputs, collect_outputs=False):
        call_args, call_kwargs = _normalize_bench_inputs(example_inputs)
        return compiled(*call_args, **call_kwargs)

    return opt_aot_inductor
1407
def download_retry_decorator(download_fn):
    """
    Decorator applying retry logic to a download function.

    The wrapped function is called up to MAX_DOWNLOAD_ATTEMPTS times; if every
    attempt fails, a RuntimeError chained to the last error is raised. After
    each unsuccessful attempt there is a delay before the next one, increased
    linearly with the attempt number.

    Usage:
    @download_retry_decorator
    def download_function(model_name: str):
        # download logic goes here
    """

    @functools.wraps(download_fn)
    def wrapper(self, *args, **kwargs) -> Any:
        total_allowed_tries = MAX_DOWNLOAD_ATTEMPTS
        # Original code looped `while tries <= total_allowed_tries` with a
        # post-increment, making MAX_DOWNLOAD_ATTEMPTS + 1 calls; this makes
        # exactly MAX_DOWNLOAD_ATTEMPTS as documented.
        for tries in range(1, total_allowed_tries + 1):
            try:
                return download_fn(self, *args, **kwargs)
            except Exception as e:
                if tries < total_allowed_tries:
                    # Linear backoff: 30s, 60s, 90s, ...
                    wait = tries * 30
                    print(
                        f"Failed to load model: {e}. Trying again ({tries}/{total_allowed_tries}) after {wait}s"
                    )
                    time.sleep(wait)
                else:
                    raise RuntimeError(
                        f"Failed to load model '{args}' with following error(s): {str(e)}."
                    ) from e

    return wrapper
1444
class OnnxModel(abc.ABC):
    # Mapping used when binding torch tensors into ORT iobindings; note
    # torch.int64 maps to np.longlong to match ORT's expected element type.
    TORCH_TO_NUMPY_DTYPE = {
        torch.float16: np.float16,
        torch.float32: np.float32,
        torch.float64: np.float64,
        torch.uint8: np.uint8,
        torch.int8: np.int8,
        torch.int16: np.int16,
        torch.int32: np.int32,
        torch.int64: np.longlong,
        torch.bool: np.bool_,
    }

    # Set by concrete subclasses; used in output paths and file names.
    _COMPILER_NAME: str

    def __init__(
        self,
        output_directory,
        model,
        example_inputs,
        dynamic_shapes: bool,
        copy_before_export: bool = False,
        use_experimental_patch: bool = False,
    ):
        """The abstract class for exporting ONNX model.

        Args:
            output_directory: output path
            model: model
            example_inputs: example inputs for exporting
            dynamic_shapes (bool): Whether to export the model with dynamic shapes.
            copy_before_export (bool,): copy before export. Defaults to False.
            use_experimental_patch (bool): Whether to apply torch_onnx patch which exports
                with torch.export and onnx ir. Defaults to False.
        """
        # `current_name` is a module-level global identifying the benchmark model.
        model_name = current_name
        self.copy_before_export = copy_before_export
        self.use_experimental_patch = use_experimental_patch
        # NOTE: torch_onnx patch is using OnnxModelFromTorchScript to export ONNX model.
        if self.use_experimental_patch:
            self._COMPILER_NAME = "torch_onnx_patch"
        self.model_dir = self._generate_onnx_model_directory(
            output_directory, self._COMPILER_NAME, model_name
        )
        self.model_path = str(
            self.model_dir / f"{model_name}_{self._COMPILER_NAME}.onnx"
        )

    def _determine_deepcopy_target_device(self):
        """Pick the device to deep-copy the model to before export:
        cpu on cpu runs; cuda:1 when a second GPU is available (to dodge OOM),
        otherwise cuda."""
        if current_device == "cpu":
            target_device = "cpu"
        else:
            if torch.cuda.device_count() > 1:
                # Copy to another cuda device to avoid OOM.
                target_device = "cuda:1"
            else:
                target_device = "cuda"
        return target_device

    def deepcopy_model_and_inputs_to_device(self, model, example_inputs, target_device):
        # Deepcopy model before export to avoid modification to baseline model.
        # To avoid OOM, the model is first moved to CPU. Both models are then moved to device.
        model_device = next(model.parameters()).device
        model.to("cpu")
        model_copy = copy.deepcopy(model).to(target_device)
        # Restore the original model to its initial device.
        model.to(model_device)

        target_device_example_inputs = tree_map_only(
            torch.Tensor, lambda x: x.to(device=target_device), example_inputs
        )

        return model_copy, target_device_example_inputs

    @classmethod
    def _generate_onnx_model_directory(
        cls, output_directory: str, compiler_name: str, model_name: str
    ) -> Path:
        """Create (wiping any previous contents) and return the directory
        `<output_directory>/.onnx_models/<model_name>/<compiler_name>`."""
        model_path = Path(
            output_directory,
            ".onnx_models",
            model_name,
            compiler_name,
        )
        if model_path.exists() and model_path.is_dir():
            shutil.rmtree(model_path)
        model_path.mkdir(parents=True, exist_ok=True)
        return model_path

    @abc.abstractmethod
    def format_pt_inputs(self, pt_inputs: Any) -> Sequence[torch.Tensor]:
        ...

    @abc.abstractmethod
    def format_pt_outputs(self, pt_outputs: Any) -> Sequence[torch.Tensor]:
        ...

    def adapt_pt_inputs_to_onnx(self, pt_inputs) -> Mapping[str, np.ndarray]:
        """Convert torch inputs to the {ort_input_name: numpy array} dict ORT expects."""
        pt_inputs = self.format_pt_inputs(pt_inputs)
        return {
            ort_input.name: pt_input.cpu().numpy()
            for ort_input, pt_input in zip(self.onnx_session.get_inputs(), pt_inputs)
        }

    def adapt_onnx_outputs_to_pt(self, onnx_outputs: List[np.ndarray]) -> Any:
        """Convert ORT numpy outputs back to torch tensors on `current_device`;
        a single output is unwrapped from its list."""
        pt_outputs = [
            torch.from_numpy(onnx_output).to(current_device)
            for onnx_output in onnx_outputs
        ]
        if len(pt_outputs) == 1:
            return pt_outputs[0]
        return pt_outputs

    def _init_ort_session(self, model_path: str):
        """Create an onnxruntime InferenceSession for `self.model_path`,
        choosing CPU or CUDA providers based on the global `current_device`."""
        import onnxruntime

        if current_device == "cpu":
            ort_providers = ["CPUExecutionProvider"]
        else:
            # NOTE(bowbao): Reduce OOM by running ORT on another gpu.
            # TODO(bowbao): This works to avoid OOM, but performance is surprisingly very bad.
            cuda_provider_options = {
                "device_id": 1 if torch.cuda.device_count() > 1 else 0,
            }
            ort_providers = [("CUDAExecutionProvider", cuda_provider_options)]
        session_options = onnxruntime.SessionOptions()
        session_options.log_severity_level = 3  # Error

        ort_session = onnxruntime.InferenceSession(
            self.model_path,
            providers=ort_providers,
            sess_options=session_options,
        )
        return ort_session

    def is_cpu(self) -> bool:
        """True when the session's primary execution provider is CPU."""
        return self.onnx_session.get_providers()[0] == "CPUExecutionProvider"

    def cpu(self) -> Self:
        """Force the session onto the CPU execution provider; returns self."""
        self.onnx_session.set_providers(["CPUExecutionProvider"])
        return self

    def create_outputs(self, *example_outputs):
        """Allocate uninitialized output tensors matching the example outputs."""
        return tuple(torch.empty_like(x) for x in example_outputs)

    def create_iobinding(self, pt_inputs, example_outputs):
        """Bind input/output tensor buffers directly into an ORT iobinding so
        perf runs avoid host<->device copies; returns (iobinding, outputs)."""
        pt_inputs = self.format_pt_inputs(pt_inputs)
        example_outputs = self.format_pt_outputs(example_outputs)

        iobinding = self.onnx_session.io_binding()
        args = [arg.contiguous() for arg in pt_inputs]
        for ort_input, arg in zip(self.onnx_session.get_inputs(), args):
            # NOTE: Run ORT on another cuda device to reduce OOM.
            if torch.cuda.device_count() > 1:
                arg = arg.detach().to("cuda:1")
            device = arg.device
            iobinding.bind_input(
                ort_input.name,
                device.type,
                device.index or 0,
                self.TORCH_TO_NUMPY_DTYPE[arg.dtype],
                arg.size(),
                arg.data_ptr(),
            )

        outputs = self.create_outputs(*example_outputs)
        for ort_output, output in zip(self.onnx_session.get_outputs(), outputs):
            if torch.cuda.device_count() > 1:
                output = output.detach().to("cuda:1")
            device = output.device
            iobinding.bind_output(
                ort_output.name,
                device.type,
                device.index or 0,
                self.TORCH_TO_NUMPY_DTYPE[output.dtype],
                output.size(),
                output.data_ptr(),
            )
        return iobinding, outputs

    def run_with_iobinding(self, iobinding, outputs):
        # 'outputs' are torch empty tensors binded to 'iobinding'.
        self.onnx_session.run_with_iobinding(iobinding)
        return outputs

    def run_with_onnx_inputs(self, onnx_inputs):
        """Plain ORT run with numpy inputs (includes host<->device copies)."""
        return self.onnx_session.run(None, onnx_inputs)

    @classmethod
    def save_tensor_data(cls, numpy_tensor, output_path):
        """Serialize a numpy tensor to an ONNX TensorProto .pb file."""
        from onnx import numpy_helper

        proto_tensor = numpy_helper.from_array(numpy_tensor)
        with open(output_path, "wb") as f:
            f.write(proto_tensor.SerializeToString())

    def run_and_serialize_inputs_outputs(self, pt_inputs):
        """Run ORT once and dump inputs/outputs as .pb files under
        `<model_dir>/test_data_set_0` for offline analysis; returns torch outputs."""
        test_data_dir = self.model_dir / "test_data_set_0"
        test_data_dir.mkdir(parents=True, exist_ok=True)

        onnx_inputs = self.adapt_pt_inputs_to_onnx(pt_inputs)
        for i, onnx_input in enumerate(onnx_inputs.values()):
            self.save_tensor_data(onnx_input, str(test_data_dir / f"input_{i}.pb"))

        onnx_outputs = self.run_with_onnx_inputs(onnx_inputs)

        for i, onnx_output in enumerate(onnx_outputs):
            self.save_tensor_data(onnx_output, str(test_data_dir / f"output_{i}.pb"))

        return self.adapt_onnx_outputs_to_pt(onnx_outputs)

    def run(self, pt_inputs):
        # NOTE: For CUDA performance testing, use `run_with_iobinding` to exclude memory
        # copying overhead for inputs/outputs between cpu and gpu.
        # Otherwise perf number is inaccurate.
        onnx_inputs = self.adapt_pt_inputs_to_onnx(pt_inputs)
        onnx_outputs = self.run_with_onnx_inputs(onnx_inputs)
        return self.adapt_onnx_outputs_to_pt(onnx_outputs)
1663
class OnnxModelFromTorchScript(OnnxModel):
    """TorchScript based onnx export. `torch.onnx.export`

    TODO(bowbao):
    * large model export failed.
        Onnx Model is larger than 2GB, but exporter makes decision based pt model size, which is
        smaller than 2GB.
    * OOM on slightly larger model.
        Both pt model and ort inference session are on gpu. Attempt has been made to move ORT to
        cuda:1, however ORT perf drop significantly.
        For now running everything with batch_size 1 set in launch script.
    """

    _COMPILER_NAME = "torchscript"

    def __init__(
        self, output_directory, model, example_inputs, dynamic_shapes: bool, **kwargs
    ):
        # Dynamic shapes are not supported by this exporter path.
        if dynamic_shapes:
            raise NotImplementedError("NYI dynamic shapes for OnnxModelFromTorchScript")
        super().__init__(
            output_directory, model, example_inputs, dynamic_shapes, **kwargs
        )
        self._export(
            model,
            example_inputs,
            self.model_path,
            opset_version=17,
            do_constant_folding=False,
            verbose=False,
        )
        self.onnx_session = self._init_ort_session(self.model_path)

    def _export(self, model, example_inputs, output_path: str, /, **kwargs) -> None:
        """Export `model` to ONNX at `output_path` via `torch.onnx.export`,
        optionally through the experimental torch_onnx patch."""
        if self.copy_before_export:
            # Deepcopy model before export to avoid modification to baseline model.
            model, example_inputs = self.deepcopy_model_and_inputs_to_device(
                model, example_inputs, self._determine_deepcopy_target_device()
            )

        # Hack for huggingface models (kwargs only).
        if isinstance(example_inputs, dict):

            class WrapperModel(torch.nn.Module):
                # Adapts a kwargs-only model to the positional-args calling
                # convention torch.onnx.export expects.
                def __init__(self, model, keys):
                    super().__init__()
                    self.model = model
                    self.keys = keys

                def forward(self, *args):
                    return self.model(**dict(zip(self.keys, args)))

            model = WrapperModel(model, list(example_inputs.keys()))

        if self.use_experimental_patch:
            import torch_onnx

            torch_onnx.patch_torch(
                error_report=True,
                profile=True,
                dump_exported_program=True,
                artifacts_dir=os.path.dirname(output_path),
            )
        else:
            # make sure the patch is not in effect
            try:
                import torch_onnx

                torch_onnx.unpatch_torch()
            except ImportError:
                pass

        torch.onnx.export(
            model,
            self.format_pt_inputs(example_inputs),
            output_path,
            **kwargs,
        )

    def format_pt_inputs(self, pt_inputs):
        # NOTE(bowbao): For huggingface benchmark, pt_inputs are formatted as dictionary,
        # and consumed like `model(**pt_inputs)`.
        # For other benchmarks, pt_inputs are formatted as tuple and consumed
        # like `model(*pt_inputs)`.
        if isinstance(pt_inputs, dict):
            pt_inputs = list(pt_inputs.values())
        if isinstance(pt_inputs, torch.Tensor):
            pt_inputs = (pt_inputs,)
        return tuple(arg.contiguous() for arg in pt_inputs)

    def format_pt_outputs(self, pt_outputs):
        """Flatten torch model outputs to a list of tensors, unwrapping
        huggingface ModelOutput containers when transformers is available."""
        if isinstance(pt_outputs, torch.Tensor):
            pt_outputs = (pt_outputs,)

        pt_outputs = pytree.tree_leaves(pt_outputs)

        # Hack for huggingface model outputs
        try:
            from transformers import modeling_outputs
        except ImportError:
            pass
        else:

            def _to_tuple(x):
                if isinstance(x, modeling_outputs.ModelOutput):
                    return x.to_tuple()
                return x

            pt_outputs = pytree.tree_map(_to_tuple, pt_outputs)
            pt_outputs = pytree.tree_leaves(pt_outputs)

        return pt_outputs
1777
class OnnxModelFromDynamo(OnnxModel):
    """Dynamo and Fx based export. `torch.onnx.dynamo_export`."""

    _COMPILER_NAME = "dynamo"

    def __init__(
        self, output_directory, model, example_inputs, dynamic_shapes: bool, **kwargs
    ):
        super().__init__(
            output_directory, model, example_inputs, dynamic_shapes, **kwargs
        )
        self._dynamic_shapes = dynamic_shapes
        self._onnx_program = self._export(model, example_inputs, self.model_path)
        # Clear the model proto to save memory.
        # The model proto is saved to disk and no longer needed from `onnx_program`.
        # `onnx_program` is kept for i/o adapter usage.
        self._onnx_program.model_proto.Clear()
        self.onnx_session = self._init_ort_session(self.model_path)

    def _export(
        self, model, example_inputs, output_path: str
    ) -> torch.onnx.ONNXProgram:
        """Export via `torch.onnx.dynamo_export`, save to `output_path`, and
        return the ONNXProgram (needed later for input/output adapting)."""
        if self.copy_before_export:
            # Deepcopy model before export to avoid modification to baseline model.
            model, example_inputs = self.deepcopy_model_and_inputs_to_device(
                model, example_inputs, self._determine_deepcopy_target_device()
            )

        example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
        options = torch.onnx.ExportOptions(dynamic_shapes=self._dynamic_shapes)
        onnx_program = torch.onnx.dynamo_export(
            model, *example_args, **example_kwargs, export_options=options
        )

        onnx_program.save(output_path)
        return onnx_program

    def format_pt_inputs(self, pt_inputs):
        # Delegates to the ONNXProgram's input adapter.
        pt_args, pt_kwargs = _normalize_bench_inputs(pt_inputs)
        return self._onnx_program.adapt_torch_inputs_to_onnx(*pt_args, **pt_kwargs)

    def format_pt_outputs(self, pt_outputs):
        # Delegates to the ONNXProgram's output adapter.
        return self._onnx_program.adapt_torch_outputs_to_onnx(pt_outputs)
1822
class OnnxModelFromDynamoAotInline(OnnxModelFromDynamo):
    """Dynamo and Fx based export, with AOT inline post export. `torch.onnx.dynamo_export`."""

    _COMPILER_NAME = "dynamo_aot_inline"

    def _export(
        self, model, example_inputs, output_path: str
    ) -> torch.onnx.ONNXProgram:
        """Export via dynamo_export, then inline all local ONNX functions in a
        post-processing pass before the final save."""
        if self.copy_before_export:
            # Deepcopy model before export to avoid modification to baseline model.
            model, example_inputs = self.deepcopy_model_and_inputs_to_device(
                model, example_inputs, self._determine_deepcopy_target_device()
            )

        example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
        options = torch.onnx.ExportOptions(dynamic_shapes=self._dynamic_shapes)
        onnx_program = torch.onnx.dynamo_export(
            model, *example_args, **example_kwargs, export_options=options
        )
        # Apply AOT inline post export.
        # Requires onnx >= 1.15
        import onnx
        import onnx.inliner

        # Workaround for inliner not supporting with models larger than 2GB.
        # Save model to disk first separating out external data,
        # and load back without external data for inliner to work on.
        model_proto = onnx_program.model_proto
        onnx.save_model(model_proto, output_path, save_as_external_data=True)
        model_proto = onnx.load(output_path, load_external_data=False)
        model_proto = onnx.inliner.inline_local_functions(model_proto)
        onnx.save_model(model_proto, output_path)
        return onnx_program
1857
class OnnxModelFromDynamoAotOptimize(OnnxModelFromDynamo):
    """Dynamo and Fx based export, with AOT optimize post export. `torch.onnx.dynamo_export`."""

    _COMPILER_NAME = "dynamo_aot_optimize"

    def _export(
        self, model, example_inputs, output_path: str
    ) -> torch.onnx.ONNXProgram:
        """Export via dynamo_export, then run the onnxscript/onnxruntime
        rewriter over the proto before saving (external data in one file)."""
        if self.copy_before_export:
            # Deepcopy model before export to avoid modification to baseline model.
            model, example_inputs = self.deepcopy_model_and_inputs_to_device(
                model, example_inputs, self._determine_deepcopy_target_device()
            )

        example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
        options = torch.onnx.ExportOptions(dynamic_shapes=self._dynamic_shapes)
        export_output = torch.onnx.dynamo_export(
            model, *example_args, **example_kwargs, export_options=options
        )

        import onnx
        from onnxscript.rewriter.onnxruntime import rewrite

        model_proto = rewrite(export_output.model_proto)
        onnx.save_model(
            model_proto,
            output_path,
            save_as_external_data=True,
            all_tensors_to_one_file=True,
        )

        return export_output
1891
class _OnnxPatch:
    @classmethod
    def patch_non_tensor_outputs(cls, correct_result, new_result, fp64_outputs):
        """Patch non-tensor outputs to make them comparable with the correct result.

        ONNX model always returns a flat tuple of tensors, but the PyTorch model outputs
        `correct_result` and `fp64_outputs` can be arbitrary types. This function normalizes
        the outputs to make them comparable with the ONNX model output.
        """
        try:
            from transformers import modeling_outputs
        except ImportError:
            has_transformers = False
        else:
            has_transformers = True

        if has_transformers and isinstance(
            correct_result, modeling_outputs.ModelOutput
        ):
            # Huggingface outputs expose an explicit tuple conversion.
            correct_result = correct_result.to_tuple()
            fp64_outputs = fp64_outputs.to_tuple() if fp64_outputs is not None else None
        elif type(correct_result).__name__ in (
            "MaskedLMOutput",
            "Seq2SeqLMOutput",
            "CausalLMOutputWithCrossAttentions",
            "LongformerMaskedLMOutput",
            "Instances",
            "SquashedNormal",
            "Boxes",
            "Normal",
            "TanhTransform",
            "Foo",
            "Variable",
        ):
            # Copied from `same` function in `torch._dynamo.utils`
            # Keep only the non-None attributes, in __dict__ order.
            correct_result = [
                value
                for key in correct_result.__dict__.keys()
                if (value := getattr(correct_result, key)) is not None
            ]
            fp64_outputs = (
                [
                    value
                    for key in fp64_outputs.__dict__.keys()
                    if (value := getattr(fp64_outputs, key)) is not None
                ]
                if fp64_outputs is not None
                else None
            )

        # Flatten nested tuple of tensors, i.e. past_key_values
        correct_result = pytree.tree_leaves(correct_result)
        # Hack to put results from different runs on same device.
        # This is needed for ONNX CPU fallback benchmark, where PyTorch eager is run on GPU.
        # Assuming outputs from a single run are always on same device!
        devices = [x.device for x in correct_result if isinstance(x, torch.Tensor)]
        assert devices and all(
            x == devices[0] for x in devices
        ), "All tensors must be on same device!"
        device = devices[0]
        new_result = pytree.tree_leaves(new_result)
        new_result = pytree.tree_map(
            lambda x: x.to(device=device) if isinstance(x, torch.Tensor) else x,
            new_result,
        )
        fp64_outputs = pytree.tree_leaves(fp64_outputs)

        return correct_result, new_result, fp64_outputs
1961
@dataclasses.dataclass
class OnnxExportErrorRow:
    """One CSV row describing an ONNX export or runtime failure.

    Exactly one of two groups must be populated: the full diagnostic fields
    (rule_id, rule_name, diagnostic_level, diagnostic_message) or at least the
    raw exception type (exception_type_name).
    """

    device: str
    model_name: str
    batch_size: int
    rule_id: Optional[str] = None
    rule_name: Optional[str] = None
    diagnostic_level: Optional[str] = None
    diagnostic_message: Optional[str] = None
    exception_type_name: Optional[str] = None
    exception_message: Optional[str] = None

    def __post_init__(self):
        has_full_diagnostic = None not in (
            self.rule_id,
            self.rule_name,
            self.diagnostic_level,
            self.diagnostic_message,
        )
        assert has_full_diagnostic or self.exception_type_name, (
            "Either rule_id, rule_name, diagnostic_level and diagnostic_message "
            "must be set or exception_type_name must be set"
        )

    @property
    def headers(self) -> List[str]:
        """Field names, in declaration order (aligned with `row`)."""
        return [f.name for f in dataclasses.fields(self)]

    @property
    def row(self) -> List[str]:
        """Field values, in declaration order (aligned with `headers`)."""
        return [getattr(self, f.name) for f in dataclasses.fields(self)]
1993
class OnnxExportErrorParser:
    """Converts export diagnostics and raw exceptions into OnnxExportErrorRow
    records for a fixed (device, model, batch size) benchmark run."""

    def __init__(self, device: str, model_name: str, batch_size: int):
        self.device = device
        self.model_name = model_name
        self.batch_size = batch_size

    def _qualified_exception_class_name(self, exception: Exception) -> str:
        """Return 'module.ClassName' for the exception, or the bare class name
        for builtins."""
        exc_type = exception.__class__
        if exc_type.__module__ == "builtins":
            return exc_type.__name__
        return f"{exc_type.__module__}.{exc_type.__name__}"

    def parse_diagnostic_context(
        self,
        diagnostic_context: diagnostics.DiagnosticContext,
    ) -> Generator[OnnxExportErrorRow, Any, Any]:
        """Yield one error row per diagnostic at ERROR level or above."""
        from torch.onnx._internal.fx import diagnostics

        for diagnostic in diagnostic_context.diagnostics:
            if diagnostic.level >= diagnostics.levels.ERROR:
                yield OnnxExportErrorRow(
                    device=self.device,
                    model_name=self.model_name,
                    batch_size=self.batch_size,
                    rule_id=diagnostic.rule.id,
                    rule_name=diagnostic.rule.name,
                    diagnostic_level=diagnostic.level.name,
                    diagnostic_message=diagnostic.message,
                )

    def parse_exception(self, exception: Exception) -> OnnxExportErrorRow:
        """Wrap a raw exception into an error row for this run."""
        return OnnxExportErrorRow(
            device=self.device,
            model_name=self.model_name,
            batch_size=self.batch_size,
            exception_type_name=self._qualified_exception_class_name(exception),
            exception_message=str(exception),
        )
2032
@dataclasses.dataclass
class OnnxContext:
    """Mutable holder for a lazily-exported OnnxModel; attached to the
    callable returned by `optimize_onnx_ctx` so the export is cached across
    iterations."""

    onnx_model: Optional[OnnxModel] = None
2037
def optimize_onnx_ctx(
    output_directory: str,
    onnx_model_cls: Type[OnnxModel],
    run_n_iterations: Callable,
    dynamic_shapes: bool = False,
    copy_before_export: bool = False,
    use_experimental_patch: bool = False,
) -> Callable:
    # NOTE(bowbao): This function creates and returns the onnx version of 'run_n_iterations',
    # which does the following:
    #   1. Export and cache model.
    #   2. Create iobinding for ORT.
    #   3. Run ORT for n iterations.
    # The cached model is stored in 'context' under the returned callable.
    context = OnnxContext()
    # The first iteration dumps inputs/outputs to .pb files; subsequent ones don't.
    test_data_dumped = False

    def run_n_iterations_onnx(model, inputs, n=2):
        """Run the (lazily exported) ONNX model `n` times, logging any export
        or runtime errors to a `*_export_error.csv` beside `output_filename`."""
        from torch.onnx._internal import _exporter_legacy
        from torch.onnx._internal.fx import diagnostics

        # NOTE(bowbao): Capture all export & ort errors and diagnostics.
        # Serialize to csv, to be parsed and summarized later by '._onnx/reporter.py'.
        # TODO: Accuracy mismatch is not reported here in csv.
        assert (
            output_filename.find(".csv") > 0
        ), f"expected output_filename to be a .csv, but got {output_filename}"
        output_error_filename = output_filename[:-4] + "_export_error.csv"
        parser = OnnxExportErrorParser(current_device, current_name, current_batch_size)
        try:
            nonlocal context
            if context.onnx_model is None:
                # First call: export the model and cache it on the closure context.
                context.onnx_model = onnx_model_cls(
                    output_directory,
                    model,
                    copy.deepcopy(inputs),
                    dynamic_shapes=dynamic_shapes,
                    copy_before_export=copy_before_export,
                    use_experimental_patch=use_experimental_patch,
                )
            onnx_model = context.onnx_model

            for _ in range(n):
                nonlocal test_data_dumped
                if not test_data_dumped:
                    # Serializes inputs and outputs to .pb files for further offline analysis.
                    # Due to this, this function is not and should not be used for perf measurement.
                    outputs = onnx_model.run_and_serialize_inputs_outputs(inputs)
                    test_data_dumped = True
                else:
                    outputs = onnx_model.run(inputs)
            return outputs
        except _exporter_legacy.OnnxExporterError as e:
            # `torch.onnx.dynamo_export` raises error that encloses diagnostics.
            diagnostic_context = e.onnx_program.diagnostic_context
            for parsed_error in parser.parse_diagnostic_context(diagnostic_context):
                output_csv(
                    output_error_filename, parsed_error.headers, parsed_error.row
                )
            if context.onnx_model is not None:
                # Persist the full SARIF diagnostics next to the exported model.
                e.onnx_program.save_diagnostics(
                    f"{context.onnx_model.model_dir}/"
                    f"{current_onnx_compiler}_{current_name}_{current_device}.sarif"
                )

            # Check also the raw exception that caused export failure.
            # Skip if it is already analyzed by diagnostics.
            cause_of_exception = e.__cause__
            if not isinstance(
                cause_of_exception, diagnostics.RuntimeErrorWithDiagnostic
            ):
                parsed_error = parser.parse_exception(cause_of_exception)
                output_csv(
                    output_error_filename, parsed_error.headers, parsed_error.row
                )
            raise
        except Exception as e:
            # `torch.onnx.export` errors.
            # ORT errors.
            parsed_error = parser.parse_exception(e)
            output_csv(output_error_filename, parsed_error.headers, parsed_error.row)
            raise

    # Expose the context so callers can reach the cached OnnxModel.
    run_n_iterations_onnx.context = context

    return run_n_iterations_onnx
2125
def read_batch_size_from_file(args, filename, model_name):
    """Look up the batch size for *model_name* in a "name,batch_size" CSV file.

    When a `benchmarks/` directory exists in the CWD, the file is resolved
    relative to it. The last matching entry wins. Returns None (with a
    warning) when the model is absent, and raises RuntimeError when the
    recorded size is the -1 "unset" sentinel.
    """
    batch_size = None
    if os.path.exists("benchmarks"):
        filename = os.path.join("benchmarks", filename)
    assert os.path.exists(filename), filename
    with open(filename) as f:
        entries = [line.split(",") for line in f.readlines() if len(line.strip()) > 0]
    for entry_name, entry_size in entries:
        if model_name == entry_name:
            batch_size = int(entry_size)
    if batch_size is None:
        log.warning("Could not find batch size for %s", model_name)
    elif batch_size == -1:
        raise RuntimeError(
            f"Batch size is unset for {model_name} in {args.batch_size_file}"
        )
    print(f"batch size: {batch_size}")
    return batch_size
2147
class TimeOutException(Exception):
    """Raised by alarm_handler() when a function guarded by exit_after()
    exceeds its allotted wall-clock time."""

    pass
2151
def alarm_handler(signum, frame):
    """SIGALRM handler installed by exit_after(); turns the alarm into an exception."""
    raise TimeOutException()
2155
def exit_after(s):
    """
    Decorator to raise TimeOutException if the fn is taking more than s seconds
    to run.

    Uses SIGALRM, so it only works on the main thread of a Unix process.
    """

    def outer(fn):
        # functools.wraps preserves fn's __name__/__doc__ on the wrapper,
        # which the plain inner() previously discarded.
        @functools.wraps(fn)
        def inner(*args, **kwargs):
            # Install the handler and arm a one-shot alarm of s seconds.
            signal.signal(signal.SIGALRM, alarm_handler)
            signal.alarm(s)
            try:
                result = fn(*args, **kwargs)
            finally:
                # Always disarm the alarm, even when fn raises.
                signal.alarm(0)
            return result

        return inner

    return outer
2176
def get_peak_memory():
    """Peak CUDA memory allocated so far on the current device, in gigabytes."""
    peak_bytes = torch.cuda.max_memory_allocated()
    return peak_bytes / 10**9
2180
def null_experiment(args, model_iter_fn, model, example_inputs):
    """A no-op experiment useful for making sure TorchBenchmark alone works
    properly; ignores all arguments and reports no measurements."""
    return []
2188
def cast_to(dtype, model, inputs):
    """Cast *model* and the floating-point tensors in *inputs* to *dtype*.

    Non-tensor and non-floating-point leaves of *inputs* pass through
    unchanged.  Returns the (model, inputs) pair.
    """
    # fp16 goes through the canonical .half() entry point; every other dtype
    # uses the generic .to() conversion.
    model = model.half() if dtype == torch.float16 else model.to(dtype)

    def _cast_leaf(value):
        if isinstance(value, torch.Tensor) and value.is_floating_point():
            return value.to(dtype)
        return value

    inputs = tree_map(_cast_leaf, inputs)
    return model, inputs


def cast_to_bf16(model, inputs):
    """Cast model and inputs to bfloat16."""
    return cast_to(torch.bfloat16, model, inputs)


def cast_to_fp16(model, inputs):
    """Cast model and inputs to float16."""
    return cast_to(torch.float16, model, inputs)


def cast_to_fp64(model, inputs):
    """Cast model and inputs to float64."""
    return cast_to(torch.float64, model, inputs)


def cast_to_fp32(model, inputs):
    """Cast model and inputs to float32."""
    return cast_to(torch.float32, model, inputs)
2220
class DummyGradScaler:
    """Drop-in stand-in for ``torch.amp.GradScaler`` that performs no scaling."""

    def scale(self, loss):
        # Identity: hand the loss back untouched.
        return loss
2225
def get_dynamo_stats():
    """Snapshot selected torch._dynamo / compiled-autograd / inductor counters
    as a Counter, so two snapshots can be subtracted."""
    # TODO: consider deepcopy'ing the entire counters struct and
    # adding a helper to do subtraction on it
    counters = torch._dynamo.utils.counters
    stats = counters["stats"]
    return collections.Counter(
        {
            "calls_captured": stats["calls_captured"],
            "unique_graphs": stats["unique_graphs"],
            "graph_breaks": sum(counters["graph_break"].values()),
            # NB: The plus removes zero counts
            "unique_graph_breaks": len(+counters["graph_break"]),
            "autograd_captures": counters["compiled_autograd"]["captures"],
            "autograd_compiles": counters["compiled_autograd"]["compiles"],
            "cudagraph_skips": counters["inductor"]["cudagraph_skips"],
        }
    )
2248
@contextmanager
def maybe_init_distributed(should_init_distributed, rank, world_size, port="6789"):
    """Optionally bring up (and always tear down) a single-node NCCL process
    group around the wrapped region.  A no-op when the flag is False."""
    if not should_init_distributed:
        yield
        return
    try:
        torch.cuda.set_device(rank)
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = port
        torch.distributed.init_process_group(
            "nccl", rank=rank, world_size=world_size
        )
        yield
    finally:
        torch.distributed.destroy_process_group()
2264
@contextmanager
def maybe_snapshot_memory(should_snapshot_memory, suffix):
    """Optionally record a CUDA memory-history snapshot around the wrapped region.

    Enables the Memory Snapshot tool for memory deep dives:
    https://pytorch.org/blog/understanding-gpu-memory-1/

    Args:
        should_snapshot_memory: when False this context manager is a no-op.
        suffix: appended to the snapshot filename (derived from the global
            ``output_filename``) to distinguish eager vs. compiled runs.
    """
    try:
        if should_snapshot_memory:
            torch.cuda.memory._record_memory_history(max_entries=100000)
        yield
    finally:
        if should_snapshot_memory:
            try:
                # BUGFIX: this previously used output_filename.rstrip(".csv"),
                # but rstrip strips a *character set*, not a suffix (e.g.
                # "metrics.csv" -> "metri").  splitext drops exactly the
                # extension.
                snapshot_path = os.path.join(
                    torch._dynamo.config.base_dir,
                    f"{os.path.splitext(output_filename)[0]}_{suffix}.pickle",
                )
                torch.cuda.memory._dump_snapshot(snapshot_path)
            except Exception as e:
                # Best-effort: a failed dump should never fail the benchmark.
                logging.error("Failed to save memory snapshot, %s", e)

            torch.cuda.memory._record_memory_history(enabled=None)
2287
class BenchmarkRunner:
    def __init__(self):
        """Initialize per-run state; the harness fills most of it in later."""
        # Set by the driver / subclasses before running.
        self.model_iter_fn = None
        self._args = None
        self.optimizer = None
        # AMP state; setup_amp() swaps these out when mixed precision is on.
        self.grad_scaler = DummyGradScaler()
        self.autocast = contextlib.nullcontext
        self.autocast_arg = {}
2297def setup_amp(self, current_device=None):2298if self.args.only in self.fp32_only_models:2299return2300
2301devices = [current_device] if current_device else self.args.devices2302if self.args.amp:2303# AMP training can lead to small loss values which can undeflow2304# gradient values returning in zero gradients. To solve this2305# problem, PyTorch introduces GradScaler. GradScaler is a stateful2306# structure, that scales the loss values to prevent underflow. Loss2307# values are big at the beginning of training (therefore not2308# requiring scaling), while loss value tends to be small as network2309# starts getting better (requiring scaling). GradScaler manages all2310# of this fine tuning, checking the gradients are turning to inf,2311# discarding such batches.2312
2313# Since we are not running a long iteration, default value of2314# init_scale 65536 is going to turn all gradients to inf. Therefore,2315# we just use a init_scale of 2.0 for benchmarking purpose.2316
2317# Disabling Gradscaler because2318# 1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful.2319# 2) Current setup shares grad_scaler for eager and dynamo model,2320# which is bad as Gradscaler has state and can adjust the scaling2321# factor between eager and dynamo run, making accuracy check2322# harder.2323# self.grad_scaler = torch.amp.GradScaler(device="cuda", init_scale=2.0)2324self.autocast = functools.partial(2325torch.amp.autocast, device_type=devices[0]2326)2327if self.args.amp_dtype:2328amp_dtype = (2329torch.float162330if self.args.amp_dtype == "float16"2331else torch.bfloat162332)2333self.autocast_arg["dtype"] = amp_dtype2334
2335def init_optimizer(self, name, device, params):2336if device == "cuda" and self.args.training and name not in CI_SKIP_OPTIMIZER:2337if (name in CI_USE_SGD and self.args.ci) or name in BENCHMARK_USE_SGD:2338self.optimizer = torch.optim.SGD(params, lr=0.01, foreach=True)2339# Disable multi_tensor_sgd for benchmarking, there isn't a large performance benefit (~1%) to compiling2340# this optimizer because it is a single foreach add, and increases compile time.2341# After autotuning and fake tensor caching lands, we can enable, becuase the compile time impact will be lower.2342# Fake Tensor caching: https://github.com/pytorch/pytorch/pull/1138732343# Autotuning: https://github.com/pytorch/pytorch/issues/1174472344self.optimizer.step = torch._dynamo.disable(self.optimizer.step)2345else:2346self.optimizer = torch.optim.Adam(2347params, lr=0.01, capturable=True, foreach=True2348)2349else:2350self.optimizer = None2351
    @property
    def args(self):
        """The benchmark-harness argument namespace (set once by the driver)."""
        return self._args

    @args.setter
    def args(self, args):
        self._args = args
    @property
    def skip_models(self):
        # Model names to skip entirely; suite subclasses override.
        return set()

    @property
    def skip_models_for_cuda(self):
        # Models skipped only when running on CUDA devices.
        return set()

    @property
    def skip_models_for_cpu(self):
        # Models skipped only when running on CPU devices.
        return set()

    @property
    def skip_models_for_freezing(self):
        # Models skipped when inductor freezing is enabled.
        return set()

    @property
    def slow_models(self):
        # Models known to be slow (may get reduced iteration counts upstream).
        return set()

    @property
    def very_slow_models(self):
        # Models known to be extremely slow.
        return set()

    @property
    def non_deterministic_models(self):
        # Models whose eager runs legitimately differ between repeats.
        return set()

    @property
    def fp32_only_models(self):
        # Models that only support float32 (see cast_based_on_args / setup_amp).
        return set()

    @property
    def force_amp_for_fp16_bf16_models(self):
        # Models that must run under AMP instead of full fp16/bf16 casting.
        return set()

    @property
    def force_fp16_for_bf16_models(self):
        # Models that should fall back to fp16 when bf16 is requested.
        return set()

    @property
    def skip_not_suitable_for_training_models(self):
        # Models excluded from training benchmarks.
        return set()

    @property
    def failing_torchinductor_models(self):
        # Models with known torchinductor failures.
        return set()

    @property
    def failing_fx2trt_models(self):
        # Models with known fx2trt failures.
        return set()

    @property
    def skip_accuracy_checks_large_models_dashboard(self):
        # Large models whose accuracy check is skipped on the dashboard.
        return set()

    @property
    def skip_accuracy_check_as_eager_non_deterministic(self):
        # Models whose eager runs are non-deterministic, so dynamo-vs-eager
        # accuracy comparison is skipped.
        return set()

    @property
    def skip_multiprocess_models(self):
        # Models that cannot run under the multiprocess (DDP/FSDP) harness.
        return set()

    @property
    def skip_models_due_to_control_flow(self):
        # Models skipped because of data-dependent control flow.
        return set()

    @property
    def guard_on_nn_module_models(self):
        # Models that need dynamo's guard_nn_modules config.
        return set()

    @property
    def inline_inbuilt_nn_modules_models(self):
        # Models that need dynamo's inline_inbuilt_nn_modules config.
        return set()
    def get_tolerance_and_cosine_flag(self, is_training, current_device, name):
        # Subclasses return (tolerance, use_cosine_similarity) for accuracy checks.
        raise NotImplementedError
2439@property2440def equal_nan(self):2441equal_nan = True2442if self.args.float32:2443equal_nan = False2444return equal_nan2445
    def use_larger_multiplier_for_smaller_tensor(self, name):
        # Hook for suites that need looser accuracy tolerances on small tensors.
        return False
2449def iter_models(self, args):2450for model_name in self.iter_model_names(args):2451for device in args.devices:2452try:2453yield self.load_model(2454device,2455model_name,2456batch_size=args.batch_size,2457)2458except NotImplementedError:2459continue # bad benchmark implementation2460
2461def deepcopy_model(self, model):2462return copy.deepcopy(model)2463
    def cast_based_on_args(self, model, example_inputs):
        """Cast model/inputs to the dtype implied by the CLI flags.

        fp32 wins outright (also forced for fp32-only models); fp16/bf16
        requests may be redirected to AMP or fp16 for models that do not
        support the requested dtype.  Returns the (model, example_inputs) pair.
        """
        if self.args.float32 or self.args.only in self.fp32_only_models:
            if not self.args.float32:
                log.warning("Model %s supports float32 only", self.args.only)
            model, example_inputs = cast_to_fp32(model, example_inputs)
        elif self.args.float16:
            if self.args.only in self.force_amp_for_fp16_bf16_models:
                log.warning(
                    "Model %s does not support float16, running with amp instead",
                    self.args.only,
                )
                # Mutates the shared args namespace so later stages see amp=True.
                self.args.amp = True
                self.setup_amp()
            else:
                model, example_inputs = cast_to_fp16(model, example_inputs)
        elif self.args.bfloat16:
            if self.args.only in self.force_amp_for_fp16_bf16_models:
                log.warning(
                    "Model %s does not support bfloat16, running with amp instead",
                    self.args.only,
                )
                self.args.amp = True
                self.setup_amp()
            elif self.args.only in self.force_fp16_for_bf16_models:
                log.warning(
                    "Model %s does not support bfloat16, running with float16 instead",
                    self.args.only,
                )
                model, example_inputs = cast_to_fp16(model, example_inputs)
            else:
                model, example_inputs = cast_to_bf16(model, example_inputs)

        return model, example_inputs
2498def validate_model(self, model, example_inputs):2499"""2500Runs the eager model with example inputs to ensure that eager passes.
2501"""
2502model = self.deepcopy_model(model)2503example_inputs = clone_inputs(example_inputs)2504model, example_inputs = self.cast_based_on_args(model, example_inputs)2505try:2506self.model_iter_fn(model, example_inputs)2507except Exception as e:2508raise RuntimeError("Eager run failed") from e2509
2510def maybe_cast(self, model, example_inputs):2511model, example_inputs = self.cast_based_on_args(model, example_inputs)2512return model, example_inputs2513
2514def decay_batch_exp(self, batch_size, factor=0.5, divisor=2):2515out_batch_size = batch_size * factor2516if out_batch_size > divisor:2517out_batch_size = (out_batch_size + 1) // divisor * divisor2518else:2519out_batch_size = batch_size - 12520return max(0, int(out_batch_size))2521
    def batch_size_finder(self, device, model_name, initial_batch_size=1024):
        """Find the largest batch size (starting from *initial_batch_size*)
        at which the model loads and runs one iteration without failing.

        Returns 1 if no candidate >= 1 succeeds.
        """
        batch_size = initial_batch_size
        while batch_size >= 1:
            # NOTE(review): this uses the module-level `current_device` global
            # rather than the `device` parameter — looks intentional for the
            # harness but verify against the callers.
            empty_gpu_cache(current_device)
            try:
                device, name, model, example_inputs, _ = self.load_model(
                    device,
                    model_name,
                    batch_size,
                )
                self.model_iter_fn(model, example_inputs)
                return batch_size
            except RuntimeError as e:
                error_str = str(e)
                # channels_last failures won't go away at smaller sizes; stop.
                if "channels_last" in error_str:
                    break
            batch_size = self.decay_batch_exp(batch_size)
        return 1
2541def run_n_iterations(self, mod, inputs):2542n = self.args.iterations2543for _ in range(n - 1):2544self.model_iter_fn(mod, inputs, collect_outputs=False)2545return self.model_iter_fn(mod, inputs, collect_outputs=True)2546
2547@torch._disable_dynamo(recursive=True)2548def optimizer_zero_grad(self, mod):2549if self.optimizer is not None:2550self.optimizer.zero_grad(True)2551else:2552mod.zero_grad(True)2553
2554def optimizer_step(self):2555if self.optimizer is not None:2556self.optimizer.step()2557
2558def get_benchmark_indices(self, length):2559start = self._args.partition_id * (length // self._args.total_partitions)2560end = (2561(self._args.partition_id + 1) * (length // self._args.total_partitions)2562if self._args.partition_id < self._args.total_partitions - 12563else length2564)2565return start, end2566
    def get_fsdp_auto_wrap_policy(self, model_name: str):
        """Return the FSDP auto-wrap policy for *model_name*.

        Known models get a handcrafted ModuleWrapPolicy over their transformer
        block type; everything else falls back to a size-based policy.
        """
        from diffusers.models.transformer_2d import Transformer2DModel
        from torchbenchmark.models.nanogpt.model import Block
        from transformers.models.llama.modeling_llama import LlamaDecoderLayer
        from transformers.models.t5.modeling_t5 import T5Block
        from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer

        from torch.distributed.fsdp.wrap import (
            ModuleWrapPolicy,
            size_based_auto_wrap_policy,
        )

        # handcrafted wrap policy
        MODEL_FSDP_WRAP = {
            "stable_diffusion_unet": (Transformer2DModel,),
            "hf_T5": (T5Block,),
            "hf_T5_base": (T5Block,),
            "hf_T5_large": (T5Block,),
            "hf_Whisper": (WhisperEncoderLayer,),
            "llama_v2_7b_16h": (LlamaDecoderLayer,),
            "nanogpt": (Block,),
        }

        if model_name not in MODEL_FSDP_WRAP:
            # default to using wrap policy based on module size
            return functools.partial(
                size_based_auto_wrap_policy, recurse=True, min_num_params=int(1e5)
            )

        return ModuleWrapPolicy(MODEL_FSDP_WRAP[model_name])
    def deepcopy_and_maybe_parallelize(self, model):
        """Deep-copy *model*, then wrap the copy in DDP or FSDP when the
        corresponding CLI flag is set.  Returns the (possibly wrapped) copy."""
        model = self.deepcopy_model(model)
        if self.args.ddp:
            assert (
                torch.distributed.is_available()
            ), "Can't use DDP without a distributed enabled build"
            from torch.nn.parallel import DistributedDataParallel as DDP

            model = DDP(model, find_unused_parameters=True)
        elif self.args.fsdp:
            assert (
                torch.distributed.is_available()
            ), "Can't use FSDP without a distributed enabled build"
            from torch.distributed.fsdp import (
                FullyShardedDataParallel as FSDP,
                MixedPrecision,
            )

            # Mixed-precision policy mirrors the requested benchmark dtype.
            if self.args.float16:
                dtype = torch.float16
            elif self.args.bfloat16:
                dtype = torch.bfloat16
            else:
                dtype = torch.float32

            mp_policy = MixedPrecision(
                param_dtype=dtype,
                # Gradient communication precision.
                reduce_dtype=dtype,
                # Buffer precision.
                buffer_dtype=dtype,
            )

            model = FSDP(
                model,
                use_orig_params=True,
                device_id=torch.cuda.current_device()
                if self.args.devices[-1] == "cuda"
                else None,
                mixed_precision=mp_policy,
                limit_all_gathers=True,
                auto_wrap_policy=self.get_fsdp_auto_wrap_policy(self.args.only),
            )
        return model
    def check_accuracy(
        self, name, model, example_inputs, optimize_ctx, experiment, tag
    ):
        """
        Checks accuracy.
        1) Collect the outputs with fp64 datatype. This is useful for error checking.
        2) Checks if eager itself has variations.

        Runs eager twice (to detect eager nondeterminism), then runs the
        dynamo-optimized variant and compares against eager using the fp64
        reference.  Every exit path records a status string to the CSV via
        record_status() and returns that string.
        """
        start_stats = get_dynamo_stats()

        def record_status(accuracy_status, dynamo_start_stats):
            """
            Records the status in the csv file
            """
            # Known-nondeterministic models get their eager/accuracy
            # mismatches reported as a pass.
            if current_name in self.non_deterministic_models:
                if accuracy_status in (
                    "pass",
                    "eager_two_runs_differ",
                    "fail_accuracy",
                ):
                    accuracy_status = "pass"

            headers = ["dev", "name", "batch_size", "accuracy"]
            fields = [current_device, current_name, current_batch_size, accuracy_status]

            if tag is not None:
                headers.insert(3, "tag")
                fields.insert(3, tag)

            # Report dynamo counter deltas relative to the start of this check.
            dynamo_stats = get_dynamo_stats()
            dynamo_stats.subtract(dynamo_start_stats)
            for k, v in dynamo_stats.items():
                headers.append(k)
                fields.append(v)

            output_csv(output_filename, headers, fields)
            return accuracy_status

        if name in self.skip_accuracy_checks_large_models_dashboard:
            return record_status("pass_due_to_skip", dynamo_start_stats=start_stats)

        # Skip all accuracy check for the torchao backend
        if self.args.backend == "torchao":
            return record_status("pass_due_to_skip", dynamo_start_stats=start_stats)

        with self.pick_grad(name, self.args.training):
            # Collect the fp64 reference outputs to be used later for accuracy checking.
            fp64_outputs = None
            model_fp64 = None
            inputs_fp64 = None
            try:
                model_fp64, inputs_fp64 = cast_to_fp64(
                    self.deepcopy_and_maybe_parallelize(model),
                    clone_inputs(example_inputs),
                )
                self.init_optimizer(name, current_device, model_fp64.parameters())
                fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64)
                fp64_outputs = tree_map(
                    lambda x: x.to(torch.float64)
                    if isinstance(x, torch.Tensor) and x.is_floating_point()
                    else x,
                    fp64_outputs,
                )
            except Exception:
                # Without an fp64 reference, fall back to cosine similarity.
                log.warning(
                    "fp64 golden ref were not generated for %s. Setting accuracy check to cosine",
                    name,
                )
                self.args.cosine = True
                fp64_outputs = None
            finally:
                del model_fp64, inputs_fp64
                empty_gpu_cache(current_device)

            tolerance, cos_similarity = self.get_tolerance_and_cosine_flag(
                self.args.training, current_device, name
            )

            # Cast the model to float16/float32 as necessary
            model, example_inputs = self.maybe_cast(model, example_inputs)
            accuracy_status = "pass"

            # Get results of native pytorch
            reset_rng_state()
            model_copy = None
            try:
                model_copy = self.deepcopy_and_maybe_parallelize(model)
                self.init_optimizer(name, current_device, model_copy.parameters())
                correct_result = self.run_n_iterations(
                    model_copy, clone_inputs(example_inputs)
                )
            except Exception as e:
                accuracy_status = (
                    "eager_1st_run_OOM"
                    if isinstance(e, torch.cuda.OutOfMemoryError)
                    else "eager_1st_run_fail"
                )
                log.exception("")
                return record_status(accuracy_status, dynamo_start_stats=start_stats)
            finally:
                del model_copy
                empty_gpu_cache(current_device)

            # Rerun native pytorch
            reset_rng_state()
            model_copy = None
            try:
                model_copy = self.deepcopy_and_maybe_parallelize(model)
                self.init_optimizer(name, current_device, model_copy.parameters())
                correct_rerun_result = self.run_n_iterations(
                    model_copy, clone_inputs(example_inputs)
                )
            except Exception as e:
                accuracy_status = (
                    "eager_2nd_run_OOM"
                    if isinstance(e, torch.cuda.OutOfMemoryError)
                    else "eager_2nd_run_fail"
                )
                log.exception("")
                return record_status(accuracy_status, dynamo_start_stats=start_stats)
            finally:
                del model_copy
                empty_gpu_cache(current_device)

            # Two eager runs should have exactly same result
            is_same = True
            try:
                if (
                    name not in self.skip_accuracy_check_as_eager_non_deterministic
                    and not same(
                        correct_result,
                        correct_rerun_result,
                        fp64_ref=None,
                        cos_similarity=False,
                        tol=0,
                        equal_nan=self.equal_nan,
                        use_larger_multiplier_for_smaller_tensor=self.use_larger_multiplier_for_smaller_tensor(
                            name
                        ),
                    )
                ):
                    is_same = False
            except Exception as e:
                # Sometimes torch.allclose may throw RuntimeError
                is_same = False

            if not is_same:
                accuracy_status = "eager_two_runs_differ"
                return record_status(accuracy_status, dynamo_start_stats=start_stats)

            # Release the rerun outputs before the dynamo run.
            correct_rerun_result = None

            # Run with Dynamo
            reset_rng_state()
            torch._dynamo.reset()
            model_copy = None
            try:
                model_copy = self.deepcopy_and_maybe_parallelize(model)
                self.init_optimizer(name, current_device, model_copy.parameters())
                if self.args.export or self.args.export_aot_inductor:
                    # apply export on module directly
                    # no need for n iterations
                    # the logic should be the same to self.model_iter_fn (forward_pass)
                    with self.autocast(**self.autocast_arg):
                        optimized_model_iter_fn = optimize_ctx(
                            model_copy, example_inputs
                        )
                        new_result = optimized_model_iter_fn(model_copy, example_inputs)
                else:
                    optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
                    with maybe_enable_compiled_autograd(
                        self.args.compiled_autograd,
                        fullgraph=self.args.nopython,
                        dynamic=self.args.dynamic_shapes,
                    ):
                        new_result = optimized_model_iter_fn(model_copy, example_inputs)
            except Exception as e:
                log.exception("")
                print(
                    "TorchDynamo optimized model failed to run because of following error"
                )
                accuracy_status = (
                    "OOM"
                    if isinstance(e, torch.cuda.OutOfMemoryError)
                    else "fail_to_run"
                )
                return record_status(accuracy_status, dynamo_start_stats=start_stats)
            finally:
                del model_copy

            if name in self.skip_accuracy_check_as_eager_non_deterministic:
                return record_status("pass_due_to_skip", dynamo_start_stats=start_stats)

            if (
                current_onnx_compiler == "torchscript"
                or current_onnx_compiler == "dynamo"
            ):
                # Workaround for ONNX for non-tensor outputs
                (
                    correct_result,
                    new_result,
                    fp64_outputs,
                ) = _OnnxPatch.patch_non_tensor_outputs(
                    correct_result, new_result, fp64_outputs
                )
                # Relax tolerance for ONNX cuda
                if current_device == "cuda":
                    tolerance = 1e-2

            # TODO: store correct_result into the dumped file for offline onnx model validation.
            # The downside and potential problem, is that the output formats may be different.
            # E.g., the output order might not match, None might be part of output, etc.

            try:
                if self.args.training and self.args.amp:
                    # Optional per-model post-processing for AMP training outputs.
                    if process_fn := self.get_output_amp_train_process_func.get(
                        name, None
                    ):
                        correct_result = process_fn(correct_result)
                        new_result = process_fn(new_result)
                        fp64_outputs = process_fn(fp64_outputs)

                if not same(
                    correct_result,
                    new_result,
                    fp64_outputs,
                    equal_nan=self.equal_nan,
                    use_larger_multiplier_for_smaller_tensor=self.use_larger_multiplier_for_smaller_tensor(
                        name
                    ),
                    cos_similarity=cos_similarity,
                    tol=tolerance,
                ):
                    is_same = False
            except Exception as e:
                # Sometimes torch.allclose may throw RuntimeError
                is_same = False

            if not is_same:
                if self.args.skip_accuracy_check:
                    accuracy_status = "pass_due_to_skip"
                else:
                    accuracy_status = "fail_accuracy"
                return record_status(accuracy_status, dynamo_start_stats=start_stats)

        return record_status(accuracy_status, dynamo_start_stats=start_stats)
2890def check_tolerance(2891self, name, model, example_inputs, optimize_ctx, base_device="cpu"2892):2893"""2894Checks tolerance based on https://pytorch.org/docs/stable/generated/torch.allclose.html.
2895"""
2896tolerance_status = "pass"2897if name in self.skip_accuracy_checks_large_models_dashboard:2898tolerance_status = "pass_due_to_skip"2899return tolerance_status2900# Cast the model to float16/float32 as necessary2901model, example_inputs = self.maybe_cast(model, example_inputs)2902
2903with self.pick_grad(name, self.args.training):2904# Get results of native pytorch2905reset_rng_state()2906model_copy = copy.deepcopy(model)2907model_copy = model_copy.to(base_device)2908example_inputs_copy = copy.deepcopy(example_inputs)2909example_inputs_copy = tree_map(2910lambda x: x.to(base_device), example_inputs_copy2911)2912self.init_optimizer(name, base_device, model_copy.parameters())2913correct_result = self.run_n_iterations(model_copy, example_inputs_copy)2914
2915# Run with Dynamo2916# Sometime CI fails with random triton compilation failure which will be skipped for now2917# TODO: revisit this after switching to new Triton runtime2918reset_rng_state()2919torch._dynamo.reset()2920try:2921self.init_optimizer(name, current_device, model.parameters())2922optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)2923new_result = optimized_model_iter_fn(model, example_inputs)2924except Exception as e:2925log.exception("")2926print(2927"TorchDynamo optimized model failed to run because of following error"2928)2929return "fail_to_run"2930
2931def dump_max_mean_values(tol, ref, res):2932if isinstance(ref, (list, tuple, torch.nn.ParameterList, torch.Size)):2933for refi, resi in zip(ref, res):2934dump_max_mean_values(tol, refi, resi)2935elif isinstance(ref, dict):2936for k in ref.keys():2937dump_max_mean_values(tol, ref[k], res[k])2938elif isinstance(ref, torch.Tensor):2939res = res.to(base_device)2940t = torch.abs(ref - res) / (1 + torch.abs(ref))2941tol.append(t.flatten().to(torch.float32))2942return tol2943
2944tol = []2945dump_max_mean_values(tol, correct_result, new_result)2946tol = torch.cat(tol)2947tol = torch.tensor(tol)2948max = torch.max(tol)2949mean = torch.mean(tol)2950div = torch.std(tol)2951headers = ["dev", "name", "batch_size", "max", "mean", "std"]2952fields = [2953current_device,2954current_name,2955current_batch_size,2956max.item(),2957mean.item(),2958div.item(),2959]2960output_csv(output_filename, headers, fields)2961return tolerance_status2962
    def run_performance_test_non_alternate(
        self, name, model, example_inputs, optimize_ctx, experiment, tag=None
    ):
        """Run performance test in non-alternately: measure all eager
        iterations first, then all backend iterations, instead of
        interleaving them.  Only valid with latency_experiment."""
        assert (
            experiment.func is latency_experiment
        ), "Must run with latency_experiment."

        def warmup(fn, model, example_inputs, mode, niters=10):
            # Run fn niters times; return (latency, peak_mem_gb, dynamo_stat_deltas).
            # Exits the process on failure after writing a warmup_failed row.
            peak_mem = 0
            start_stats = get_dynamo_stats()
            try:
                if current_device == "cuda":
                    torch.cuda.reset_peak_memory_stats()
                    empty_gpu_cache(current_device)
                t0 = time.perf_counter()
                for _ in range(niters):
                    fn(model, example_inputs)
                t1 = time.perf_counter()
                latency = t1 - t0
                if current_device == "cuda":
                    peak_mem = get_peak_memory()
                elif current_device == "cpu":
                    total = psutil.virtual_memory().total
                    percentage = psutil.Process(os.getpid()).memory_percent()
                    peak_mem = percentage * total / 10**9
            except Exception:
                log.exception("Backend %s failed in warmup()", mode)
                write_csv_when_exception(
                    self.args, current_name, "warmup_failed", current_device
                )
                return sys.exit(-1)
            dynamo_stats = get_dynamo_stats()
            dynamo_stats.subtract(start_stats)
            return latency, peak_mem, dynamo_stats

        # Cast the model to float16/float32 as necessary
        model, example_inputs = self.maybe_cast(model, example_inputs)

        # Use distributed wrapping as necessary
        model = self.deepcopy_and_maybe_parallelize(model)

        self.init_optimizer(name, current_device, model.parameters())

        # The self.autocast context is needed for the model we export with aot_compile,
        # similar to what we do in the check_accuracy function
        ctx = (
            self.autocast(**self.autocast_arg)
            if self.args.export_aot_inductor
            else contextlib.nullcontext()
        )

        with self.pick_grad(name, self.args.training), ctx:
            ok, total = Stats.reset_counters()
            experiment_kwargs = {}
            if tag is not None:
                experiment_kwargs["tag"] = tag
            results = []

            with maybe_snapshot_memory(
                self.args.snapshot_memory, f"eager_{self.args.only}"
            ):
                eager_latency, eager_peak_mem, _ = warmup(
                    self.model_iter_fn, model, example_inputs, "eager"
                )
                # Re-measure peak memory on a warm run if requested.
                if self.args.use_warm_peak_memory:
                    _, eager_peak_mem, _ = warmup(
                        self.model_iter_fn, model, example_inputs, "eager", niters=1
                    )

            baseline_timings = experiment(
                model, example_inputs, mark="expected", **experiment_kwargs
            )

            if self.args.export_aot_inductor:
                # optimize_ctx is already the compiled callable; timing this
                # assignment yields a near-zero aot_compilation_time.
                t_0 = time.perf_counter()
                optimized_model_iter_fn = optimize_ctx
                t_1 = time.perf_counter()
                aot_compilation_time = t_1 - t_0
            else:
                optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
                aot_compilation_time = 0

            with maybe_enable_compiled_autograd(
                self.args.compiled_autograd,
                fullgraph=self.args.nopython,
                dynamic=self.args.dynamic_shapes,
            ), maybe_snapshot_memory(
                self.args.snapshot_memory, f"compiled_{self.args.only}"
            ):
                dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup(
                    optimized_model_iter_fn, model, example_inputs, "dynamo"
                )
                if self.args.use_warm_peak_memory:
                    _, dynamo_peak_mem, _ = warmup(
                        optimized_model_iter_fn,
                        model,
                        example_inputs,
                        "dynamo",
                        niters=1,
                    )

            if self.args.profile_dynamo_cache_lookup:
                with torch.profiler.profile(
                    activities=[torch.profiler.ProfilerActivity.CPU]
                ) as prof:
                    with maybe_enable_compiled_autograd(
                        self.args.compiled_autograd,
                        fullgraph=self.args.nopython,
                        dynamic=self.args.dynamic_shapes,
                    ):
                        warmup(optimized_model_iter_fn, model, example_inputs, "dynamo")

                events = list(
                    filter(
                        lambda event: "TorchDynamo Cache Lookup" in event.key,
                        prof.key_averages(),
                    )
                )
                dynamo_cache_lookup_latency = events[0].self_cpu_time_total

            # First dynamo warmup includes compilation, so the latency delta
            # over eager approximates compile time.
            compilation_time = dynamo_latency - eager_latency + aot_compilation_time
            compression_ratio = (
                eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.0
            )
            if self.args.print_memory:
                print(
                    f"memory: eager: {eager_peak_mem:.2f} GB, "
                    f"dynamo: {dynamo_peak_mem:.2f} GB, "
                    f"ratio: {compression_ratio:.2f}"
                )

            if self.args.print_compilation_time:
                print(f"Compilation time: {compilation_time:.2f}")

            if experiment.func is speedup_experiment:
                experiment_kwargs["compilation_latency"] = compilation_time
                experiment_kwargs["compression_ratio"] = compression_ratio
                experiment_kwargs["eager_peak_mem"] = eager_peak_mem
                experiment_kwargs["dynamo_peak_mem"] = dynamo_peak_mem
                experiment_kwargs["dynamo_stats"] = dynamo_stats
                if self.args.profile_dynamo_cache_lookup:
                    experiment_kwargs[
                        "cache_lookup_latency"
                    ] = dynamo_cache_lookup_latency

            if experiment.func is speedup_experiment_onnx:
                experiment = functools.partial(
                    experiment, optimized_model_iter_fn.context.onnx_model
                )
            backend_timings = experiment(
                model, example_inputs, mark="expected", **experiment_kwargs
            )
            timings = np.stack((baseline_timings, backend_timings), axis=1)
            result_summary = latency_experiment_summary(
                self.args, model, timings, **experiment_kwargs
            )
            # NOTE(review): `hasattr(model, name)` tests for an attribute named
            # after the *model name string* (e.g. "resnet50"); this looks like
            # it should be `hasattr(model, "name")` — verify intent.
            if not hasattr(model, name):
                model.name = name
            results.append(result_summary)
        return " ".join(map(str, results))
3125def run_performance_test(3126self, name, model, example_inputs, optimize_ctx, experiment, tag=None3127):3128if self.args.xla:3129with self.pick_grad(name, self.args.training):3130return experiment(*self.maybe_cast(model, example_inputs))3131
3132def warmup(fn, model, example_inputs, mode, niters=5):3133peak_mem = 03134start_stats = get_dynamo_stats()3135try:3136if current_device == "cuda":3137torch.cuda.reset_peak_memory_stats()3138empty_gpu_cache(current_device)3139t0 = time.perf_counter()3140for _ in range(niters):3141fn(model, example_inputs)3142t1 = time.perf_counter()3143latency = t1 - t03144if current_device == "cuda":3145peak_mem = get_peak_memory()3146elif current_device == "cpu":3147total = psutil.virtual_memory().total3148percentage = psutil.Process(os.getpid()).memory_percent()3149peak_mem = percentage * total / 10**93150except Exception:3151log.exception("Backend %s failed in warmup()", mode)3152write_csv_when_exception(3153self.args, current_name, "warmup_failed", current_device3154)3155return sys.exit(-1)3156dynamo_stats = get_dynamo_stats()3157dynamo_stats.subtract(start_stats)3158return latency, peak_mem, dynamo_stats3159
3160# Cast the model to float16/float32 as necessary3161model, example_inputs = self.maybe_cast(model, example_inputs)3162
3163# Use distributed wrapping as necessary3164model = self.deepcopy_and_maybe_parallelize(model)3165
3166self.init_optimizer(name, current_device, model.parameters())3167
3168# The self.autocast context is needed for the model we export with aot_compile,3169# similar to what we do in the check_accuracy function3170ctx = (3171self.autocast(**self.autocast_arg)3172if self.args.export_aot_inductor3173else contextlib.nullcontext()3174)3175
3176with self.pick_grad(name, self.args.training), ctx:3177ok, total = Stats.reset_counters()3178experiment_kwargs = {}3179if tag is not None:3180experiment_kwargs["tag"] = tag3181results = []3182with maybe_snapshot_memory(3183self.args.snapshot_memory, f"eager_{self.args.only}"3184):3185eager_latency, eager_peak_mem, _ = warmup(3186self.model_iter_fn, model, example_inputs, "eager"3187)3188if self.args.use_warm_peak_memory:3189_, eager_peak_mem, _ = warmup(3190self.model_iter_fn, model, example_inputs, "eager", niters=13191)3192
3193if self.args.export_aot_inductor:3194t_0 = time.perf_counter()3195optimized_model_iter_fn = optimize_ctx3196t_1 = time.perf_counter()3197aot_compilation_time = t_1 - t_03198else:3199optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)3200aot_compilation_time = 03201
3202with maybe_enable_compiled_autograd(3203self.args.compiled_autograd,3204fullgraph=self.args.nopython,3205dynamic=self.args.dynamic_shapes,3206), maybe_snapshot_memory(3207self.args.snapshot_memory, f"compiled_{self.args.only}"3208):3209dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup(3210optimized_model_iter_fn, model, example_inputs, "dynamo"3211)3212if self.args.use_warm_peak_memory:3213_, dynamo_peak_mem, _ = warmup(3214optimized_model_iter_fn,3215model,3216example_inputs,3217"dynamo",3218niters=1,3219)3220
3221if self.args.profile_dynamo_cache_lookup:3222with torch.profiler.profile(3223activities=[torch.profiler.ProfilerActivity.CPU]3224) as prof:3225with maybe_enable_compiled_autograd(3226self.args.compiled_autograd,3227fullgraph=self.args.nopython,3228dynamic=self.args.dynamic_shapes,3229):3230warmup(optimized_model_iter_fn, model, example_inputs, "dynamo")3231
3232events = list(3233filter(3234lambda event: "TorchDynamo Cache Lookup" in event.key,3235prof.key_averages(),3236)3237)3238dynamo_cache_lookup_latency = events[0].self_cpu_time_total3239
3240compilation_time = dynamo_latency - eager_latency + aot_compilation_time3241compression_ratio = (3242eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.03243)3244if self.args.print_memory:3245print(3246f"memory: eager: {eager_peak_mem:.2f} GB, "3247f"dynamo: {dynamo_peak_mem:.2f} GB, "3248f"ratio: {compression_ratio:.2f}"3249)3250
3251if self.args.print_compilation_time:3252print(f"Compilation time: {compilation_time:.2f}")3253
3254if experiment.func is speedup_experiment:3255experiment_kwargs["compilation_latency"] = compilation_time3256experiment_kwargs["compression_ratio"] = compression_ratio3257experiment_kwargs["eager_peak_mem"] = eager_peak_mem3258experiment_kwargs["dynamo_peak_mem"] = dynamo_peak_mem3259experiment_kwargs["dynamo_stats"] = dynamo_stats3260if self.args.profile_dynamo_cache_lookup:3261experiment_kwargs[3262"cache_lookup_latency"3263] = dynamo_cache_lookup_latency3264
3265if experiment.func is coverage_experiment:3266ok, total = Stats.reset_counters()3267results = []3268# run with torch._dynamo few times to populate the cache3269for _ in range(3):3270optimized_model_iter_fn(model, example_inputs)3271_, frames_second_pass = Stats.reset_counters() # should be 03272if frames_second_pass > 0:3273optimized_model_iter_fn(model, example_inputs)3274_, frames_third_pass = Stats.reset_counters() # should be 03275else:3276frames_third_pass = 03277
3278results.append(3279f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"3280)3281
3282if experiment.func is speedup_experiment_onnx:3283experiment = functools.partial(3284experiment, optimized_model_iter_fn.context.onnx_model3285)3286
3287if not hasattr(model, name):3288model.name = name3289results.append(experiment(model, example_inputs, **experiment_kwargs))3290return " ".join(map(str, results))3291
3292def minify_model(3293self,3294name,3295model,3296example_inputs,3297optimize_ctx,3298experiment,3299tag,3300):3301logging.info("Minifying %s...", name)3302os.environ["TORCH_COMPILE_DEBUG"] = "1"3303os.environ["TORCHDYNAMO_REPRO_AFTER"] = "dynamo"3304os.environ["TORCHDYNAMO_REPRO_LEVEL"] = "4"3305
3306self.check_accuracy(name, model, example_inputs, optimize_ctx, experiment, tag)3307
3308if self.args.output_directory:3309repro_dir = self.args.output_directory3310else:3311repro_dir = torch._dynamo.config.base_dir3312
3313try:3314shutil.move("repro.py", f"{repro_dir}/{name}_repro.py")3315except OSError as e:3316logging.error("Could not find repro script for model %s", name)3317else:3318logging.info(3319"Repro script for model %s with minified graph saved to %s",3320name,3321repro_dir,3322)3323
3324def maybe_preserve_compile_debug(self, name, status):3325if (3326name in CI_PRESERVE_COMPILE_DEBUG3327and status in CI_PRESERVE_COMPILE_DEBUG[name]3328):3329src_dir = torch._dynamo.utils.get_debug_dir()3330if os.path.isdir(src_dir):3331dbg_dir = os.path.join(3332os.getcwd(), "test", "debug", "torch_compile_debug"3333)3334dst_dir = os.path.join(dbg_dir, os.path.basename(src_dir))3335try:3336os.makedirs(dbg_dir, exist_ok=True)3337os.rename(src_dir, dst_dir)3338log.warning("Moved %s to %s", src_dir, dst_dir)3339except OSError:3340log.exception("Failed to preserve %s", src_dir)3341
3342def run_one_model(3343self,3344name,3345model,3346example_inputs,3347optimize_ctx,3348experiment,3349explain=False,3350tag=None,3351):3352mode = "train" if self.args.training else "eval"3353msg = f"{current_device:4} {mode:5} {current_name:34} "3354if tag:3355msg += f" {tag:26}"3356print(msg, flush=True)3357
3358start_stats = get_dynamo_stats()3359
3360if self.args.accuracy:3361status = self.check_accuracy(3362name, model, example_inputs, optimize_ctx, experiment, tag3363)3364print(status)3365if status == "fail_accuracy" and self.args.minify:3366self.minify_model(3367name, model, example_inputs, optimize_ctx, experiment, tag3368)3369elif self.args.tolerance:3370status = self.check_tolerance(name, model, example_inputs, optimize_ctx)3371print(status)3372elif self.args.performance:3373if self.args.backend == "torchao":3374status = self.run_performance_test_non_alternate(3375name, model, example_inputs, optimize_ctx, experiment, tag3376)3377else:3378status = self.run_performance_test(3379name, model, example_inputs, optimize_ctx, experiment, tag3380)3381print(status)3382empty_gpu_cache(current_device)3383
3384self.maybe_preserve_compile_debug(name, status)3385
3386if self.args.timing:3387from torch._dynamo.utils import op_count, print_time_report3388from torch.utils._stats import simple_call_counter3389
3390print_time_report()3391stats = "STATS: "3392stats = stats + " | ".join(3393itertools.chain(3394[f"call_* op count: {op_count}"],3395(f"{key}:{value}" for key, value in simple_call_counter.items()),3396)3397)3398print(stats)3399stats = get_dynamo_stats()3400stats.subtract(start_stats)3401
3402if explain:3403print(3404f"Dynamo produced {stats['unique_graphs']} graphs "3405f"covering {stats['calls_captured']} ops with "3406f"{stats['graph_breaks']} graph breaks ({stats['unique_graph_breaks']} unique)"3407)3408
3409if explain or self.args.log_graph_breaks or self.args.print_graph_breaks:3410filename = f"{output_filename.rstrip('.csv')}_graph_breaks.csv"3411
3412def add_double_quotes(x):3413# Delimiter because reason could have comma3414return f'"{x}"'3415
3416for graph_break in graph_break_reasons:3417reason = add_double_quotes(graph_break.reason)3418user_stack = add_double_quotes(3419", ".join([str(x) for x in graph_break.user_stack])3420)3421output_csv(3422filename,3423["model", "reason", "user_stack"],3424[current_name, reason, user_stack],3425)3426
3427if self.args.stats:3428Stats.print_summary()3429
3430
def help(fn):
    """Return *fn*'s docstring, used as argparse help text for experiments."""
    return getattr(fn, "__doc__")
3434
# Sentinel default for --diff-branch; any other value means the user asked
# to benchmark the current branch against a named branch.
diff_branch_default = "DIFF-BRANCH-DEFAULT"


def should_diff_branch(args):
    """Return True when the user explicitly supplied --diff-branch."""
    chosen_branch = args.diff_branch
    return chosen_branch != diff_branch_default
3441
def parse_args(args=None):
    """Build and evaluate the benchmark harness command line.

    Args:
        args: optional list of argument strings; ``None`` parses ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``. Exactly one mode flag
        (--accuracy / --performance / --tolerance) and exactly one run-mode
        flag (--training / --inference) is required; the experiment backend
        flags (--inductor, --export, --xla, ...) are mutually exclusive.
    """
    parser = argparse.ArgumentParser()
    # --- Benchmark selection / partitioning ---
    parser.add_argument(
        "--filter", "-k", action="append", help="filter benchmarks with regexp"
    )
    parser.add_argument(
        "--exclude", "-x", action="append", help="filter benchmarks with regexp"
    )
    parser.add_argument(
        "--exclude-exact", action="append", help="filter benchmarks with exact match"
    )
    parser.add_argument(
        "--total-partitions",
        type=int,
        default=1,
        choices=range(1, 16),
        help="Total number of partitions we want to divide the benchmark suite into",
    )
    parser.add_argument(
        "--partition-id",
        type=int,
        default=0,
        help="ID of the benchmark suite partition to be run. Used to divide CI tasks",
    )
    parser.add_argument(
        "--devices", "--device", "-d", action="append", help="cpu or cuda"
    )
    parser.add_argument("--device-index", help="CUDA device index")
    parser.add_argument(
        "--repeat", "-n", type=int, default=30, help="number of timing runs"
    )
    iterations_per_run_help = """
Run this may iterations for each time measurement. This is mainly used for
XLA training. We want to run multiple iterations per measurement so the
tracing and computation for different iteartions can overlap with each
other. This makes sure we have an accurate xla baseline.
"""
    parser.add_argument(
        "--iterations-per-run", type=int, default=1, help=iterations_per_run_help
    )
    parser.add_argument(
        "--randomize-input",
        action="store_true",
        help="Whether to randomize the input values. Dimensions will be kept the same.",
    )
    parser.add_argument(
        "--threads",
        "-t",
        type=int,
        help="number of threads to use for eager and inductor",
    )
    parser.add_argument(
        "--nopython", action="store_true", help="Turn graph breaks into errors"
    )
    parser.add_argument(
        "--no-skip",
        action="store_true",
        help="run models that are in the global SKIP list",
    )
    parser.add_argument(
        "--prims-nvfuser", action="store_true", help="user prims + nvfuser backend"
    )
    parser.add_argument(
        "--dump-raw-metrics",
        action="store_true",
        help="dump raw timing metrics from speedup experiment",
    )
    parser.add_argument(
        "--log-operator-inputs",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--channels-last",
        action="store_true",
        default=False,
        help="use channels last format",
    )
    parser.add_argument(
        "--batch-size", "--batch_size", type=int, help="batch size for benchmarking"
    )
    parser.add_argument(
        "--iterations", type=int, default=2, help="how many iterations to run"
    )
    parser.add_argument(
        "--batch-size-file", type=str, help="String to load batch size from"
    )
    parser.add_argument("--cosine", action="store_true", help="use cosine similarity")
    parser.add_argument(
        "--freezing", action="store_true", help="turn on freezing", default=False
    )
    parser.add_argument(
        "--inductor-config",
        "-c",
        action="append",
        help="key=value in torch._inductor.config",
    )
    parser.add_argument(
        "--ci", action="store_true", help="Flag to tell that its a CI run"
    )
    parser.add_argument(
        "--dashboard", action="store_true", help="Flag to tell that its a Dashboard run"
    )
    parser.add_argument(
        "--skip-fp64-check", action="store_true", help="skip accuracy check using fp64"
    )
    parser.add_argument(
        "--fast", "-f", action="store_true", help="skip slow benchmarks"
    )
    parser.add_argument(
        "--only",
        help="""Run just one model from torchbench. Or
specify the path and class name of the model in format like:
--only=path:<MODEL_FILE_PATH>,class:<CLASS_NAME>

Due to the fact that dynamo changes current working directory,
the path should be an absolute path.

The class should have a method get_example_inputs to return the inputs
for the model. An example looks like
```
class LinearModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(10, 10)

    def forward(self, x):
        return self.linear(x)

    def get_example_inputs(self):
        return (torch.randn(2, 10),)
```
""",
    )
    # --- Distributed execution ---
    parser.add_argument(
        "--multiprocess",
        action="store_true",
        help="Create n processes based on the number of devices (distributed use case).",
    )
    parser.add_argument(
        "--ddp",
        action="store_true",
        help="Wraps model in DDP before running it, and uses dynamo DDPOptmizer (graph breaks) by default.",
    )
    parser.add_argument(
        "--fsdp",
        action="store_true",
        help="""Wraps model in FSDP before running it.
Doesn't recursively wrap, mainly useful for checking dynamo UnspecNNModule compatibility
""",
    )
    parser.add_argument(
        "--optimize-ddp-mode",
        type=str,
        default="ddp_optimizer",
        help="Specify the DDP optimization mode -- the value of torch._dynamo.config.optimize_ddp.",
    )
    parser.add_argument(
        "--distributed-master-port",
        default="6789",
        help="Port to bind for for torch.distributed. Use the default unless it's conflicting with another user",
    )
    # --- Dynamic-shape / tracing behavior ---
    parser.add_argument(
        "--dynamic-shapes",
        action="store_true",
        help="Runs a dynamic shapes version of the benchmark, if available.",
    )
    parser.add_argument(
        "--propagate-real-tensors",
        action="store_true",
        help="Capture as much data dependent as you can by unsoundly propagating real tensors",
    )
    parser.add_argument(
        "--dynamic-batch-only",
        action="store_true",
        help="Only assume batch dimension is dynamic. Implies --dynamic-shapes",
    )
    parser.add_argument(
        "--specialize-int", action="store_true", help="Run with specialize_int=True."
    )
    parser.add_argument(
        "--use-eval-mode",
        action="store_true",
        help="sets model.eval() to reduce randomness",
    )
    parser.add_argument(
        "--skip-accuracy-check",
        action="store_true",
        help="keeps running even when accuracy fails",
    )
    parser.add_argument(
        "--generate-aot-autograd-stats",
        action="store_true",
        help="Generates AOT Autograd stats like how mnay graphs are sent to AOT",
    )
    parser.add_argument(
        "--inductor-settings",
        action="store_true",
        help="Use same settings as --inductor for baseline comparisons",
    )
    parser.add_argument(
        "--suppress-errors",
        action="store_true",
        help="Suppress errors instead of raising them",
    )
    # --- Output / reporting ---
    parser.add_argument(
        "--output",
        help="Overrides the output filename",
    )
    parser.add_argument(
        "--output-directory",
        help="Overrides the directory to place output files.",
    )
    parser.add_argument(
        "--disable-output",
        action="store_true",
        help="Disable writing of output files, e.g., for warm-up runs",
    )
    parser.add_argument(
        "--baseline",
        help="Compare with a prior --output",
    )
    parser.add_argument(
        "--part",
        default=None,
        help="Specify the part of the model to run.",
    )
    parser.add_argument(
        "--export-profiler-trace",
        action="store_true",
        help="exports trace of kineto profiler",
    )
    parser.add_argument(
        "--profiler-trace-name",
        "--profiler_trace_name",
        help="Overwrites exported trace name",
    )
    parser.add_argument(
        "--diff-branch",
        default=diff_branch_default,
        help="delta current branch against given branch.",
    )
    parser.add_argument(
        "--tag", default=None, help="Specify a tag to be included in csv files."
    )
    parser.add_argument(
        "--explain",
        action="store_true",
        help="print some graph/op statistics during the run, similar to .explain()",
    )
    parser.add_argument(
        "--stats",
        action="store_true",
        help="print graph counter stats",
    )
    parser.add_argument(
        "--use-warm-peak-memory",
        "--use_warm_peak_memory",
        action="store_true",
        help="Measure peak memory using a warm run to reduce autotuning noise",
    )
    parser.add_argument(
        "--print-memory",
        action="store_true",
        help="print extra memory statistics",
    )
    parser.add_argument(
        "--print-compilation-time",
        action="store_true",
        help="print compilation latency",
    )
    parser.add_argument(
        "--print-dataframe-summary",
        action="store_true",
        help="print dataframe result used for calculating accuracy",
    )
    # --- Inductor feature toggles ---
    parser.add_argument(
        "--disable-cudagraphs",
        action="store_true",
        help="Disables cudagraphs for Inductor",
    )
    parser.add_argument(
        "--disable-split-reductions",
        action="store_true",
        help="Disables split reductions for Inductor",
    )
    parser.add_argument(
        "--disable-persistent-reductions",
        action="store_true",
        help="Disables split reductions for Inductor",
    )
    parser.add_argument(
        "--disable-divisible-by-16",
        action="store_true",
        help="Disables divisible by 16 hint to Triton for Inductor",
    )
    parser.add_argument(
        "--inductor-compile-mode",
        default=None,
        help="torch.compile mode argument for inductor runs.",
    )
    parser.add_argument(
        "--print-graph-breaks",
        action="store_true",
        help="Show a warning whenever graph break",
    )
    parser.add_argument(
        "--log-graph-breaks",
        action="store_true",
        help="log graph breaks in a file",
    )
    parser.add_argument(
        "--trace-on-xla",
        action="store_true",
        help="Whether to trace the model on XLA or on eager device",
    )
    parser.add_argument(
        "--xla-tolerance",
        type=float,
        default=1e-2,
        help="XLA needs a loose tolerance to pass the correctness check",
    )
    parser.add_argument(
        "--collect-outputs",
        action="store_true",
        help="""Whether to collect outputs for training. Set this to true if we
want to verify the numerical correctness of graidents. But that may
cause time measurement not accurate""",
    )
    parser.add_argument(
        "--enable-activation-checkpointing",
        action="store_true",
        help="Enables activation checkpointing for HF models",
    )
    parser.add_argument("--timing", action="store_true", help="Emits phase timing")

    parser.add_argument(
        "--progress",
        action="store_true",
        help="Print n/k models message between each model run.",
    )

    parser.add_argument(
        "--timeout",
        type=int,
        default=2000,
        help="timeout (second) for benchmarking.",
    )

    parser.add_argument(
        "--per_process_memory_fraction",
        type=float,
        default=1,
        help="Set per-process GPU memory fraction (limit) for reducing usable size and reproducing OOMs",
    )

    parser.add_argument(
        "--no-translation-validation",
        action="store_true",
        help="Disable translation validation for accuracy builds.",
    )

    parser.add_argument(
        "--minify",
        action="store_true",
        help="Enable minification when failure is below tolerance. Save repro script for each model.",
    )

    parser.add_argument(
        "--compiled-autograd",
        action="store_true",
        help="Enables compiled autograd on compiled benchmark",
    )

    parser.add_argument(
        "--profile_dynamo_cache_lookup",
        "--profile-dynamo-cache-lookup",
        action="store_true",
        help="profiles TorchDynamo cache lookup",
    )

    parser.add_argument(
        "--snapshot-memory",
        "--snapshot_memory",
        action="store_true",
        help="Enables Memory Snapshot tool for memory deep dives: https://pytorch.org/blog/understanding-gpu-memory-1/",
    )

    # Cold-start vs warm-start latency measurement are mutually exclusive.
    group_latency = parser.add_mutually_exclusive_group()
    group_latency.add_argument(
        "--cold-start-latency",
        "--cold_start_latency",
        action="store_true",
        help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
    )
    group_latency.add_argument(
        "--warm-start-latency",
        "--warm_start_latency",
        action="store_true",
        help="Run model(s) twice and preseve caches in between to enable a 'warm start' on the 2nd run",
    )

    group_fuser = parser.add_mutually_exclusive_group()
    # --nvfuser is now the default, keep the option to not break scripts
    group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
    group_fuser.add_argument("--nnc", action="store_true", help="enable NNC for GPUs")

    # Precision flags are mutually exclusive; --amp-dtype refines --amp.
    group_prec = parser.add_mutually_exclusive_group()
    group_prec.add_argument("--float16", action="store_true", help="cast model to fp16")
    group_prec.add_argument(
        "--bfloat16", action="store_true", help="cast model to bf16"
    )
    group_prec.add_argument("--float32", action="store_true", help="cast model to fp32")
    group_prec.add_argument(
        "--amp", action="store_true", help="use automatic mixed precision"
    )
    parser.add_argument(
        "--amp-dtype",
        choices=("bfloat16", "float16"),
        help="the data type used with automatic mixed precision",
    )
    group_printout = parser.add_mutually_exclusive_group()
    group_printout.add_argument(
        "--verbose", "-v", action="store_true", help="enable verbose debug printouts"
    )
    group_printout.add_argument(
        "--quiet", "-q", action="store_true", help="suppress debug printouts"
    )

    # --- Experiment / backend selection (mutually exclusive) ---
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--coverage", action="store_true", help="(default) " + help(coverage_experiment)
    )
    group.add_argument(
        "--overhead", action="store_true", help=help(overhead_experiment)
    )
    group.add_argument(
        "--speedup-dynamo-ts",
        action="store_true",
        help="TorchDynamo frontend with torchscript backend",
    )
    group.add_argument(
        "--speedup-fx2trt", action="store_true", help=help(speedup_experiment_fx2trt)
    )
    group.add_argument(
        "--speedup-fx2trt-fp16",
        action="store_true",
        help=help(speedup_experiment_fx2trt),
    )
    group.add_argument(
        "--print-fx",
        action="store_true",
        help="Print fx traces captured from model",
    )
    group.add_argument(
        "--print-aten-ops",
        action="store_true",
        help="Print traces of aten ops captured by AOT autograd",
    )
    group.add_argument(
        "--inductor",
        action="store_true",
        help="Measure speedup with TorchInductor",
    )
    group.add_argument(
        "--quantization",
        choices=[
            "int8dynamic",
            "int8weightonly",
            "int4weightonly",
            "autoquant",
            "noquant",
        ],
        default=None,
        help="Measure speedup of torchao quantization with TorchInductor baseline",
    )
    group.add_argument(
        "--export",
        action="store_true",
        help="Measure pass rate with export",
    )
    group.add_argument(
        "--export-aot-inductor",
        action="store_true",
        help="Measure pass rate with Export+AOTInductor",
    )
    group.add_argument(
        "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch"
    )
    group.add_argument(
        "--torchscript-onnx",
        "--torchscript_onnx",
        action="store_true",
        help="Measure speedup with TorchScript ONNX, i.e. `torch.onnx.export`",
    )
    group.add_argument(
        "--torch-onnx-patch",
        "--torch_onnx_patch",
        action="store_true",
        help="Measure speedup with dynamo ONNX patch, i.e. `torch_onnx`",
    )
    group.add_argument(
        "--dynamo-onnx",
        "--dynamo_onnx",
        action="store_true",
        help="Measure speedup with Dynamo ONNX, i.e. `torch.onnx.dynamo_export`",
    )
    group.add_argument(
        "--dynamo-onnx-aot-inline",
        "--dynamo_onnx_aot_inline",
        action="store_true",
        help="Measure speedup with Dynamo ONNX AOT Inline, i.e. `torch.onnx.dynamo_export`",
    )
    group.add_argument(
        "--dynamo-onnx-aot-optimize",
        "--dynamo_onnx_aot_optimize",
        action="store_true",
        help="Measure speedup with Dynamo ONNX w/ ort fusions, i.e. `torch.onnx.dynamo_export`",
    )
    group.add_argument(
        "--backend",
        choices=torch._dynamo.list_backends(exclude_tags=None),
        help="measure speedup with a given backend",
    )
    group.add_argument("--nothing", action="store_true", help=help(null_experiment))
    group.add_argument(
        "--log-conv-args",
        action="store_true",
        help="Dump convolution input/weight/bias's shape/stride/dtype and other options to json",
    )
    group.add_argument(
        "--recompile-profiler",
        "--recompile_profiler",
        action="store_true",
        help="Run the dynamo recompilation profiler on each model.",
    )
    group.add_argument(
        "--find-batch-sizes",
        action="store_true",
        help="finds the largest batch size that could fit on GPUs",
    )

    # --- Required mode flags ---
    mode_group = parser.add_mutually_exclusive_group(required=True)
    mode_group.add_argument(
        "--accuracy",
        action="store_true",
        help="Checks accuracy with small batch size and eval mode",
    )
    mode_group.add_argument(
        "--performance", action="store_true", help="Measures performance speedup"
    )
    mode_group.add_argument(
        "--tolerance",
        action="store_true",
        help="extracts the tolerance for each model with small batch size and eval mode",
    )
    run_mode_group = parser.add_mutually_exclusive_group(required=True)
    run_mode_group.add_argument(
        "--training",
        action="store_true",
        help="Performs training",
    )
    run_mode_group.add_argument(
        "--inference", action="store_true", help="Performs inference"
    )
    return parser.parse_args(args)
4009
def process_entry(rank, runner, original_dir, args):
    """Per-process entry point used by both the single-process path and mp.spawn.

    Records this worker's rank on ``args``, optionally brings up the
    distributed process group, and runs the benchmark loop inside it.
    """
    args.rank = rank
    dist_ctx = maybe_init_distributed(
        args.init_distributed,
        rank=rank,
        world_size=args.world_size,
        port=args.distributed_master_port,
    )
    with dist_ctx:
        return run(runner, args, original_dir)
4020
def maybe_fresh_cache(args):
    """Return a context manager giving a fresh inductor cache when requested.

    A fresh cache is used for cold/warm-start latency runs and CI, but an
    explicitly assigned TORCHINDUCTOR_CACHE_DIR always takes precedence.
    """
    if "TORCHINDUCTOR_CACHE_DIR" in os.environ:
        # User pinned a cache dir explicitly; never override it.
        return contextlib.nullcontext()
    wants_fresh = args.cold_start_latency or args.warm_start_latency or args.ci
    return fresh_inductor_cache() if wants_fresh else contextlib.nullcontext()
4030
def main(runner, original_dir=None, args=None):
    """Top-level driver: parse args and dispatch to the right execution path.

    Args:
        runner: the suite-specific BenchmarkRunner instance.
        original_dir: if given, chdir back to it first (dynamo may have
            changed the working directory).
        args: optional argv list forwarded to parse_args.

    Three execution paths: mp.spawn one process per CUDA device
    (--only + --multiprocess), two sequential subprocess runs
    (--only + --warm-start-latency), or a single in-process run.
    """
    if original_dir:
        os.chdir(original_dir)
    args = parse_args() if not args else parse_args(args)
    if args.baseline:
        args.baseline = os.path.abspath(args.baseline)

    if should_diff_branch(args):
        # GitPython is only needed for --diff-branch runs, hence the lazy import.
        import git

        # We do this here so we error out earlier if there's an issue
        repo = git.Repo()
        if repo.is_dirty():
            raise RuntimeError(
                "--diff-branch called on dirty branch. Commit, stash, or reset."
            )
        main_branch = repo.active_branch.name
        if main_branch == args.diff_branch:
            raise RuntimeError(
                f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
            )

    with maybe_fresh_cache(args):
        args.init_distributed = args.only and args.multiprocess
        if args.init_distributed:
            # NB: Do NOT query device count before CUDA initialization; we're
            # going to overwrite CUDA_VISIBLE_DEVICES and this will result in
            # https://github.com/pytorch/pytorch/issues/107300
            device_count = torch.cuda.device_count()
            if device_count <= 1:
                log.warning(
                    "The use multiprocess flag is set but there are <= 1 devices available."
                )
            # multiprocess path
            args.world_size = device_count
            mp.spawn(
                process_entry, args=(runner, original_dir, args), nprocs=device_count
            )
        elif args.only and args.warm_start_latency:
            # Warm start mode. Enable FX graph caching and perform back-to-back runs in
            # separate processes (but ensure the inductor cache is preserved across runs).
            env = os.environ.copy()
            env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
            # Re-invoke this script with the same argv, minus the warm-start flag,
            # so the child does not recurse into this branch.
            cmd = [sys.executable] + sys.argv
            cmd.remove("--warm-start-latency")

            print(f"Performing cold-start run for {args.only}")
            warmup_cmd = cmd + ["--repeat=1", "--disable-output"]
            subprocess.check_call(warmup_cmd, timeout=args.timeout, env=env)

            print(f"Performing warm-start run for {args.only}")
            subprocess.check_call(cmd, timeout=args.timeout, env=env)
        else:
            # single process path just uses the main process
            args.world_size = 1
            process_entry(0, runner, original_dir, args)
4088
def write_csv_when_exception(args, name: str, status: str, device=None):
    """Write a placeholder CSV row (batch size 0) for a model that failed.

    Args:
        args: parsed CLI namespace; selects accuracy vs performance columns.
        name: model name to record.
        status: failure status string, also printed to stdout.
        device: restrict the row to one device; default is all ``args.devices``.
    """
    print(status)
    batch_size_placeholder = 0
    target_devices = args.devices if device is None else [device]
    if args.accuracy:
        headers = ["dev", "name", "batch_size", "accuracy"]
        tail = [status]
    elif args.performance:
        headers = ["dev", "name", "batch_size", "speedup", "abs_latency"]
        tail = [0.0, 0.0]
    else:
        # Fallback shape for other modes (e.g. --tolerance): no header row.
        headers = []
        tail = [0.0]

    for dev in target_devices:
        output_csv(output_filename, headers, [dev, name, batch_size_placeholder] + tail)
4106
4107def run(runner, args, original_dir=None):4108# Pass the parsed args object to benchmark runner object4109runner.args = args4110
4111args.filter = args.filter or [r"."]4112args.exclude = args.exclude or [r"^$"]4113args.exclude_exact = args.exclude_exact or []4114
4115if args.inductor:4116assert args.backend is None4117args.backend = "inductor"4118if args.quantization:4119assert args.backend is None4120args.backend = "torchao"4121if args.dynamic_batch_only:4122args.dynamic_shapes = True4123torch._dynamo.config.assume_static_by_default = True4124if args.dynamic_shapes:4125if not args.dynamic_batch_only:4126torch._dynamo.config.assume_static_by_default = False4127if args.propagate_real_tensors:4128# TODO: Separate flag for data dependent4129torch._dynamo.config.capture_scalar_outputs = True4130torch._dynamo.config.capture_dynamic_output_shape_ops = True4131torch._functorch.config.fake_tensor_propagate_real_tensors = True4132if args.specialize_int:4133torch._dynamo.config.specialize_int = True4134if args.ci:4135if args.accuracy:4136# Run fewer iterations when checking accuracy4137args.repeat = min(args.repeat, 2)4138
4139# Set translation validation on by default on CI accuracy runs.4140torch.fx.experimental._config.translation_validation = True4141
4142ci = functools.partial(4143CI, args.backend, training=args.training, dynamic=args.dynamic_shapes4144)4145if args.ddp:4146assert args.training, "DDP benchmark requires --training mode"4147torch._dynamo.config.optimize_ddp = args.optimize_ddp_mode4148if args.only == "dlrm":4149log.error(4150"DLRM+DDP is unsupported as it requires sharding the embedding layer separately from DDP"4151)4152return sys.exit(-1)4153if args.accuracy:4154# Use small batch size. We use >1 batch size to ensure we test4155# batch_norm type of operators that work on batch dims.4156# TODO - Go through the failures for batch size = 24157if args.batch_size is None:4158if runner.suite_name == "huggingface":4159args.batch_size = 14160elif runner.suite_name == "torchbench":4161args.batch_size = 44162else:4163# Larger batch size of TIMM models to have stable batch_norm4164assert runner.suite_name == "timm_models"4165args.batch_size = 84166
4167# Remove sources of randomness4168if runner.suite_name not in ("timm_models", "huggingface"):4169# TODO - Using train mode for timm_models and HF models. Move to train mode for Torchbench as well.4170args.use_eval_mode = True4171inductor_config.fallback_random = True4172if args.only is not None and args.only not in {4173"alexnet",4174"Background_Matting",4175"pytorch_CycleGAN_and_pix2pix",4176"pytorch_unet",4177"Super_SloMo",4178"vgg16",4179# https://github.com/pytorch/pytorch/issues/967244180"Wav2Vec2ForCTC",4181"Wav2Vec2ForPreTraining",4182"sam",4183"sam_fast",4184"resnet50_quantized_qat",4185"mobilenet_v2_quantized_qat",4186}:4187# some of the models do not support use_deterministic_algorithms4188torch.use_deterministic_algorithms(True)4189os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"4190torch.backends.cudnn.deterministic = True4191torch.backends.cudnn.allow_tf32 = False4192torch.backends.cudnn.benchmark = False4193torch.backends.cuda.matmul.allow_tf32 = False4194
4195torch.backends.mkldnn.deterministic = True4196
4197# Remove randomeness when torch manual seed is called4198patch_torch_manual_seed()4199
4200# Some models e.g. yolov3 assert batch size on n_gpus4201if "CUDA_VISIBLE_DEVICES" not in os.environ and not args.multiprocess:4202args.device_index = "0"4203
4204# Stricter check to disable fallbacks4205args.suppress_errors = False4206
4207if args.device_index is not None:4208if args.multiprocess:4209print("Cannot specify both --device_index and --multiprocess")4210return sys.exit(-1)4211os.environ["CUDA_VISIBLE_DEVICES"] = args.device_index4212
4213elif args.performance:4214# Ensure that we test on real scenarios4215args.use_eval_mode = False4216
4217if args.partition_id > args.total_partitions or args.partition_id < 0:4218print("Invalid partition id")4219return sys.exit(-1)4220
4221if not args.devices:4222if torch.cuda.is_available():4223args.devices = ["cuda"]4224else:4225log.warning("torch.cuda.is_available() == False, using CPU")4226args.devices = ["cpu"]4227
4228if args.devices != ["cpu"] and (HAS_CUDA or HAS_XPU):4229global synchronize4230synchronize = torch.cuda.synchronize if HAS_CUDA else torch.xpu.synchronize4231
4232if (4233args.devices == ["cuda"]4234and torch.cuda.get_device_properties(0).total_memory < 25 * 2**304235):4236# OOM errors on an RTX 3090 with 24gb RAM4237runner.skip_models.update(4238{4239# torchbench4240"hf_Longformer",4241"timm_nfnet",4242"timm_efficientdet",4243}4244)4245if args.training:4246runner.skip_models.add("hf_T5")4247
4248if args.nnc:4249torch._C._jit_override_can_fuse_on_cpu(True)4250torch._C._jit_override_can_fuse_on_gpu(True)4251torch._C._jit_set_texpr_fuser_enabled(True)4252torch._C._jit_set_nvfuser_enabled(False)4253
4254if args.threads:4255torch.set_num_threads(args.threads)4256
4257if args.verbose:4258torch._logging.set_logs(dynamo=logging.DEBUG)4259
4260if args.print_graph_breaks:4261torch._logging.set_logs(graph_breaks=True)4262
4263if args.quiet:4264torch._logging.set_logs(dynamo=logging.ERROR)4265
4266torch._dynamo.config.suppress_errors = args.suppress_errors4267
4268if args.training:4269runner.model_iter_fn = runner.forward_and_backward_pass4270runner.skip_models.update(runner.skip_not_suitable_for_training_models)4271else:4272runner.model_iter_fn = runner.forward_pass4273
4274if args.fast:4275runner.skip_models.update(runner.slow_models)4276
4277if args.devices == ["cpu"]:4278runner.skip_models.update(runner.very_slow_models)4279runner.skip_models.update(runner.skip_models_for_cpu)4280elif args.devices == ["cuda"]:4281runner.skip_models.update(runner.skip_models_for_cuda)4282
4283if not args.multiprocess:4284runner.skip_models.update(runner.skip_multiprocess_models)4285
4286if args.freezing:4287runner.skip_models.update(runner.skip_models_for_freezing)4288
4289if args.no_skip:4290runner.skip_models.clear()4291
4292experiment = null_experiment4293global current_name, current_device, current_batch_size, output_filename, disable_output, optimize_ctx, current_onnx_compiler4294optimize_ctx = contextlib.nullcontext()4295
4296if args.disable_output:4297disable_output = True4298
4299if args.overhead:4300optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython)4301experiment = speedup_experiment4302output_filename = "overheads.csv"4303elif args.inductor:4304inductor_config.debug = args.verbose4305if args.threads:4306inductor_config.cpp.threads = args.threads4307
4308optimize_ctx = functools.partial(4309torch.compile,4310backend="inductor",4311fullgraph=args.nopython,4312mode=args.inductor_compile_mode,4313)4314experiment = speedup_experiment4315output_filename = "inductor.csv"4316elif args.export:4317optimize_ctx = export4318experiment = speedup_experiment4319output_filename = "export.csv"4320elif args.xla:4321(dev,) = args.devices4322os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev]4323torch._dynamo.mark_dynamic = MagicMock()4324experiment = xla4325output_filename = "xla.csv"4326elif args.torchscript_onnx:4327optimize_ctx = functools.partial(4328optimize_onnx_ctx,4329args.output_directory or ".",4330OnnxModelFromTorchScript,4331copy_before_export=args.performance, # Accuarcy bench already did deepcopy4332)4333experiment = speedup_experiment_onnx4334output_filename = "torchscript_onnx.csv"4335current_onnx_compiler = "torchscript"4336elif args.torch_onnx_patch:4337optimize_ctx = functools.partial(4338optimize_onnx_ctx,4339args.output_directory or ".",4340OnnxModelFromTorchScript,4341copy_before_export=args.performance,4342use_experimental_patch=True,4343)4344experiment = speedup_experiment_onnx4345output_filename = "torch_onnx_patch.csv"4346current_onnx_compiler = "dynamo"4347elif args.dynamo_onnx:4348optimize_ctx = functools.partial(4349optimize_onnx_ctx,4350args.output_directory or ".",4351OnnxModelFromDynamo,4352dynamic_shapes=args.dynamic_shapes,4353copy_before_export=args.performance,4354)4355experiment = speedup_experiment_onnx4356output_filename = "dynamo_onnx.csv"4357current_onnx_compiler = "dynamo"4358elif args.dynamo_onnx_aot_inline:4359optimize_ctx = functools.partial(4360optimize_onnx_ctx,4361args.output_directory or ".",4362OnnxModelFromDynamoAotInline,4363dynamic_shapes=args.dynamic_shapes,4364copy_before_export=args.performance,4365)4366experiment = speedup_experiment_onnx4367output_filename = "dynamo_onnx_aot_inline.csv"4368current_onnx_compiler = "dynamo"4369elif 
args.dynamo_onnx_aot_optimize:4370optimize_ctx = functools.partial(4371optimize_onnx_ctx,4372args.output_directory or ".",4373OnnxModelFromDynamoAotOptimize,4374dynamic_shapes=args.dynamic_shapes,4375copy_before_export=args.performance,4376)4377experiment = speedup_experiment_onnx4378output_filename = "dynamo_onnx_aot_optimize.csv"4379current_onnx_compiler = "dynamo"4380elif args.speedup_dynamo_ts:4381optimize_ctx = torch._dynamo.optimize("ts", nopython=args.nopython)4382experiment = speedup_experiment4383output_filename = "speedup_dynamo_ts.csv"4384elif args.prims_nvfuser:4385optimize_ctx = torch._dynamo.optimize("prims_nvfuser", nopython=args.nopython)4386experiment = speedup_experiment4387backend_str = "prims_nvfuser"4388output_filename = f"accuracy_aot_{backend_str}.csv"4389elif args.print_fx:4390optimize_ctx = torch._dynamo.optimize(4391print_fx,4392nopython=args.nopython,4393)4394elif args.print_aten_ops:4395optimize_ctx = torch._dynamo.optimize(4396print_aten_ops,4397nopython=args.nopython,4398)4399elif args.nothing:4400optimize_ctx = nothing4401experiment = speedup_experiment4402output_filename = "nothing.csv"4403elif args.backend or args.export_aot_inductor:4404if args.export_aot_inductor:4405assert not args.training, "AOTInductor only supports inference"4406optimize_ctx = functools.partial(4407export_aot_inductor, device=args.devices[0]4408)4409
4410# AOTInductor doesn't support control flow yet4411runner.skip_models.update(runner.skip_models_due_to_control_flow)4412elif args.backend == "torchao":4413assert "cuda" in args.devices, "Quantization requires CUDA device."4414assert args.bfloat16, "Quantization requires dtype bfloat16."4415try:4416from torchao_backend import setup_baseline, torchao_optimize_ctx4417except ImportError:4418try:4419from .torchao_backend import setup_baseline, torchao_optimize_ctx4420except ImportError:4421from userbenchmark.dynamo.dynamobench.torchao_backend import (4422setup_baseline,4423torchao_optimize_ctx,4424)4425
4426setup_baseline()4427baseline_ctx = functools.partial(4428torch.compile,4429backend="inductor",4430fullgraph=args.nopython,4431mode=args.inductor_compile_mode,4432)4433model_iter_fn = baseline_ctx(runner.model_iter_fn)4434
4435# needed to avoid error that causes inconsistent timing due to:4436# Unable to hit fast path of CUDAGraphs because of pending, uninvoked backwards4437def model_iter_fn_and_mark_step(*args, **kwargs):4438torch.compiler.cudagraph_mark_step_begin()4439model_iter_fn(*args, **kwargs)4440
4441runner.model_iter_fn = model_iter_fn_and_mark_step4442optimize_ctx = torchao_optimize_ctx(args.quantization)4443else:4444optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)4445experiment = (4446speedup_experiment if not args.backend == "torchao" else latency_experiment4447)4448if args.accuracy:4449output_filename = f"accuracy_{args.backend}.csv"4450elif args.tolerance:4451output_filename = f"tolerance_{args.backend}.csv"4452else:4453output_filename = f"speedup_{args.backend}.csv"4454elif args.recompile_profiler:4455output_filename = "recompile_profiler_log.csv"4456experiment = recompile_profiler_experiment4457else:4458optimize_ctx = torch._dynamo.optimize(4459fx_insert_profiling, nopython=args.nopython4460)4461experiment = coverage_experiment4462output_filename = "coverage.csv"4463
4464if args.inductor or args.backend == "inductor" or args.export_aot_inductor:4465inductor_config.triton.cudagraphs = not args.disable_cudagraphs4466inductor_config.triton.persistent_reductions = (4467not args.disable_persistent_reductions4468)4469inductor_config.split_reductions = not args.disable_split_reductions4470inductor_config.triton.divisible_by_16 = not args.disable_divisible_by_164471if args.inference:4472inductor_config.freezing = args.freezing4473if args.inductor_config:4474for config in args.inductor_config:4475key, value = config.split("=")4476typ = type(inductor_config.__getattr__(key))4477if issubclass(typ, bool):4478assert value in ("0", "1", "True", "False")4479value = value in ("1", "True")4480elif issubclass(typ, (str, int, float)):4481value = typ(value)4482else:4483raise NotImplementedError(typ)4484inductor_config.__setattr__(key, value)4485
4486runner.setup_amp()4487
4488if args.output:4489output_filename = args.output4490
4491if output_filename:4492if args.output_directory:4493output_filename = os.path.join(args.output_directory, output_filename)4494else:4495output_filename = os.path.join(4496torch._dynamo.config.base_dir, output_filename4497)4498
4499if args.find_batch_sizes and args.only:4500for device in args.devices:4501batch_size = runner.batch_size_finder(device, args.only)4502print(args.only, batch_size)4503output_csv(output_filename, [], [args.only, batch_size])4504return4505
4506if args.export_profiler_trace:4507if args.profiler_trace_name is None:4508if args.backend:4509args.profiler_trace_name = args.backend4510elif args.inductor:4511args.profiler_trace_name = "inductor"4512else:4513args.profiler_trace_name = "profile"4514else:4515args.profiler_trace_name = args.profiler_trace_name4516
4517if args.no_translation_validation:4518# Overwrite 'translation_validation' config, if specified.4519torch.fx.experimental._config.translation_validation = False4520
4521experiment = functools.partial(experiment, args, runner.model_iter_fn)4522
4523if args.only and should_diff_branch(args):4524import git4525
4526repo = git.Repo()4527main_branch = repo.active_branch.name4528try:4529# Adding diff-branch again to the args will override previous value4530call_args = (4531[sys.executable] + sys.argv + [f"--diff-branch={diff_branch_default}"]4532)4533# Run for main branch4534subprocess.check_call(call_args + [f"--tag={main_branch}"])4535# Run for comparison branch4536repo.git.checkout(args.diff_branch)4537subprocess.check_call(call_args + [f"--tag={args.diff_branch}"])4538finally:4539# Go back to main branch4540repo.git.checkout(main_branch)4541elif args.only:4542model_name = args.only4543for device in args.devices:4544batch_size = args.batch_size4545if args.batch_size_file:4546batch_size = read_batch_size_from_file(4547args, args.batch_size_file, model_name4548)4549if model_specified_by_path(args.only):4550model, example_inputs = load_model_from_path(args.only)4551name = model.__class__.__name__4552model = model.to(device=device)4553example_inputs = tree_map_only(4554torch.Tensor, lambda x: x.to(device=device), example_inputs4555)4556else:4557name = model_name4558try:4559with tqdm(desc="loading model"):4560extra_args = []4561if hasattr(args, "rank") and hasattr(args, "world_size"):4562extra_args += [4563"--rank",4564str(args.rank),4565"--world_size",4566str(args.world_size),4567]4568
4569if args.part:4570(4571device,4572name,4573model,4574example_inputs,4575batch_size,4576) = runner.load_model(4577device,4578model_name,4579batch_size=batch_size,4580part=args.part,4581extra_args=extra_args,4582)4583else:4584if args.fsdp:4585# Always load model on cpu for fsdp4586# When initializing FSDP, we will use the cuda device if args.cuda is set4587(4588_,4589name,4590model,4591example_inputs,4592batch_size,4593) = runner.load_model(4594"cpu",4595model_name,4596batch_size=batch_size,4597extra_args=extra_args,4598)4599else:4600(4601device,4602name,4603model,4604example_inputs,4605batch_size,4606) = runner.load_model(4607device,4608model_name,4609batch_size=batch_size,4610extra_args=extra_args,4611)4612except Exception as e:4613import traceback4614
4615mode = "train" if args.training else "eval"4616print(f"{device:4} {mode:5} {name:34} ")4617print(traceback.format_exc())4618status = (4619"model_fail_to_load"4620if isinstance(e, NotImplementedError)4621else "eager_fail_to_run"4622)4623write_csv_when_exception(args, name, status, device)4624continue # bad benchmark implementation4625
4626if args.trace_on_xla:4627xla_dev = xm.xla_device()4628model = model.to(device=xla_dev)4629example_inputs = tree_map_only(4630torch.Tensor, lambda x: x.to(device=xla_dev), example_inputs4631)4632
4633current_name = name4634current_device = device4635current_batch_size = batch_size4636set_model_name(name)4637
4638# Look for stuff that looks like batch size, and mark it dynamic.4639# Better integration would integrate directly with benchmark suite4640# but cannot conveniently do this4641# NB: This must be done late enough so that we don't do more4642# conversions on the inputs4643# NB: Assumes only the first batch-y like dimension is the batch4644marked = False4645
4646def detect_and_mark_batch(t):4647nonlocal marked4648for i, s in enumerate(t.size()):4649if s == batch_size:4650torch._dynamo.mark_dynamic(t, i)4651marked = True4652break4653
4654if (4655args.dynamic_batch_only4656and batch_size > 14657and model_name not in CI_SKIP_DYNAMIC_BATCH_ONLY4658):4659tree_map_only(torch.Tensor, detect_and_mark_batch, example_inputs)4660assert marked, f"nothing in example_inputs had a dim with {batch_size}"4661
4662if args.log_operator_inputs:4663log_operator_inputs(4664model, example_inputs, runner.model_iter_fn, name, args4665)4666continue4667
4668if args.per_process_memory_fraction != 1:4669torch.cuda.set_per_process_memory_fraction(4670args.per_process_memory_fraction4671)4672if model_name in DO_NOT_CAST_INPUTS:4673model, _ = runner.cast_based_on_args(model, example_inputs)4674
4675else:4676model, example_inputs = runner.cast_based_on_args(model, example_inputs)4677runner.setup_amp(current_device)4678guard_ctx = contextlib.nullcontext()4679if name in runner.guard_on_nn_module_models:4680guard_ctx = torch._dynamo.config.patch(guard_nn_modules=True)4681
4682inline_ctx = contextlib.nullcontext()4683if name in runner.inline_inbuilt_nn_modules_models:4684inline_ctx = torch._dynamo.config.patch(inline_inbuilt_nn_modules=True)4685
4686with guard_ctx:4687with inline_ctx:4688runner.run_one_model(4689name,4690model,4691example_inputs,4692optimize_ctx,4693experiment,4694explain=args.explain,4695tag=args.tag,4696)4697if args.generate_aot_autograd_stats:4698stats_file = output_filename.split(".csv")[0] + "_stats.csv"4699output_csv(4700stats_file,4701("dev", "name", "batch_size", "total_aot_graphs", "ok_aot_graphs"),4702[4703current_device,4704current_name,4705current_batch_size,4706*Stats.aot_summary(),4707],4708)4709else:4710metrics.purge_old_log_files()4711if output_filename and os.path.exists(output_filename):4712os.unlink(output_filename)4713if original_dir:4714os.chdir(original_dir)4715model_names = list(runner.iter_model_names(args))4716nmodels = len(model_names)4717for i, name in enumerate(model_names):4718current_name = name4719if args.progress:4720print(f"Running model {i+1}/{nmodels}", flush=True)4721
4722try:4723timeout = args.timeout4724if should_diff_branch(args):4725timeout *= 24726env = os.environ.copy()4727if args.ci and name in CI_PRESERVE_COMPILE_DEBUG:4728env["TORCH_COMPILE_DEBUG"] = "1"4729subprocess.check_call(4730[sys.executable] + sys.argv + [f"--only={name}"],4731timeout=timeout,4732env=env,4733)4734except subprocess.TimeoutExpired:4735write_csv_when_exception(args, name, "timeout")4736except subprocess.CalledProcessError as e:4737print("Run failed with return code: ", e.returncode, file=sys.stderr)4738print("Output: ", e.output, file=sys.stderr)4739print("Error: ", e.stderr, file=sys.stderr)4740print_summary(output_filename, print_dataframe=args.print_dataframe_summary)4741
4742
def log_operator_inputs(model, example_inputs, model_iter_fn, name, args):
    """Record the operator-level inputs observed while running one model iteration.

    The trace is written to ``<dirname(args.output)>/<name>_<mode>.txt``; if that
    file already exists the run is skipped entirely.  Capture is first attempted
    under fake tensors (cheap, no real compute); on failure it falls back to a
    real-tensor run, and re-raises if that also fails.
    """
    if args.training:
        mode = "training"
    else:
        mode = "eval"
    output = os.path.join(os.path.dirname(args.output), f"{name}_{mode}.txt")

    # TODO - add option for coalescing inputs over multiple runs
    if os.path.exists(output):
        print(f"Skipping {name}, {output} already exists")
        return

    print(f"Running {name}")
    # OperatorInputsMode lives in the benchmark tree; resolve it whether we are
    # imported as a package or executed from the directory directly.
    try:
        from .microbenchmarks.operator_inp_utils import OperatorInputsMode
    except ImportError:
        from microbenchmarks.operator_inp_utils import OperatorInputsMode

    fake_tensor_mode = FakeTensorMode()
    operator_mode = OperatorInputsMode()

    # Deep-copy under FakeCopyMode so the copies carry fake tensors and the
    # fake run cannot mutate the real model/inputs.
    with torch._subclasses.fake_tensor.FakeCopyMode(fake_tensor_mode):
        model_fake = copy.deepcopy(model)
        example_inputs_fake = copy.deepcopy(example_inputs)

    try:
        with fake_tensor_mode, operator_mode:
            model_iter_fn(model_fake, example_inputs_fake, collect_outputs=False)
    except Exception as e:
        print(f"{name} failed to run with fake tensors, trying real. Exception: {e}")
        # Start from a fresh recorder so the failed fake run leaves no residue.
        operator_mode = OperatorInputsMode()
        try:
            with operator_mode:
                model_iter_fn(model, example_inputs, collect_outputs=False)
        except Exception as e2:
            print(f"{name} failed to run with real. Exception: {e2}")
            raise

    print(f"Writing output to {output}")
    operator_mode.log_to_file(output)
4780
4781if __name__ == "__main__":4782raise RuntimeError(4783f"You shouldn't run {sys.argv[0]} directly, instead try timm_model.py, torchbench.py or huggingface.py"4784)4785