pytorch
310 lines · 10.6 KB
1import contextlib2import json3import os4import time5
6import numpy as np7
8import torch9
10from . import tensor_engine11
12
class Benchmark:
    """Base class for tensorexpr benchmarks.

    A concrete benchmark subclasses this, builds its input tensors, and
    implements at least ``forward``, ``config``, ``module``,
    ``memory_workload`` and ``default_configs``.  ``run`` then times the
    computation and reports the result through ``dump_result``.

    NOTE(review): subclasses are also expected to set ``self.inputs`` and
    ``self.jit_mode`` (e.g. "trace") — neither is defined in this base
    class; confirm against the concrete benchmarks.
    """

    def __init__(self, mode, device, dtype):
        # mode selects what is timed: "fwd" = forward pass only,
        # "both" = forward + backward; anything else is rejected below.
        self.mode = mode
        # When True, check() compares compute() against reference().
        self.deterministic = False
        self.device = device
        self.dtype = dtype
        # "stdout" (human-readable line) or "json"; see dump_result().
        self.output_type = "stdout"
        self.print_ir = False
        self.print_kernel = False
        if mode == "both":
            self.requires_grad = True
        elif mode == "fwd":
            self.requires_grad = False
        else:
            raise ValueError(f"invalid mode: {mode}")
        # Lazily-created gradient seed for backward; see run_impl().
        self.result_grad = None
        # Tensors created with requires_grad=True; passed to engine.backward.
        self.grad_variables = []
        self.engine = tensor_engine.get_engine()
        self.engine.reset(device)

        # forward all member functions in self.engine to self
        for method in dir(self.engine):
            if not callable(getattr(self.engine, method)):
                continue
            # don't forward if this function is overridden here
            if hasattr(self, method):
                continue
            # don't forward if it is an internal function
            if method.startswith("_"):
                continue
            method_engine = getattr(self.engine, method)
            setattr(self, method, method_engine)

    def forward(self):
        """do one step worth of computation"""
        raise ValueError("this method should be reimplemented by subclass")

    def check(self):
        """Compare compute() against the subclass-provided reference().

        Only runs when ``self.deterministic`` is set.  ``self.numpy`` is
        forwarded from the engine (see __init__); the absolute tolerance
        is deliberately loose (1e-2).
        """
        if not self.deterministic:
            return
        np.testing.assert_allclose(
            self.reference(), self.numpy(self.compute()), atol=1e-2
        )

    def config(self):
        """returns an array for the current benchmark configs"""
        raise ValueError("this method should be reimplemented by subclass")

    def desc(self):
        """return the description of the current benchmark"""
        config = self.config()
        config_str = "_".join([str(x) for x in config])
        device = self.device
        # Append NNC_NUM_THREADS so runs with different thread counts
        # produce distinct descriptions.
        if "NNC_NUM_THREADS" in os.environ:
            num_threads_str = os.environ["NNC_NUM_THREADS"]
            device += num_threads_str
        return f"{self.engine.mode}: {self.module()}_{self.mode}_{device}_{config_str}"

    @staticmethod
    def module():
        """Return the benchmark's module name; reimplemented by subclass."""
        raise ValueError("this method should be reimplemented by subclass")

    def memory_workload(self):
        """Return a dict with "sol" and "algorithmic" element counts
        (see run_impl); reimplemented by subclass."""
        raise ValueError("this method should be reimplemented by subclass")

    def compute_workload(self):
        """return the number of scalar operations it takes to finish the tensor op"""
        return None

    @staticmethod
    def input_iterable():
        """A benchmark child class should return true if it utilizes the input iter arg"""
        return False

    def dtype_to_bytes(self):
        """Return the element size in bytes of ``self.dtype``."""
        return torch.tensor(0, dtype=self.dtype).element_size()

    @staticmethod
    def default_configs():
        """return a list of default configs for this benchmark"""
        raise ValueError("this method should be reimplemented by subclass")

    def is_supported(self):
        """Whether this benchmark can run in the current setup."""
        return True

    def rand(self, shape, device=None, dtype=None, requires_grad=False):
        """Create a random tensor via the engine; tensors with
        ``requires_grad=True`` are tracked for the backward pass."""
        v = self.engine.rand(
            shape, device=device, dtype=dtype, requires_grad=requires_grad
        )
        if requires_grad:
            self.grad_variables.append(v)
        return v

    def nchw_rand(self, shape, device=None, requires_grad=False):
        """Like rand(), but asks the engine for an NCHW-layout tensor."""
        v = self.engine.nchw_rand(shape, device=device, requires_grad=requires_grad)
        if requires_grad:
            self.grad_variables.append(v)
        return v

    def compute(self):
        """Run one iteration: the traced JIT module when available,
        otherwise the plain forward()."""
        if self.bm_jit:
            return self.bm_jit(*self.inputs)
        else:
            return self.forward(*self.inputs)

    def run(self, args):
        """Configure the requested CUDA fuser ("old", "te", "nvf", or
        none) and delegate the actual timing to run_impl()."""
        self.print_ir = args.print_ir
        if args.cuda_fuser == "old":
            torch._C._jit_override_can_fuse_on_gpu(True)
            if args.print_kernel:
                os.environ["PYTORCH_FUSION_DEBUG"] = "1"
            return self.run_impl(True)
        elif args.cuda_fuser == "te":
            torch._C._jit_set_texpr_fuser_enabled(True)
            # Temporarily override the TE CUDA pointwise codegen knobs.
            with cuda_pointwise_context(
                args.cuda_pointwise_loop_levels,
                args.cuda_pointwise_block_count,
                args.cuda_pointwise_block_size,
            ):
                return self.run_impl(True)
        elif args.cuda_fuser == "nvf":
            torch._C._jit_set_nvfuser_enabled(True)
            torch._C._jit_set_profiling_executor(True)
            torch._C._jit_set_profiling_mode(True)
            torch._C._jit_override_can_fuse_on_cpu(False)
            torch._C._jit_override_can_fuse_on_gpu(False)
            torch._C._jit_set_bailout_depth(20)
            if args.print_kernel:
                os.environ["PYTORCH_CUDA_FUSER_DEBUG"] = "1"
            return self.run_impl(True)
        else:
            return self.run_impl(False)

    def run_impl(self, use_fuser):
        """Time the benchmark and report throughput.

        Runs ``warmups`` untimed iterations (the timer starts at
        i == warmups, after a CUDA sync), then ``iters`` timed
        iterations.  Iteration 0 optionally JIT-traces forward() and
        checks correctness; iteration 1 optionally prints the fusion IR.
        """
        warmups = 10
        if self.device == "cuda":
            iters = 1000
        else:
            iters = 10
        engine = tensor_engine.get_engine()

        self.bm_jit = None
        for i in range(warmups + iters):
            if i == warmups:
                # Sync before starting the clock so warmup kernels are
                # not attributed to the timed region.
                if self.device == "cuda":
                    engine.sync_cuda()
                time_start = time.time()

            if i == 0:
                # NOTE(review): self.jit_mode is expected to be set by
                # the subclass; confirm it is always defined.
                if self.jit_mode == "trace" and use_fuser:
                    self.bm_jit = torch.jit.trace(
                        self.forward, example_inputs=self.inputs, check_trace=False
                    )
                if callable(getattr(self, "reference", None)):
                    self.check()
                else:
                    print("Warning: no reference result for ", self.module())
            elif i == 1:
                # The fusion graph is visible after the first iter is executed
                if self.jit_mode == "trace" and use_fuser and self.print_ir:
                    print(self.bm_jit.graph_for(*self.inputs))
            z = self.compute()
            if self.mode == "both":
                if self.result_grad is None:
                    self.result_grad = engine.rand_like(z)
                engine.backward([z], [self.result_grad], self.grad_variables)

        # Sync again so the duration covers all launched CUDA work.
        if self.device == "cuda":
            engine.sync_cuda()

        duration = time.time() - time_start
        iter_time = duration / iters
        memory_workload = self.memory_workload()
        compute_workload = self.compute_workload()

        # "sol"/"algorithmic" are element counts; multiplying by the
        # dtype size and dividing by time yields GB/s.
        result_dict = {
            "desc": self.desc(),
            "us": iter_time * 1e6,
            "sol": memory_workload["sol"] * self.dtype_to_bytes() / iter_time / 1e9,
            "algorithmic": memory_workload["algorithmic"]
            * self.dtype_to_bytes()
            / iter_time
            / 1e9,
        }
        if compute_workload:
            result_dict["compute_workload"] = compute_workload / iter_time / 1e9
        self.dump_result(result_dict)

    def dump_result(self, result_dict):
        """Emit the result either as a JSON line or a formatted
        human-readable line, depending on ``self.output_type``."""
        if self.output_type == "json":
            print(json.dumps(result_dict))
        elif self.output_type == "stdout":
            msg = "{}: {:.2f} us, SOL {:.2f} GB/s, algorithmic {:.2f} GB/s".format(
                result_dict["desc"],
                result_dict["us"],
                result_dict["sol"],
                result_dict["algorithmic"],
            )
            if "compute_workload" in result_dict:
                msg += f", compute {result_dict['compute_workload']:.2f} Gops/s"
            print(msg)
        else:
            raise Exception("Unknown output_type " + self.output_type)  # noqa: TRY002
217
@contextlib.contextmanager
def cuda_pointwise_context(loop_levels, block_count, block_size):
    """Temporarily override the TE CUDA pointwise codegen knobs.

    Each truthy argument is applied for the duration of the ``with``
    block and restored to its previous value on exit; falsy arguments
    leave the corresponding knob untouched.
    """
    # (restore_fn, previous_value) pairs, replayed in order on exit.
    saved = []
    if loop_levels:
        saved.append(
            (
                torch._C._jit_set_te_cuda_pointwise_loop_levels,
                torch._C._jit_get_te_cuda_pointwise_loop_levels(),
            )
        )
        torch._C._jit_set_te_cuda_pointwise_loop_levels(loop_levels)
    if block_count:
        saved.append(
            (
                torch._C._jit_set_te_cuda_pointwise_block_count,
                torch._C._jit_get_te_cuda_pointwise_block_count(),
            )
        )
        torch._C._jit_set_te_cuda_pointwise_block_count(block_count)
    if block_size:
        saved.append(
            (
                torch._C._jit_set_te_cuda_pointwise_block_size,
                torch._C._jit_get_te_cuda_pointwise_block_size(),
            )
        )
        torch._C._jit_set_te_cuda_pointwise_block_size(block_size)

    try:
        yield
    finally:
        for restore, previous in saved:
            restore(previous)
240
# Auxiliary class to facilitate dynamic input shape
class DynamicShape:
    r"""
    An auxiliary mixin class for dynamic shape benchmarks

    Pre-computes inputs with random shapes and also
    modifies the compute method so in each call the
    fuser sees a different input tensor shape
    """

    # Number of random inputs in an instance
    SAMPLE_SIZE = 100

    def __init__(self, dynamic_range=1.2):
        # Pre-generated input test cases; each entry is one `inputs` value.
        self._input_samples = []
        self._input_sample_index = 0
        # Normalized to the lower bound of the random resize ratio, a value
        # in (0, 1]: rand_shape() scales each dim by a factor drawn from
        # [_dynamic_range, 1.0).
        self._dynamic_range = (
            1.0 / dynamic_range if dynamic_range > 1.0 else dynamic_range
        )
        self._enable_dynamic_shapes = True

    # Returns the input test case that current index points to
    @property
    def inputs(self):
        return self._input_samples[self._input_sample_index]

    # An inputs assignment actually adds a test case in the class buffer
    @inputs.setter
    def inputs(self, val):
        self._input_samples.append(val)

    # Runs normal compute while incrementing the test case index
    def compute(self):
        # BUG FIX: propagate the computed result.  The previous version
        # discarded super().compute()'s return value, so callers that use
        # it (e.g. the backward pass in Benchmark.run_impl) received None.
        result = super().compute()
        self._input_sample_index = (self._input_sample_index + 1) % self.SAMPLE_SIZE
        return result

    # Defined by benchmark, the benchmark needs to specify the input
    # tensor construction in this method, essentially the same way
    # a benchmark creates the inputs list in the initializer
    def instantiate_input(self):
        raise NotImplementedError

    # Instantiate random shaped inputs and start the benchmark run
    def run(self, args):
        # force disable dynamic shape from command line
        if args.no_dynamic_shape:
            self._enable_dynamic_shapes = False
        self.load_inputs()
        super().run(args)

    # pre-compute inputs so the creations of random tensors
    # do not add to the compute time
    def load_inputs(self):
        # Only SAMPLE_SIZE - 1 samples are generated here — presumably one
        # sample is already added by the subclass initializer; TODO confirm.
        for _ in range(self.SAMPLE_SIZE - 1):
            self.instantiate_input()

    # returns a randomized shape
    def rand_shape(self, shape):
        if not self._enable_dynamic_shapes:
            return shape
        ratios = np.random.uniform(self._dynamic_range, 1.0, len(shape))
        dyn_shape = list(np.multiply(shape, ratios).astype(int))
        return dyn_shape
305
# Global registry of benchmark classes, populated via register_benchmark_class().
benchmark_classes = []
308
def register_benchmark_class(benchmark_cls):
    """Record *benchmark_cls* in the module-level ``benchmark_classes`` registry."""
    benchmark_classes.extend((benchmark_cls,))