# pytorch — 195 lines · 6.0 KB (file-viewer metadata, not part of the module)
1# mypy: ignore-errors
2
3import contextlib4import time5import os6import json7
8import torch9from torch.profiler import profile, ProfilerActivity10
11
12def synchronize():13pass14
15
def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_runs=1,
                      devices=None, kwargs_for_f=None, kwargs_for_profiler=None):
    """
    Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
    [num_runs] times to [trace_filename].

    [activities] are the activities that the profiler will record, e.g. ProfilerActivity.CUDA.
    Return total runtime without the profiler

    Outputs to trace_filename

    Args:
        f: callable to benchmark; invoked as ``f(input, **kwargs_for_f)``.
        input: first positional argument passed to ``f`` on every run.
        trace_filename: path the chrome trace JSON is exported to.
        optimize_ctx: context manager entered around both the timed runs and
            the profiled runs.
        activities: profiler activities, e.g. ``[ProfilerActivity.CUDA]``.
        num_runs: number of timed/profiled runs (warmup runs excluded).
        devices: list of device strings; defaults to ``["cuda"]``.
        kwargs_for_f: extra keyword arguments forwarded to ``f``.
        kwargs_for_profiler: extra keyword arguments forwarded to ``profile``.

    Returns:
        float: wall-clock seconds for the [num_runs] unprofiled runs.
    """

    if devices is None:
        devices = ["cuda"]

    # Rebind the module-level no-op so subsequent synchronize() calls actually
    # wait for the GPU; left as a no-op for CPU-only benchmarking.
    global synchronize
    if devices != ["cpu"] and torch.cuda.is_available():
        synchronize = torch.cuda.synchronize

    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

    with optimize_ctx:
        # Fixed seed so the warmup, timed, and profiled runs see identical inputs.
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
        synchronize()
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
        # Wait for queued GPU work before reading the clock.
        synchronize()
        t1 = time.perf_counter()
        timing = t1 - t0

    # Second pass: identical workload, this time under the profiler.
    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            synchronize()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
            synchronize()
    prof.export_chrome_trace(trace_filename)

    return timing
64
def get_chrome_trace_events(filename):
    """Load a chrome trace JSON file and return its "traceEvents" list.

    Args:
        filename: path to a chrome trace file, e.g. one written by
            ``dump_chrome_trace``.

    Returns:
        list: the events stored under the trace's "traceEvents" key.
    """
    # Context manager closes the handle even if json.load raises; the
    # original opened the file and never closed it.
    with open(filename) as f:
        data = json.load(f)
    return data["traceEvents"]
71
def is_gpu_compute_event(event, pids=None):
    """Return whether *event* is a complete ("X"-phase) event from a GPU process.

    Args:
        event: one entry of a chrome trace's "traceEvents" list.
        pids: process ids to treat as GPU processes.  Defaults to the
            module-level ``gpu_pids`` list populated by ``compute_utilization``,
            preserving the original behavior; passing it explicitly avoids the
            hidden global dependency.

    Returns:
        bool: True iff the event carries a GPU pid and phase "X".
    """
    if pids is None:
        pids = gpu_pids
    return "pid" in event and event["pid"] in pids and "ph" in event and event["ph"] == "X"
76
77def get_sorted_gpu_events(events):78sorted_gpu_events = []79for event in events:80if not is_gpu_compute_event(event):81continue82sorted_gpu_events.append(event)83return sorted(sorted_gpu_events, key=lambda x: x["ts"])84
85
86def get_duration(sorted_gpu_events):87if len(sorted_gpu_events) == 0:88return 089event = sorted_gpu_events[0]90current_end_time = event["ts"] + event["dur"]91total_duration = event["dur"]92for event in sorted_gpu_events[1:]:93start_time = max(event["ts"], current_end_time)94end_time = event["ts"] + event["dur"]95total_duration = total_duration + max(end_time - start_time, 0)96current_end_time = max(current_end_time, end_time)97return total_duration98
99
def get_sorted_gpu_mm_conv_events(events):
    """Return the GPU events among *events* that look like matmul/convolution
    kernels, sorted by start time."""
    keywords = ("gemm", "conv", "cutlass", "wgrad")

    def is_mm_conv_event(event):
        # Matmul/conv kernels are identified by substrings of the kernel name.
        return "name" in event and any(word in event["name"] for word in keywords)

    return [event for event in get_sorted_gpu_events(events) if is_mm_conv_event(event)]
112
# Process ids labelled "GPU" in the current chrome trace; written by
# compute_utilization() and read by is_gpu_compute_event().
gpu_pids = []
115
def compute_utilization(filename: str, total_length: float):
    """
    Process the chrome traces outputs by the pytorch profiler to compute GPU Utilization
    and percent of times spent on matmul and convolution

    Args:
        filename(str): Name of chrome traces file produced by pytorch profiler

        total_length(float): total length of the process without profiler in second

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)
    """
    events = get_chrome_trace_events(filename)

    # get pids of GPU events
    # NOTE(review): relies on the profiler emitting 'process_labels' metadata
    # events whose args["labels"] contains "GPU" — specific to the pytorch
    # profiler's chrome-trace output format.
    global gpu_pids
    gpu_pids = []
    for event in events:
        if "name" not in event:
            continue
        if event["name"] == 'process_labels' and "GPU" in event["args"]["labels"]:
            gpu_pids.append(event["pid"])

    # Convert seconds -> microseconds to match chrome-trace "ts"/"dur" units.
    total_length = total_length * 1e6
    sorted_gpu_events = get_sorted_gpu_events(events)
    utilization = get_duration(sorted_gpu_events) / total_length

    sorted_gpu_mm_conv_events = get_sorted_gpu_mm_conv_events(events)
    mm_conv_utilization = get_duration(sorted_gpu_mm_conv_events) / total_length

    return utilization, mm_conv_utilization
149
def benchmark_utilization(f, input, trace_folder, optimize_ctx=None, trace_file_name="tmp_chrome_trace", num_runs=1):
    """
    Benchmark the GPU Utilization and percent of time spent on matmul and convolution operations of
    running f(input, **kwargs_for_f) with [optimize_ctx] [num_runs] times.
    It will produce a chrome trace file in trace_folder/trace_file_name.json

    Example:

    ```
    def f(a):
        return a.sum()
    a = torch.rand(2**20, device="cuda")
    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark

        input: input to :attr:`f`

        trace_folder: name of the folder to store the chrome trace

        optimize_ctx: the context in which f will run

        trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"

        num_runs: number of times to run f, excluding the warm-up runs, default to 1.

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)

    """
    if not os.path.exists(trace_folder):
        print("create folder " + trace_folder)
    # exist_ok=True avoids the create-after-check race of the original
    # `if not os.path.exists(...): os.makedirs(...)` guard.
    os.makedirs(trace_folder, exist_ok=True)

    if optimize_ctx is None:
        optimize_ctx = contextlib.nullcontext()

    chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
    # dump_chrome_trace expects `devices` to be a list of device strings; the
    # original passed the bare string "cuda", which only satisfied the
    # `devices != ["cpu"]` check by accident.
    total_length = dump_chrome_trace(f, input, chrome_trace_file_name, optimize_ctx,
                                     [ProfilerActivity.CUDA], num_runs=num_runs, devices=["cuda"])
    utilization, mm_conv_utilization = compute_utilization(chrome_trace_file_name, total_length)

    return utilization, mm_conv_utilization