18
from dataclasses import dataclass, field
19
from typing import List, Optional
20
from unittest.mock import patch
27
import torch.utils.data
28
from torch._C._profiler import _ExperimentalConfig, _ExtraFields_PyCall
29
from torch.autograd.profiler import KinetoStepTracker, profile as _profile
30
from torch.autograd.profiler_legacy import profile as _profile_legacy
31
from torch.profiler import (
41
from torch.profiler._pattern_matcher import (
42
Conv2dBiasFollowedByBatchNorm2dPattern,
44
ForLoopIndexingPattern,
46
GradNotSetToNonePattern,
47
MatMulDimInFP16Pattern,
49
OptimizerSingleTensorPattern,
51
report_all_anti_patterns,
52
SynchronizedDataLoaderPattern,
54
from torch.testing._internal.common_cuda import TEST_MULTIGPU
55
from torch.testing._internal.common_device_type import skipCUDAVersionIn
56
from torch.testing._internal.common_utils import (
57
instantiate_parametrized_tests,
66
TemporaryDirectoryName,
84
tqdm.tqdm.monitor_interval = 0
92
except ModuleNotFoundError:
97
@unittest.skipIf(not HAS_PSUTIL, "Requires psutil to run")
98
@unittest.skipIf(TEST_WITH_ASAN, "Cannot test with ASAN")
99
@unittest.skipIf(IS_WINDOWS, "Test is flaky on Windows")
100
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
101
class TestProfilerCUDA(TestCase):
102
@skipCUDAVersionIn([(11, 5)])
103
def test_mem_leak(self):
104
"""Checks that there's no memory leak when using profiler with CUDA"""
105
t = torch.rand(1, 1).cuda()
107
last_rss = collections.deque(maxlen=5)
108
for outer_idx in range(10):
109
with _profile(use_cuda=True):
110
for _ in range(1024):
114
torch.cuda.empty_cache()
115
last_rss.append(p.memory_info().rss)
120
last_rss[idx] > last_rss[idx - 1] for idx in range(1, len(last_rss))
123
for idx in range(1, len(last_rss)):
124
max_diff = max(max_diff, last_rss[idx] - last_rss[idx - 1])
126
not (is_increasing and max_diff > 100 * 1024),
127
msg=f"memory usage is increasing, {str(last_rss)}",
130
def test_custom_module_input_op_ids(self):
131
class MyFunc(torch.autograd.Function):
134
ctx.save_for_backward(x)
138
def backward(ctx, gO):
139
(x,) = ctx.saved_tensors
142
def custom_layer(input_ten):
    """Invoke the custom autograd Function as if it were a plain layer."""
    result = MyFunc.apply(input_ten)
    return result
147
with torch.autograd.profiler.emit_nvtx(record_shapes=True) as prof:
148
x = torch.randn(10, 10, requires_grad=True)
149
y = torch.randn(10, 10, requires_grad=True)
155
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
156
def test_cudagraph_profiling_workaround(self):
162
p = subprocess.check_call(
169
from torch.profiler import ProfilerActivity, profile
171
def add_one(in_: torch.Tensor):
174
sample_arg = torch.zeros(10, device="cuda").requires_grad_(True)
176
# add this before cuda graphs are created
177
torch.profiler._utils._init_for_cuda_graphs()
179
add_one_graphed = torch.cuda.graphs.make_graphed_callables(add_one, sample_args=(sample_arg,))
180
zeros = torch.zeros(10, device="cuda")
181
out = add_one_graphed(zeros)
184
with profile(activities=[ProfilerActivity.CPU]):
185
add_one_graphed(zeros)
187
with profile(activities=[ProfilerActivity.CUDA]):
188
add_one_graphed(zeros)
191
universal_newlines=True,
198
@unittest.skipIf(not torch.profiler.itt.is_available(), "ITT is required")
199
class TestProfilerITT(TestCase):
200
def test_custom_module_input_op_ids(self):
201
class MyFunc(torch.autograd.Function):
204
ctx.save_for_backward(x)
208
def backward(ctx, gO):
209
(x,) = ctx.saved_tensors
212
def custom_layer(input_ten):
    """Invoke the custom autograd Function as if it were a plain layer."""
    result = MyFunc.apply(input_ten)
    return result
217
with torch.autograd.profiler.emit_itt(record_shapes=True) as prof:
218
x = torch.randn(10, 10, requires_grad=True)
219
y = torch.randn(10, 10, requires_grad=True)
226
@instantiate_parametrized_tests
227
class TestProfiler(TestCase):
229
TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite."
231
def test_source(self):
232
"""Checks that source code attribution works for eager, TS and autograd mode"""
234
prev_opt = torch._C._get_graph_executor_optimize()
235
torch._C._set_graph_executor_optimize(False)
238
def ts_method_2(x, y):
    """Matrix-multiply the two inputs.

    Kept as a separate function so it appears as its own frame in the
    profiler's source attribution (see the "ts_method_2" stack check below).
    """
    product = torch.matmul(x, y)
    return product
242
def ts_method_1(x, y, z):
244
w = ts_method_2(x, y) + a
247
class DummyModule(nn.Module):
248
def __init__(self) -> None:
250
self.conv = torch.nn.Conv2d(
251
3, 2, kernel_size=1, stride=2, padding=3, bias=False
254
def forward(self, x):
264
use_kineto=kineto_available(),
265
experimental_config=_ExperimentalConfig(verbose=True),
267
x = torch.randn(10, 10, requires_grad=True)
268
y = torch.randn(10, 10, requires_grad=True)
270
w = ts_method_1(x, y, z)
273
a = torch.randn(2, 3, 2, 2, requires_grad=True)
278
for e in p.function_events:
279
if "aten::add" in e.name or "AddBackward" in e.name:
280
self.assertTrue(any("test_profiler" in entry for entry in e.stack))
284
"test_source" in entry
285
or "ts_method_1" in entry
286
or "ts_method_2" in entry
293
if kineto_available() and not IS_WINDOWS:
294
with TemporaryFileName(mode="w+") as fname:
295
p.export_chrome_trace(fname)
296
with open(fname) as f:
297
events = json.load(f)["traceEvents"]
299
def extract(pattern: str):
300
matches = [e for e in events if re.search(pattern, e["name"])]
302
len(matches), 1, repr([e["name"] for e in matches])
306
module_event = extract(r"DummyModule_0")
307
wrapper_event = extract(r"call_module")
309
module_event["args"]["Python parent id"],
310
wrapper_event["args"]["Python id"],
313
torch._C._set_graph_executor_optimize(prev_opt)
318
"basic": ((False, False),),
319
"multiple_preexisting": ((False, False),) * 2,
320
"open_in_scope": ((True, False),),
321
"close_in_scope": ((False, True),),
336
name_fn=lambda name, thread_spec: name,
339
@parametrize("work_in_main_thread", [True, False])
340
def test_source_multithreaded(self, name, thread_spec, work_in_main_thread):
341
"""Test various threading configurations.
343
`thread_spec` is a Tuple[Tuple[bool, bool], ...] where each pair is a
344
thread. The first bool indicates if the thread should be started under
345
the profiler context and the second is if it should be joined under the
350
num_threads = len(thread_spec) + 1
351
start_barrier = threading.Barrier(num_threads, timeout=timeout)
352
end_barrier = threading.Barrier(num_threads, timeout=timeout)
354
class Task(threading.Thread):
355
def __init__(self) -> None:
356
self._end_gate = threading.Event()
357
super().__init__(daemon=True)
359
self.finished = False
362
self._run(self._end_gate)
368
def _run(end_gate=None):
369
def known_preexisting_function():
374
known_preexisting_function()
376
model = torch.nn.Sequential(
377
torch.nn.Linear(10, 10),
381
def invoked_during_run():
386
_ = model(torch.rand(4, 10))
389
if end_gate is not None:
390
end_gate.wait(timeout=timeout)
394
def add_threads(context: bool):
395
for idx, (start_under_profiler, _) in enumerate(thread_spec):
396
if start_under_profiler == context:
397
assert idx not in threads
398
threads[idx] = Task()
400
def join_threads(context: bool):
401
for idx, (_, end_under_profiler) in enumerate(thread_spec):
402
if end_under_profiler == context:
403
threads[idx].release()
405
for idx, (_, end_under_profiler) in enumerate(thread_spec):
407
if end_under_profiler == context:
408
t.join(timeout=timeout)
412
with torch.profiler.profile(with_stack=True) as prof:
418
if work_in_main_thread:
432
start_barrier.abort()
434
for t in threads.values():
437
for t in threads.values():
438
t.join(timeout=timeout)
440
for t in threads.values():
441
self.assertFalse(t.is_alive())
443
roots = prof.profiler.kineto_results.experimental_event_tree()
446
for node in _utils.traverse_dfs(roots)
447
if isinstance(node.extra_fields, _ExtraFields_PyCall)
449
tid_counts = collections.Counter([node.start_tid for node in nodes])
452
not start_under_profiler for start_under_profiler, _ in thread_spec
454
expected_threads = prior_threads + 1
456
len(tid_counts), expected_threads, f"{expected_threads}, {tid_counts}"
458
self.assertEqual(len(nodes), sum(tid_counts.values()))
462
self.assertFalse(no_tid in tid_counts)
464
worker_threads = prior_threads + (1 if work_in_main_thread else 0)
466
observed_preexisting = [
469
if "known_preexisting_function" in node.name
471
self.assertEqual(len(observed_preexisting), worker_threads)
472
self.assertEqual(len(observed_preexisting), len(set(observed_preexisting)))
474
observed_during_run = [
475
node.start_tid for node in nodes if "invoked_during_run" in node.name
477
self.assertEqual(len(observed_during_run), worker_threads)
478
self.assertEqual(len(observed_during_run), len(set(observed_during_run)))
480
def payload(self, use_cuda=False):
481
x = torch.randn(10, 10)
484
y = torch.randn(10, 10)
492
def _check_stats(self, profiler_stats):
493
self.assertGreater(profiler_stats.profiling_window_duration_sec, 0)
494
self.assertGreater(profiler_stats.number_of_events, 0)
495
self.assertGreater(profiler_stats.profiler_prepare_call_duration_us, 0)
496
self.assertGreater(profiler_stats.profiler_enable_call_duration_us, 0)
497
self.assertGreater(profiler_stats.profiler_disable_call_duration_us, 0)
498
self.assertGreater(profiler_stats.parse_kineto_call_duration_us, 0)
500
profiler_stats.function_events_build_tree_call_duration_us, 0
503
@unittest.skipIf(not kineto_available(), "Kineto is required")
504
def test_kineto(self):
505
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
506
with _profile(use_cuda=use_cuda, use_kineto=True):
507
self.payload(use_cuda=use_cuda)
510
with _profile(use_cuda=use_cuda, use_kineto=True) as p:
511
self.payload(use_cuda=use_cuda)
513
self.assertTrue("aten::mm" in str(p))
515
output = p.key_averages().table(
516
sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total",
523
for e in p.function_events:
524
if "aten::mm" in e.name:
526
if "gemm" in e.name.lower() or "Cijk" in e.name:
528
if "memcpy" in e.name.lower():
531
self.assertTrue(found_gemm)
532
self.assertTrue(found_memcpy)
534
self.assertTrue(found_mm)
535
self._check_stats(p._stats)
538
@unittest.skipIf(not kineto_available(), "Kineto is required")
539
@unittest.skipIf(not TEST_MULTIGPU, "Multiple GPUs needed")
540
@unittest.skipIf(TEST_WITH_ROCM, "Not supported on ROCm")
541
def test_kineto_multigpu(self):
542
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
543
for gpu_id in [0, 1]:
544
x = torch.randn(10, 10).cuda(gpu_id)
545
y = torch.randn(10, 10).cuda(gpu_id)
551
for evt in prof.events():
552
if "gemm" in evt.name.lower() and evt.device_type == DeviceType.CUDA:
553
if evt.device_index == 0:
555
elif evt.device_index == 1:
557
if "cuda" in evt.name.lower() and evt.device_type == DeviceType.CPU:
560
self.assertTrue(found_gemm_0)
561
self.assertTrue(found_gemm_1)
562
self.assertTrue(found_cuda)
563
self._check_stats(prof._stats())
565
def test_memory_profiler(self):
566
def run_profiler(tensor_creation_fn):
571
use_kineto=kineto_available(),
574
with record_function("test_user_scope_alloc"):
575
x = tensor_creation_fn()
576
with record_function("test_user_scope_dealloc"):
578
return prof.key_averages(group_by_input_shape=True)
580
def check_metrics(stats, metric, allocs=None, deallocs=None):
584
stat_metrics[stat.key] = getattr(stat, metric)
586
if allocs is not None:
587
for alloc_fn in allocs:
588
self.assertTrue(alloc_fn in stat_metrics)
590
stat_metrics[alloc_fn], 0, f"alloc_fn = {alloc_fn}"
592
if deallocs is not None:
593
for dealloc_fn in deallocs:
594
self.assertTrue(dealloc_fn in stat_metrics)
596
stat_metrics[dealloc_fn], 0, f"alloc_fn = {dealloc_fn}"
599
def create_cpu_tensor():
    """Allocate a fresh 10x10 random tensor on the CPU."""
    return torch.rand(size=(10, 10))
602
def create_cuda_tensor():
    """Allocate a 10x10 random tensor and move it to the default CUDA device."""
    host_tensor = torch.rand(10, 10)
    return host_tensor.cuda()
605
def create_mkldnn_tensor():
    """Allocate a 10x10 fp32 random tensor and convert it to the MKL-DNN layout."""
    dense = torch.rand(10, 10, dtype=torch.float32)
    return dense.to_mkldnn()
608
stats = run_profiler(create_cpu_tensor)
615
"test_user_scope_alloc",
618
"test_user_scope_dealloc",
622
if kineto_available():
623
with TemporaryFileName(mode="w+") as fname:
624
with profile(profile_memory=True) as prof:
626
with record_function("test_user_scope_alloc"):
627
x = create_cpu_tensor()
628
with record_function("test_user_scope_dealloc"):
630
prof.export_chrome_trace(fname)
631
with open(fname) as f:
633
assert "traceEvents" in trace
634
events = trace["traceEvents"]
635
found_memory_events = False
638
if evt["name"] == "[memory]":
639
found_memory_events = True
641
assert "Addr" in evt["args"]
642
assert "Device Type" in evt["args"]
643
assert "Device Id" in evt["args"]
644
assert "Bytes" in evt["args"]
647
assert "dur" not in evt["args"]
648
assert "cat" not in evt["args"]
649
assert found_memory_events
651
if torch.cuda.is_available():
653
stats = run_profiler(create_cuda_tensor)
656
"device_memory_usage",
658
"test_user_scope_alloc",
660
"aten::empty_strided",
663
"test_user_scope_dealloc",
675
if torch.backends.mkldnn.is_available():
676
create_mkldnn_tensor()
677
stats = run_profiler(create_mkldnn_tensor)
682
"test_user_scope_alloc",
688
"test_user_scope_dealloc",
693
with _profile(profile_memory=True, use_kineto=kineto_available()) as prof:
694
x = torch.rand(10, 10)
696
if torch.cuda.is_available():
697
y = torch.rand(10, 10).cuda()
700
stats = prof.key_averages(group_by_input_shape=True)
704
allocs=["aten::rand", "aten::empty"],
705
deallocs=["[memory]"],
707
if torch.cuda.is_available():
708
check_metrics(stats, "device_memory_usage", deallocs=["[memory]"])
711
IS_JETSON, "Jetson has a guard against OOM since host and gpu memory are shared"
713
def test_oom_tracing(self):
714
def run_profiler(tensor_creation_fn):
715
with _profile(profile_memory=True, record_shapes=True) as prof:
716
with self.assertRaisesRegex(RuntimeError, ".*[tT]ried to allocate.*"):
717
x = tensor_creation_fn()
720
def create_cuda_tensor_oom():
721
device = torch.device("cuda:0")
723
1024, 1024, 1024, 1024, dtype=torch.float32, device=device
726
def check_trace(fname):
727
prof.export_chrome_trace(fname)
728
with open(fname) as f:
730
self.assertTrue("traceEvents" in trace)
731
events = trace["traceEvents"]
732
found_out_of_memory_events = False
734
self.assertTrue("name" in evt)
735
if evt["name"] == "[OutOfMemory]":
736
found_out_of_memory_events = True
737
self.assertTrue("args" in evt)
738
self.assertTrue("Device Type" in evt["args"])
739
self.assertTrue("Device Id" in evt["args"])
740
self.assertTrue("Bytes" in evt["args"])
743
self.assertTrue("dur" not in evt["args"])
744
self.assertTrue("cat" not in evt["args"])
745
self.assertTrue(found_out_of_memory_events)
747
if torch.cuda.is_available():
748
with TemporaryFileName(mode="w+") as fname:
749
prof = run_profiler(create_cuda_tensor_oom)
752
@unittest.skipIf(not kineto_available(), "Kineto is required")
753
def test_module_hierarchy(self):
755
def my_new_method(self, x):
758
def forward_impl_(self, x, y):
    """Route ``x`` through my_new_method, then add the second operand."""
    intermediate = self.my_new_method(x)
    return intermediate + y
761
def forward(self, x, y):
763
return self.forward_impl_(x, y)
766
def forward(self, x):
770
def __init__(self) -> None:
776
return self.B0.forward(x)
778
def forward(self, x, y):
779
return self.A0.forward(x, y) + self.call_b(x)
782
model = torch.jit.script(model)
783
input_a = torch.rand(128, 128)
784
input_b = torch.rand(128, 128)
785
op_to_module_hierarchy = {}
786
op_to_module_hierarchy["aten::sub"] = ["TOP(C)::forward.A0(A)::forward."]
787
op_to_module_hierarchy["aten::mul"] = [
788
"TOP(C)::forward.A0(A)::forward.SELF(A)::forward_impl_.SELF(A)::my_new_method."
790
op_to_module_hierarchy["aten::add"] = [
791
"TOP(C)::forward.A0(A)::forward.SELF(A)::forward_impl_.",
792
"TOP(C)::forward.SELF(C)::call_b.B0(B)::forward.",
795
with TemporaryFileName(mode="w+") as fname:
797
activities=[torch.profiler.ProfilerActivity.CPU],
800
model(input_a, input_b)
801
prof.export_chrome_trace(fname)
802
with open(fname) as f:
804
assert "traceEvents" in trace
805
events = trace["traceEvents"]
806
found_memory_events = False
810
op_name = evt["name"]
811
if "Module Hierarchy" in evt["args"]:
812
hierarchy = evt["args"]["Module Hierarchy"]
813
if op_name in op_to_module_hierarchy:
814
assert hierarchy in op_to_module_hierarchy[op_name]
816
def test_high_level_trace(self):
817
"""Checks that python side high level events are recorded."""
819
class RepeatedDataset(torch.utils.data.Dataset):
820
def __init__(self, N, D_in, D_out):
822
self.x = torch.randn(N, D_in)
823
self.y = torch.randn(N, D_out)
828
def __getitem__(self, idx):
    """Return the same (x, y) pair for every index (the dataset "repeats")."""
    return (self.x, self.y)
831
class TwoLayerNet(torch.nn.Module):
832
def __init__(self, D_in, H, D_out):
834
self.linear1 = torch.nn.Linear(D_in, H)
835
self.linear2 = torch.nn.Linear(H, D_out)
837
def forward(self, x):
838
h_relu = self.linear1(x).clamp(min=0)
839
y_pred = self.linear2(h_relu)
842
class CustomSGD(torch.optim.SGD):
    """SGD subclass used to check that the profiler reports the derived
    optimizer's name (e.g. ``Optimizer.step#CustomSGD.step``)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
847
for _, data in enumerate(dataloader):
848
x, y = data[0], data[1]
850
loss = criterion(y_pred, y)
851
optimizer.zero_grad()
855
N, D_in, H, D_out = 8, 10, 5, 2
856
model = TwoLayerNet(D_in, H, D_out)
857
criterion = torch.nn.MSELoss(reduction="sum")
858
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
859
ds = RepeatedDataset(N, D_in, D_out)
860
dataloader = torch.utils.data.DataLoader(ds, batch_size=1)
865
self.assertTrue(False, "Expected no exception without profiling.")
869
optimizer_duplicate = torch.optim.SGD(model.parameters(), lr=1e-4)
870
dataloader_duplicate = torch.utils.data.DataLoader(ds, batch_size=1)
872
def judge(expected_event_count, prof):
873
actual_event_count = {}
874
for e in prof.function_events:
877
if key in expected_event_count.keys():
878
actual_event_count[key] = (
879
actual_event_count.setdefault(key, 0) + 1
881
for key, count in expected_event_count.items():
883
(key in actual_event_count.keys())
884
and (count == actual_event_count[key])
887
with _profile(use_kineto=kineto_available()) as prof:
889
expected_event_count = {
891
"enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__": (N + 1),
892
"Optimizer.step#SGD.step": N,
893
"Optimizer.zero_grad#SGD.zero_grad": N,
895
judge(expected_event_count, prof)
898
optimizer = pickle.loads(pickle.dumps(optimizer))
899
with _profile(use_kineto=kineto_available()) as prof:
901
judge(expected_event_count, prof)
904
optimizer = CustomSGD(model.parameters(), lr=1e-4)
905
with _profile(use_kineto=kineto_available()) as prof:
907
expected_event_count = {
908
"enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__": (N + 1),
909
"Optimizer.step#CustomSGD.step": N,
910
"Optimizer.zero_grad#CustomSGD.zero_grad": N,
912
judge(expected_event_count, prof)
914
def test_flops(self):
915
model = torch.nn.Sequential(
916
nn.Conv2d(16, 33, 18),
921
inputs = torch.randn(40, 16, 18, 260)
922
nested_tensor = torch.nested.nested_tensor(
923
[torch.randn((2, 5)), torch.randn((3, 5))], layout=torch.jagged
926
record_shapes=True, with_flops=True, use_kineto=kineto_available()
930
nested_tensor = nested_tensor + nested_tensor
931
profiler_output = prof.key_averages(group_by_input_shape=True).table(
932
sort_by="cpu_time_total", row_limit=10
934
self.assertIn("Total MFLOPs", profiler_output)
935
if not (kineto_available() and torch.cuda.is_available()):
940
torch.profiler.ProfilerActivity.CPU,
941
torch.profiler.ProfilerActivity.CUDA,
945
) as kineto_profiler:
947
profiler_output = kineto_profiler.key_averages().table(
948
sort_by="self_cuda_time_total", row_limit=-1
950
self.assertIn("Total MFLOPs", profiler_output)
952
def test_kineto_profiler_api(self):
955
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
956
with profile(activities=supported_activities()):
957
self.payload(use_cuda=use_cuda)
959
def trace_handler(p):
960
output = p.key_averages().table(
961
sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total",
968
initial_step = KinetoStepTracker.current_step()
971
activities=supported_activities(),
972
schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
973
on_trace_ready=trace_handler,
976
self.payload(use_cuda=use_cuda)
979
self.assertEqual(called_num[0], 2)
980
self.assertEqual(KinetoStepTracker.current_step(), initial_step + 8)
983
with profile(activities=supported_activities()) as p:
984
self.payload(use_cuda=use_cuda)
985
self.payload(use_cuda=use_cuda)
986
output = p.key_averages().table(
987
sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total",
992
test_schedule = torch.profiler.schedule(
993
skip_first=2, wait=1, warmup=1, active=2, repeat=2
995
test_schedule_expected_outputs = [
999
ProfilerAction.WARMUP,
1000
ProfilerAction.RECORD,
1001
ProfilerAction.RECORD_AND_SAVE,
1002
ProfilerAction.NONE,
1003
ProfilerAction.WARMUP,
1004
ProfilerAction.RECORD,
1005
ProfilerAction.RECORD_AND_SAVE,
1006
ProfilerAction.NONE,
1007
ProfilerAction.NONE,
1008
ProfilerAction.NONE,
1009
ProfilerAction.NONE,
1011
for step in range(len(test_schedule_expected_outputs)):
1012
self.assertEqual(test_schedule(step), test_schedule_expected_outputs[step])
1014
def test_kineto_profiler_multiple_steppers(self):
1016
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
1018
opt = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
1020
inputs = torch.rand(10)
1022
with profile(activities=supported_activities()):
1023
self.payload(use_cuda=use_cuda)
1025
def optimizer_step():
    """Stand-in for an optimizer ``step()`` hook: bumps the global Kineto
    step counter under its own requester name."""
    KinetoStepTracker.increment_step("yet_another_step")
1029
initial_step = KinetoStepTracker.current_step()
1033
loss = torch.nn.functional.cross_entropy(out, torch.rand(2))
1041
for idx in range(niters):
1045
activities=supported_activities(),
1046
schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
1048
for idx in range(niters):
1052
self.assertEqual(KinetoStepTracker.current_step(), initial_step + 2 * niters)
1054
def test_export_stacks(self):
1057
use_kineto=kineto_available(),
1058
experimental_config=_ExperimentalConfig(verbose=True),
1060
x = torch.randn(10, 10)
1061
y = torch.randn(10, 10)
1065
with TemporaryFileName(mode="w+") as fname:
1066
p.export_stacks(fname)
1067
with open(fname) as f:
1068
lines = f.readlines()
1069
assert len(lines) > 0, "Empty stacks file"
1073
assert int(line.split(" ")[-1]) > 0, "Invalid stacks record"
1077
assert is_int, "Invalid stacks record"
1079
@unittest.skipIf(not kineto_available(), "Kineto is required")
1080
def test_tensorboard_trace_handler(self):
1081
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
1082
with _profile(use_cuda=use_cuda, use_kineto=True):
1083
self.payload(use_cuda=use_cuda)
1085
with TemporaryDirectoryName() as dname:
1087
activities=[torch.profiler.ProfilerActivity.CPU]
1088
+ ([torch.profiler.ProfilerActivity.CUDA] if use_cuda else []),
1089
schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=3),
1090
on_trace_ready=torch.profiler.tensorboard_trace_handler(dname),
1093
self.payload(use_cuda=use_cuda)
1096
self.assertTrue(os.path.exists(dname))
1098
for file_name in os.listdir(dname):
1099
parts = file_name.split(".")
1100
self.assertTrue(len(parts) > 4)
1102
parts[-4].isdigit() and int(parts[-4]) > 0,
1103
"Wrong tracing file name pattern",
1105
self.assertEqual(parts[-3:], ["pt", "trace", "json"])
1107
self.assertEqual(file_num, 3)
1110
with TemporaryDirectoryName() as dname:
1112
activities=[torch.profiler.ProfilerActivity.CPU]
1113
+ ([torch.profiler.ProfilerActivity.CUDA] if use_cuda else []),
1114
schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=3),
1115
on_trace_ready=torch.profiler.tensorboard_trace_handler(
1116
dname, use_gzip=True
1121
self.payload(use_cuda=use_cuda)
1125
self.assertTrue(os.path.exists(dname))
1127
for file_name in os.listdir(dname):
1128
parts = file_name.split(".")
1129
self.assertTrue(len(parts) > 4)
1131
parts[-5].isdigit() and int(parts[-5]) > 0,
1132
"Wrong tracing file name pattern",
1134
self.assertEqual(parts[-4:], ["pt", "trace", "json", "gz"])
1136
self.assertEqual(file_num, 3)
1138
@unittest.skipIf(not kineto_available(), "Kineto is required")
1139
def test_profiler_metadata(self):
1140
t1, t2 = torch.ones(1), torch.ones(1)
1141
with profile() as prof:
1143
prof.add_metadata("test_key1", "test_value1")
1144
prof.add_metadata_json("test_key2", "[1,2,3]")
1146
with TemporaryFileName(mode="w+") as fname:
1147
prof.export_chrome_trace(fname)
1148
with open(fname) as f:
1149
trace = json.load(f)
1150
assert "test_key1" in trace
1151
assert trace["test_key1"] == "test_value1"
1152
assert "test_key2" in trace
1153
assert trace["test_key2"] == [1, 2, 3]
1155
def _test_profiler_tracing(self, use_kineto):
1156
with _profile(use_kineto=use_kineto) as prof:
1157
t1, t2 = torch.ones(1), torch.ones(1)
1160
with TemporaryFileName(mode="w+") as fname:
1161
prof.export_chrome_trace(fname)
1164
with open(fname) as f:
1168
with _profile(use_kineto=use_kineto) as prof:
1171
with TemporaryFileName(mode="w+") as fname:
1172
prof.export_chrome_trace(fname)
1174
with open(fname) as f:
1175
contents = json.load(f)
1178
if "WARNING" in contents:
1179
found_empty_warning = False
1180
for warning in contents["WARNING"]:
1181
if "No Valid Trace Events" in warning:
1182
found_empty_warning = True
1183
self.assertTrue(found_empty_warning)
1186
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
1190
device = torch.device("cuda:0")
1191
with _profile(use_cuda=True, use_kineto=use_kineto) as prof:
1192
t1, t2 = torch.ones(1, device=device), torch.ones(1, device=device)
1195
with TemporaryFileName(mode="w+") as fname:
1196
prof.export_chrome_trace(fname)
1198
with open(fname) as f:
1201
def test_profiler_tracing(self):
    """Exercise chrome-trace export on the legacy path, and on the Kineto
    path when Kineto is built in."""
    for with_kineto in (False, True):
        if with_kineto and not kineto_available():
            continue
        self._test_profiler_tracing(with_kineto)
1206
def test_profiler_op_event_args(self):
1207
torch._C._profiler._set_record_concrete_inputs_enabled_val(True)
1208
with _profile(record_shapes=True) as prof:
1209
a = torch.ones((64, 32), dtype=torch.float32)
1210
c = torch.cat([a, a]).sin()
1211
with TemporaryFileName(mode="w+") as fname:
1212
prof.export_chrome_trace(fname)
1213
with open(fname) as f:
1216
e for e in j["traceEvents"] if e.get("cat", "") == "cpu_op"
1220
if e["name"] == "aten::ones":
1223
["ScalarList", "Scalar", "", "", "Scalar"],
1226
args["Concrete Inputs"], ["[64, 32]", "6", "", "", "False"]
1229
if e["name"] == "aten::cat":
1230
self.assertEqual(args["Input Dims"], [[[64, 32], [64, 32]], []])
1231
self.assertEqual(args["Input type"], ["TensorList", "Scalar"])
1234
self.assertGreaterEqual(
1235
args.get("Record function id", -1),
1237
f"Failed finding record funciont for op = {e}",
1240
def test_profiler_strides(self):
1241
torch._C._profiler._set_record_concrete_inputs_enabled_val(True)
1242
base_tensor = torch.randn(1024, dtype=torch.float32)
1243
a = base_tensor.as_strided((16, 16), (17, 1), 0)
1244
b = base_tensor.as_strided((16, 16), (25, 2), 272)
1245
with _profile(record_shapes=True) as prof:
1248
with TemporaryFileName(mode="w+") as fname:
1249
prof.export_chrome_trace(fname)
1250
with open(fname) as f:
1253
e for e in j["traceEvents"] if e.get("cat", "") == "cpu_op"
1257
if e["name"] == "aten::add":
1258
self.assertEqual(args["Input Strides"], [[17, 1], [25, 2], []])
1260
def test_profiler_fwd_bwd_link(self):
1261
with _profile(use_kineto=True) as prof:
1262
t1, t2 = torch.ones(1, requires_grad=True), torch.ones(
1263
1, requires_grad=True
1265
z = torch.add(t1, t2)
1267
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
1269
with TemporaryFileName(mode="w+") as fname:
1270
prof.export_chrome_trace(fname)
1271
with open(fname) as f:
1273
events = j["traceEvents"]
1279
ts_to_name[e["ts"]] = e["name"]
1283
and e["cat"] == "fwdbwd"
1284
and e["name"] == "fwdbwd"
1287
flow_s_to_ts[e["id"]] = e["ts"]
1288
elif e["ph"] == "f":
1289
flow_f_to_ts[e["id"]] = e["ts"]
1291
self.assertEqual(len(flow_s_to_ts), 2)
1292
self.assertEqual(len(flow_f_to_ts), 2)
1293
self.assertIn(1, flow_s_to_ts)
1294
self.assertIn(1, flow_f_to_ts)
1295
self.assertIn(2, flow_s_to_ts)
1296
self.assertIn(2, flow_f_to_ts)
1297
s_ts_1 = flow_s_to_ts[1]
1298
f_ts_1 = flow_f_to_ts[1]
1299
s_ts_2 = flow_s_to_ts[2]
1300
f_ts_2 = flow_f_to_ts[2]
1303
ts in ts_to_name.keys()
1304
for ts in [s_ts_1, f_ts_1, s_ts_2, f_ts_2]
1308
ts_to_name[s_ts_1] == "aten::binary_cross_entropy_with_logits"
1310
self.assertTrue(ts_to_name[s_ts_2] == "aten::add")
1312
def test_profiler_disable_fwd_bwd_link(self):
1314
torch._C._profiler._set_fwd_bwd_enabled_val(False)
1316
with _profile(use_kineto=True) as prof:
1317
t1, t2 = torch.ones(1, requires_grad=True), torch.ones(
1318
1, requires_grad=True
1320
z = torch.add(t1, t2)
1322
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
1325
with TemporaryFileName(mode="w+") as fname:
1326
prof.export_chrome_trace(fname)
1327
with open(fname) as f:
1329
events = j["traceEvents"]
1332
self.assertNotEqual(e.get("cat", None), "fwdbwd")
1334
torch._C._profiler._set_fwd_bwd_enabled_val(True)
1339
@unittest.skipIf(not kineto_available(), "Kineto is required")
1340
@unittest.skipIf(IS_WINDOWS, "Test does not work on Windows")
1341
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
1342
def test_profiler_cuda_sync_events(self):
1343
device = torch.device("cuda:0")
1344
t1, t2 = torch.ones(1, device=device), torch.ones(1, device=device)
1346
def workload() -> None:
1348
torch.cuda.synchronize()
1351
def trace_and_check(exp_config: Optional[_ExperimentalConfig]) -> None:
1355
experimental_config=exp_config,
1359
with TemporaryFileName(mode="w+") as fname:
1361
prof.export_chrome_trace(fname)
1362
with open(fname) as f:
1364
cats = {e.get("cat", None) for e in j["traceEvents"]}
1366
"cuda_sync" in cats,
1367
"Expected to find cuda_sync event" f" found = {cats}",
1370
print("Testing enable_cuda_sync_events in _ExperimentalConfig")
1371
trace_and_check(exp_config=_ExperimentalConfig(enable_cuda_sync_events=True))
1373
print("Testing _profiler._set_cuda_sync_enabled_val()")
1375
torch._C._profiler._set_cuda_sync_enabled_val(True)
1376
trace_and_check(exp_config=None)
1378
torch._C._profiler._set_cuda_sync_enabled_val(False)
1380
def test_profiler_type(self):
1381
profiler_type = torch._C._autograd._profiler_type
1382
ActiveProfilerType = torch._C._profiler.ActiveProfilerType
1383
self.assertEqual(profiler_type(), ActiveProfilerType.NONE)
1386
with _profile_legacy():
1387
self.assertEqual(profiler_type(), ActiveProfilerType.LEGACY)
1391
self.assertEqual(profiler_type(), ActiveProfilerType.KINETO)
1393
def test_profiler_correlation_id(self):
1395
We expect the correlation_id to be unique across multiple invocations of the profiler,
1396
So we will reuse id_uniqueness_set.
1398
id_uniqueness_set = set()
1399
model = torch.nn.Sequential(
1400
nn.Conv2d(16, 33, 18),
1402
nn.Linear(243, 243),
1405
inputs = torch.randn(40, 16, 18, 260)
1406
uint32_max = 2**32 - 1
1408
with profile() as prof:
1410
for event in prof.profiler.kineto_results.events():
1411
corr_id = event.correlation_id()
1412
if (corr_id) and event.device_type() == DeviceType.CPU:
1413
self.assertTrue(corr_id not in id_uniqueness_set)
1414
id_uniqueness_set.add(corr_id)
1415
self.assertTrue(corr_id < uint32_max)
1417
def test_nested_tensor_with_shapes(self):
1418
a = torch.randn(4, 4)
1419
b = torch.randn(4, 4)
1420
c = torch.randn(4, 4)
1421
inp = torch.nested.nested_tensor([a, b])
1422
with torch.profiler.profile(record_shapes=True) as prof:
1423
torch.nn.functional.linear(inp, c, None)
1424
for e in prof.events():
1425
if e.name in ("aten::mm", "aten::addmm"):
1428
self.assertTrue(len(e.input_shapes) > 0)
1429
self.assertTrue(len(e.input_shapes[0]) > 0)
1431
@patch.dict(os.environ, {"KINETO_USE_DAEMON": "1"})
1432
@patch.dict(os.environ, {"KINETO_DAEMON_INIT_DELAY_S": "1"})
1433
def test_kineto_profiler_with_environment_variable(self):
1436
import torch.nn as nn
1437
from torch.profiler import supported_activities, profile
1438
from torch.autograd.profiler import KinetoStepTracker
1440
class SimpleNet(nn.Module):
1441
def __init__(self) -> None:
1443
self.fc1 = nn.Linear(10, 5)
1444
self.fc2 = nn.Linear(5, 2)
1446
def forward(self, x):
1447
return self.fc2(self.fc1(x))
1450
def payload(use_cuda=False):
1451
x = torch.randn(10, 10)
1454
y = torch.randn(10, 10)
1463
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
1465
opt = torch.optim.SGD(net.parameters(), lr=0.01)
1467
inputs = torch.rand(10)
1469
with profile(activities=supported_activities()):
1470
payload(use_cuda=use_cuda)
1472
initial_step = KinetoStepTracker.current_step()
1476
loss = torch.nn.functional.cross_entropy(out, torch.rand(2))
1480
for _ in range(niters):
1484
activities=supported_activities(),
1485
schedule=torch.profiler.schedule(
1490
for _ in range(niters):
1493
assert KinetoStepTracker.current_step() == initial_step + 2 * niters
1496
subprocess.check_output(
1497
[sys.executable, "-W", "always", "-c", script],
1498
cwd=os.path.dirname(os.path.realpath(__file__)),
1500
except subprocess.CalledProcessError as e:
1501
if e.returncode != 0:
1504
"Kineto is not working properly with the Dynolog environment variable",
1507
def test_concrete_inputs_profiling(self):
1508
x = torch.rand(2, 6)
1509
with profile(record_shapes=True) as p:
1510
y = x.as_strided([4, 3], [1, 4])
1513
for e in p.events():
1514
if e.name in ("aten::as_strided"):
1516
self.assertTrue(len(e.input_shapes) > 0)
1517
self.assertTrue(len(e.concrete_inputs) > 0)
1518
self.assertEqual([2, 6], e.input_shapes[0])
1519
self.assertEqual([4, 3], e.concrete_inputs[1])
1520
self.assertEqual([1, 4], e.concrete_inputs[2])
1522
self.assertTrue(found, "Expected to find aten::as_strided but did not")
1524
def test_concrete_inputs_profiling_toggling(self):
1526
for before, after in [(True, False), (False, True)]:
1527
x = torch.rand(2, 6)
1528
torch._C._profiler._set_record_concrete_inputs_enabled_val(before)
1529
with profile(record_shapes=True) as p:
1530
y = x.as_strided([4, 3], [1, 4])
1531
torch._C._profiler._set_record_concrete_inputs_enabled_val(after)
1534
for e in p.events():
1535
if e.name in ("aten::as_strided"):
1537
self.assertTrue(len(e.input_shapes))
1539
self.assertTrue(found, "Expected to find aten::as_strided but did not")
1541
torch._C._profiler._set_record_concrete_inputs_enabled_val(True)
1543
def test_record_function_fast(self):
1544
x, y = (torch.rand((4, 4)) for _ in range(2))
1545
with profile(record_shapes=True) as p:
1548
with torch._C._profiler._RecordFunctionFast("add_test_fast_rf1"):
1551
self.assertGreaterEqual(
1552
len([e for e in p.events() if e.name == "add_test_fast_rf1"]), 4
1554
for e in p.events():
1555
if e.name == "add_test_fast_rf1":
1556
self.assertTrue(e.input_shapes == [])
1557
self.assertTrue(e.kwinputs == {})
1558
with profile(record_shapes=True) as p:
1560
cm = torch._C._profiler._RecordFunctionFast(
1561
"add_test_fast_rf2", [x, y], {"stream": 0, "grid": "lambda x : x + 1"}
1567
self.assertGreaterEqual(
1568
len([e for e in p.events() if e.name == "add_test_fast_rf2"]), 4
1571
for e in p.events():
1572
if e.name == "add_test_fast_rf2":
1573
self.assertTrue(e.input_shapes == [[4, 4], [4, 4]])
1574
self.assertTrue(e.kwinputs == {"stream": 0, "grid": "lambda x : x + 1"})
1576
with profile(record_shapes=True) as p:
1577
cm = torch._C._profiler._RecordFunctionFast(
1578
"add_test_fast_rf3", input_values=["hi"], keyword_values={"hi": "hello"}
1589
self.assertGreaterEqual(
1590
len([e for e in p.events() if e.name == "add_test_fast_rf3"]), 4
1592
self.assertFalse(any((e.name and "relu" in e.name) for e in p.events()))
1594
for e in p.events():
1595
if e.name == "add_test_fast_rf3":
1596
self.assertTrue(e.input_shapes == [[]])
1598
with profile() as p:
1600
with torch._C._profiler._RecordFunctionFast(
1601
"add_test_fast_rf4", [x, y]
1604
with torch._C._profiler._RecordFunctionFast("add_test_fast_rf5"):
1607
self.assertGreaterEqual(
1608
len([e for e in p.events() if e.name == "add_test_fast_rf4"]), 4
1611
for e in p.events():
1612
if e.name == "add_test_fast_rf4":
1613
self.assertTrue(e.input_shapes == [])
1615
self.assertGreaterEqual(
1616
len([e for e in p.events() if e.name == "add_test_fast_rf5"]), 4
1619
with profile(record_shapes=True) as p:
1621
cm = torch._C._profiler._RecordFunctionFast(
1622
"add_test_fast_rf6",
1632
self.assertGreaterEqual(
1633
len([e for e in p.events() if e.name == "add_test_fast_rf6"]), 4
1636
for e in p.events():
1637
if e.name == "add_test_fast_rf6":
1638
self.assertTrue(e.input_shapes == [[4, 4], [4, 4]])
1640
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
1641
def test_profiler_op_event_kwargs(self):
1642
x, y = (torch.rand((4, 4)) for _ in range(2))
1643
with profile(record_shapes=True) as p:
1644
cm = torch._C._profiler._RecordFunctionFast(
1645
"add_test_kwinputs",
1647
{"stream": 0, "grid": "lambda x : x + 1", "debug": 'debug"'},
1652
with TemporaryFileName(mode="w+") as fname:
1653
p.export_chrome_trace(fname)
1654
with open(fname) as f:
1657
e for e in j["traceEvents"] if e.get("cat", "") == "cpu_op"
1660
if e["name"] == "add_test_kwinputs":
1662
self.assertTrue("stream" in args)
1663
self.assertTrue("grid" in args)
1664
self.assertTrue(args["stream"] == "0")
1665
self.assertTrue(args["grid"] == "lambda x : x + 1")
1666
self.assertTrue(args["debug"] == "None")
1668
def test_is_profiler_enabled(self):
1669
self.assertFalse(torch.autograd.profiler._is_profiler_enabled)
1671
with profile() as p:
1672
self.assertTrue(torch.autograd.profiler._is_profiler_enabled)
1674
self.assertFalse(torch.autograd.profiler._is_profiler_enabled)
1676
with torch.autograd.profiler.profile() as p:
1677
self.assertTrue(torch.autograd.profiler._is_profiler_enabled)
1679
self.assertFalse(torch.autograd.profiler._is_profiler_enabled)
1681
def test_guarded_record_function_fast(self):
1682
x, y = (torch.rand((4, 4)) for _ in range(2))
1684
with profile() as p:
1685
cm = torch._C._profiler._RecordFunctionFast("guarded_rff")
1687
if torch.autograd.profiler._is_profiler_enabled:
1693
self.assertGreaterEqual(
1694
len([e for e in p.events() if e.name == "guarded_rff"]), 4
1697
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
1698
def test_event_list(self):
1701
x, y = (torch.rand((4, 4), requires_grad=True, device="cuda") for _ in range(2))
1702
with profile(with_stack=True) as p:
1703
z = (x @ y).relu().sum()
1706
event_list = torch.autograd.profiler_util.EventList(p.events())
1709
with TemporaryFileName(mode="w+") as fname:
1710
event_list.export_chrome_trace(fname)
1711
with open(fname) as f:
1716
def _check_all_gpu_present(self, gpu_dict, max_gpu_count):
1717
for i in range(0, max_gpu_count):
1718
self.assertEqual(gpu_dict["GPU " + str(i)], 1)
1722
def _validate_basic_json(self, traceEvents, cuda_available=False):
1727
traceEventProfiler = traceEvents[PROFILER_IDX]
1729
self.assertTrue(traceEventProfiler["name"] == "PyTorch Profiler (0)")
1730
self.assertTrue(traceEvents[RECORD_END]["name"] == "Record Window End")
1732
traceEvents[RECORD_START]["name"] == "Iteration Start: PyTorch Profiler"
1735
self.assertGreaterEqual(
1736
traceEventProfiler["ts"],
1737
traceEvents[RECORD_START]["ts"],
1738
"Profiler starts before record!",
1740
self.assertLessEqual(
1741
traceEventProfiler["ts"] + traceEventProfiler["dur"],
1742
traceEvents[RECORD_END]["ts"],
1743
"Profiler ends after record end!",
1746
gpu_dict = collections.defaultdict(int)
1747
for i, traceEvent in enumerate(traceEvents):
1749
i == len(traceEvents) + RECORD_END
1750
or i == len(traceEvents) + RECORD_START
1754
if "ts" in traceEvent:
1755
self.assertGreaterEqual(
1757
traceEventProfiler["ts"],
1758
"Trace event is out of bounds",
1762
if "dur" in traceEvent:
1763
self.assertLessEqual(
1764
traceEvent["ts"] + traceEvent["dur"],
1765
traceEvents[RECORD_END]["ts"],
1766
"Trace event ends too late!",
1768
gpu_value = traceEvent.get("args", {}).get("labels", None)
1769
if gpu_value and "GPU" in gpu_value:
1770
gpu_dict[gpu_value] += 1
1773
kExceedMaxPid = 5000000
1775
traceEvents[i + 1]["args"]["sort_index"]
1776
== kExceedMaxPid + int(gpu_value.split()[1])
1781
def _test_chrome_trace_basic_helper(self, with_cuda=False):
1786
x, y = (torch.rand(4, 4).to(device) for _ in range(2))
1788
with profile(with_stack=True) as p:
1790
with TemporaryFileName(mode="w+") as fname:
1791
p.export_chrome_trace(fname)
1792
with open(fname) as f:
1793
report = json.load(f)
1794
self._validate_basic_json(report["traceEvents"], with_cuda)
1796
@unittest.skipIf(not kineto_available(), "Kineto is required")
1797
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
1798
def test_basic_chrome_trace(self):
1799
self._test_chrome_trace_basic_helper()
1800
if torch.cuda.is_available():
1801
self._test_chrome_trace_basic_helper(with_cuda=True)
1803
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
1804
def test_profiler_time_scale(self):
1806
SEC_TO_US = 1000 * 1000
1808
with profile() as p:
1809
with torch.profiler.record_function("test_span"):
1810
for i in range(WAIT_TIME):
1816
self.assertTrue(events[0].name == "test_span")
1817
test_span = events[0]
1818
self.assertGreaterEqual(
1819
test_span.cpu_time / SEC_TO_US,
1820
WAIT_TIME - MARGIN_ERROR,
1821
"event out of range",
1823
self.assertLessEqual(
1824
test_span.cpu_time / SEC_TO_US,
1825
WAIT_TIME + MARGIN_ERROR,
1826
"event out of range",
1830
with TemporaryFileName(mode="w+") as fname:
1831
p.export_chrome_trace(fname)
1832
with open(fname) as f:
1833
report = json.load(f)
1834
events = report["traceEvents"]
1835
for event in events:
1836
if event["name"] == "test_span":
1837
self.assertGreaterEqual(
1838
event["dur"] / SEC_TO_US,
1839
WAIT_TIME - MARGIN_ERROR,
1840
"profiling out of range",
1842
self.assertLessEqual(
1843
event["dur"] / SEC_TO_US,
1844
WAIT_TIME + MARGIN_ERROR,
1845
"profiling out of range",
1848
def _schedule_helper(self, warmup, active, repeat, acc_events=True):
1850
schedule=torch.profiler.schedule(
1857
acc_events=acc_events,
1859
for i in range(100):
1863
for ev in prof.key_averages():
1864
if ev.key == "aten::add":
1868
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
1869
def test_schedule_function_count(self):
1870
self.assertEqual(self._schedule_helper(warmup=0, active=1, repeat=1), 1)
1871
self.assertEqual(self._schedule_helper(warmup=0, active=5, repeat=0), 100)
1872
self.assertEqual(self._schedule_helper(warmup=0, active=5, repeat=10), 50)
1873
self.assertEqual(self._schedule_helper(warmup=1, active=5, repeat=0), 83)
1874
self.assertEqual(self._schedule_helper(warmup=10, active=10, repeat=4), 40)
1875
self.assertEqual(self._schedule_helper(warmup=50, active=1, repeat=0), 1)
1877
self._schedule_helper(warmup=0, active=5, repeat=0, acc_events=False), 0
1880
self._schedule_helper(warmup=10, active=10, repeat=4, acc_events=False), 10
1883
def _step_helper_func(self, prof):
1885
torch.randn(1, 3, 224, 224)
1888
def _partial_overlap(self, prof_step, step_helper_func):
1889
p_start = prof_step["ts"]
1890
p_end = prof_step["ts"] + prof_step["dur"]
1891
h_start = step_helper_func["ts"]
1892
h_end = step_helper_func["ts"] + step_helper_func["dur"]
1894
if p_start < h_start and p_end < h_end and p_end > h_start:
1896
if p_start > h_start and p_start < h_end and p_end > h_end:
1900
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
1901
def test_cpu_annotation_overlap(self):
1902
with torch.profiler.profile(
1903
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
1906
schedule=torch.profiler.schedule(wait=0, warmup=0, active=5, repeat=1),
1909
self._step_helper_func(prof)
1910
with TemporaryFileName(mode="w+") as fname:
1911
prof.export_chrome_trace(fname)
1913
step_helper_funcs = []
1914
with open(fname) as f:
1915
report = json.load(f)
1916
for event in report["traceEvents"]:
1917
if "ProfilerStep" in event["name"]:
1918
prof_steps.append(event)
1919
if "step_helper_func" in event["name"]:
1920
step_helper_funcs.append(event)
1921
self.assertEqual(len(prof_steps), 5)
1922
self.assertEqual(len(step_helper_funcs), 5)
1923
for i in range(0, len(step_helper_funcs)):
1924
for j in range(0, len(step_helper_funcs)):
1926
not self._partial_overlap(prof_steps[i], step_helper_funcs[j])
1929
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
1930
def test_user_annotation(self):
1931
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
1932
with profile(activities=supported_activities()) as p:
1933
with torch.profiler.record_function("test_user_annotation"):
1934
self.payload(use_cuda=use_cuda)
1936
for evt in p.key_averages():
1937
if evt.key == "test_user_annotation":
1938
self.assertTrue(evt.is_user_annotation)
1940
self.assertFalse(evt.is_user_annotation)
1942
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
1943
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
1944
def test_dynamic_toggle(self):
1945
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as p:
1946
with torch.profiler.record_function("test_user_annotation"):
1947
x, y = (torch.rand(4, 4).to("cuda") for _ in range(2))
1950
self.assertTrue(any("aten" in e.name for e in p.events()))
1952
self.assertTrue(any("cuda" in e.name for e in p.events()))
1954
self.assertTrue(any("kernel" in e.name for e in p.events()))
1956
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as p1:
1957
p1.toggle_collection_dynamic(False, [ProfilerActivity.CUDA])
1958
with torch.profiler.record_function("test_user_annotation"):
1959
x, y = (torch.rand(4, 4).to("cuda") for _ in range(2))
1962
self.assertTrue(any("aten" in e.name for e in p1.events()))
1964
self.assertTrue(all("cuda" not in e.name for e in p1.events()))
1966
self.assertTrue(all("kernel" not in e.name for e in p1.events()))
1968
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as p2:
1969
p2.toggle_collection_dynamic(
1970
False, [ProfilerActivity.CUDA, ProfilerActivity.CPU]
1972
with torch.profiler.record_function("test_user_annotation"):
1973
x, y = (torch.rand(4, 4).to("cuda") for _ in range(2))
1975
self.assertTrue(len(p2.events()) == 0)
1977
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
1978
def test_lazy_build_tree(self):
1979
with profile() as p:
1984
self.assertEqual(stats.function_events_build_tree_call_duration_us, 0)
1985
self.assertEqual(stats.number_of_events, 0)
1989
self.assertGreater(stats.function_events_build_tree_call_duration_us, 0)
1990
self.assertGreater(stats.number_of_events, 0)
1993
class SimpleNet(nn.Module):
1994
def __init__(self) -> None:
1996
self.fc1 = nn.Linear(10, 5)
1997
self.fc2 = nn.Linear(5, 2)
1999
def forward(self, x):
2000
return self.fc2(self.fc1(x))
2003
@dataclass(frozen=True)
2004
class MockKinetoEvent:
2008
_linked_correlation_id: int
2012
def name(self) -> str:
2015
def start_ns(self) -> int:
2016
return self._start_us * 1000
2018
def duration_ns(self) -> int:
2019
return self._duration_us * 1000
2021
def linked_correlation_id(self) -> int:
2022
return self._linked_correlation_id
2024
def device_type(self) -> DeviceType:
2025
return DeviceType.CUDA if self._device_type == 1 else DeviceType.CPU
2028
@dataclass(frozen=True)
2029
class MockProfilerEvent:
2033
duration_time_ns: int
2034
correlation_id: int = 0
2035
children: List["MockProfilerEvent"] = field(default_factory=list)
2036
parent: Optional["MockProfilerEvent"] = None
2039
def end_time_ns(self):
2040
return self.start_time_ns + self.duration_time_ns
2043
def name(self) -> str:
2046
def __post__init__(self, parent, children):
2047
object.__setattr__(self, "parent", parent)
2048
object.__setattr__(self, "children", children)
2052
def __init__(self, name, children) -> None:
2054
self.children = [MockNode(name, i) for name, i in children.items()]
2057
class TestExperimentalUtils(TestCase):
2058
def make_tree(self) -> List[MockNode]:
2075
return [MockNode(name, i) for name, i in tree.items()]
2077
def test_dfs(self) -> None:
2079
" ".join(i.name for i in _utils.traverse_dfs(self.make_tree())),
2080
"root_0 1 2 3 4 5 root_1 6 7 8 9 10",
2083
def test_bfs(self) -> None:
2085
" ".join(i.name for i in _utils.traverse_bfs(self.make_tree())),
2086
"root_0 root_1 1 3 6 7 8 2 4 5 9 10",
2090
def generate_mock_profile():
2092
MockKinetoEvent("cudaLaunchKernel", 400, 100, 1, 0),
2093
MockKinetoEvent("cudaLaunchKernel", 500, 100, 2, 0),
2094
MockKinetoEvent("cudaLaunchKernel", 600, 100, 3, 0),
2095
MockKinetoEvent("cudaLaunchKernel", 700, 100, 4, 0),
2096
MockKinetoEvent("cudaLaunchKernel", 800, 100, 5, 0),
2097
MockKinetoEvent("cudaLaunchKernel", 1500, 100, 6, 0),
2098
MockKinetoEvent("GPU", 900, 100, 1, 1),
2099
MockKinetoEvent("GPU", 1000, 100, 2, 1),
2100
MockKinetoEvent("GPU", 1100, 100, 3, 1),
2101
MockKinetoEvent("GPU", 1200, 100, 4, 1),
2102
MockKinetoEvent("GPU", 1300, 100, 5, 1),
2103
MockKinetoEvent("GPU", 1700, 100, 6, 1),
2106
MockProfilerEvent("CPU (Before cudaLaunchKernel)", 1, 0, 100000),
2107
MockProfilerEvent("CPU (Before cudaLaunchKernel)", 2, 100000, 100000),
2108
MockProfilerEvent("CPU (Before cudaLaunchKernel)", 3, 200000, 100000),
2109
MockProfilerEvent("CPU (Before cudaLaunchKernel)", 4, 300000, 100000),
2110
MockProfilerEvent("CPU (After cudaLaunchKernel)", 5, 400000, 100000),
2111
MockProfilerEvent("CPU (After cudaLaunchKernel)", 6, 500000, 100000),
2112
MockProfilerEvent("CPU (After cudaLaunchKernel)", 7, 600000, 100000),
2113
MockProfilerEvent("CPU (After cudaLaunchKernel)", 8, 700000, 100000),
2114
MockProfilerEvent("CPU (After GPU)", 9, 800000, 100000),
2115
MockProfilerEvent("CPU (After GPU)", 10, 900000, 100000),
2116
MockProfilerEvent("CPU (After GPU)", 11, 1100000, 100000),
2117
MockProfilerEvent("CPU (After GPU)", 12, 1200000, 500000),
2120
profiler = unittest.mock.Mock()
2121
profiler.kineto_results = unittest.mock.Mock()
2122
profiler.kineto_results.events = unittest.mock.Mock(return_value=cuda_events)
2123
profiler.kineto_results.experimental_event_tree = unittest.mock.Mock(
2124
return_value=cpu_events
2129
def load_mock_profile():
2130
accept = expecttest.ACCEPT
2131
json_file_path = os.path.join(
2132
os.path.dirname(os.path.realpath(__file__)),
2133
"profiler_utils_mock_events.json",
2135
if accept and torch.cuda.is_available():
2137
def garbage_code(x):
2141
x = torch.ones((4096, 4096), device="cuda")
2144
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
2157
"_start_ns": e.start_ns(),
2158
"_duration_ns": e.duration_ns(),
2159
"_linked_correlation_id": e.linked_correlation_id(),
2160
"_device_type": 1 if e.device_type() == DeviceType.CUDA else 0,
2162
for e in prof.profiler.kineto_results.events()
2165
def EventTreeDFS(event_tree):
2166
from collections import deque
2168
stack = deque(event_tree)
2170
curr_event = stack.pop()
2172
for child_event in curr_event.children:
2173
stack.append(child_event)
2179
"start_time_ns": e.start_time_ns,
2180
"duration_time_ns": e.duration_time_ns,
2181
"correlation_id": e.correlation_id,
2182
"children": [child.id for child in e.children],
2183
"parent": e.parent.id if e.parent else None,
2185
for e in EventTreeDFS(
2186
prof.profiler.kineto_results.experimental_event_tree()
2190
with open(json_file_path, "w") as f:
2191
json.dump([kineto_events, profiler_events], f)
2193
assert os.path.exists(json_file_path)
2194
with open(json_file_path) as f:
2195
kineto_events, profiler_events = json.load(f)
2197
cuda_events = [MockKinetoEvent(*event.values()) for event in kineto_events]
2200
for e in profiler_events:
2201
event = MockProfilerEvent(**e)
2202
id_map[event.id] = event
2203
cpu_events.append(event)
2204
for event in cpu_events:
2205
parent = None if event.parent is None else id_map[event.parent]
2206
children = [id_map[child] for child in event.children]
2207
event.__post__init__(parent, children)
2208
cpu_events = [event for event in cpu_events if event.parent is None]
2209
profiler = unittest.mock.Mock()
2210
profiler.kineto_results = unittest.mock.Mock()
2211
profiler.kineto_results.events = unittest.mock.Mock(return_value=cuda_events)
2212
profiler.kineto_results.experimental_event_tree = unittest.mock.Mock(
2213
return_value=cpu_events
2217
def test_utils_compute_self_time(self):
2218
with profile() as prof:
2219
t1, t2 = torch.ones(1, requires_grad=True), torch.ones(
2220
1, requires_grad=True
2222
z = torch.add(t1, t2)
2224
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
2226
basic_eval = _utils.BasicEvaluation(prof.profiler)
2227
metrics = basic_eval.metrics
2228
self.assertTrue(len(metrics) > 0)
2229
for event_key, event_metrics in metrics.items():
2231
event_metrics.self_time_ns,
2232
event_key.event.duration_time_ns
2233
- sum(child.duration_time_ns for child in event_key.event.children),
2236
def test_utils_intervals_overlap(self):
2237
event = _utils.EventKey(MockProfilerEvent("Event 1", 1, 5, 5))
2239
_utils.Interval(0, 9),
2240
_utils.Interval(1, 2),
2241
_utils.Interval(2, 3),
2242
_utils.Interval(3, 4),
2243
_utils.Interval(4, 5),
2244
_utils.Interval(8, 12),
2246
print(event.intervals_overlap(intervals))
2247
self.assertEqual(event.intervals_overlap(intervals), 5)
2249
def test_utils_compute_queue_depth(self):
2250
def format_queue_depth(queue_depth_list, events):
2252
for data, event in zip(queue_depth_list, events):
2253
res += f"{data.queue_depth} [{event.name}]\n"
2257
profiler = self.generate_mock_profile()
2258
basic_evaluation = _utils.BasicEvaluation(profiler)
2259
self.assertExpectedInline(
2261
basic_evaluation.queue_depth_list, basic_evaluation.cuda_events
2278
self.assertExpectedInline(
2280
[basic_evaluation.metrics[k] for k in basic_evaluation.event_keys],
2281
basic_evaluation.events,
2284
0 [CPU (Before cudaLaunchKernel)]
2285
0 [CPU (Before cudaLaunchKernel)]
2286
0 [CPU (Before cudaLaunchKernel)]
2287
0 [CPU (Before cudaLaunchKernel)]
2288
1 [CPU (After cudaLaunchKernel)]
2289
2 [CPU (After cudaLaunchKernel)]
2290
3 [CPU (After cudaLaunchKernel)]
2291
4 [CPU (After cudaLaunchKernel)]
2299
def test_utils_compute_queue_depth_when_no_cuda_events(self):
2301
x = torch.ones((1024, 1024))
2302
with profile() as prof:
2305
basic_evaluation = _utils.BasicEvaluation(prof.profiler)
2306
self.assertFalse(basic_evaluation.compute_queue_depth())
2308
def test_utils_compute_idle_time(self):
2309
profiler = self.generate_mock_profile()
2310
basic_evaluation = _utils.BasicEvaluation(profiler)
2311
expected_output = "\n".join(
2313
f"{basic_evaluation.metrics[event_key].idle_time_ns} [{event_key.event.name}]"
2314
for event_key in basic_evaluation.event_keys
2317
self.assertExpectedInline(
2320
100000 [CPU (Before cudaLaunchKernel)]
2321
100000 [CPU (Before cudaLaunchKernel)]
2322
100000 [CPU (Before cudaLaunchKernel)]
2323
100000 [CPU (Before cudaLaunchKernel)]
2324
0 [CPU (After cudaLaunchKernel)]
2325
0 [CPU (After cudaLaunchKernel)]
2326
0 [CPU (After cudaLaunchKernel)]
2327
0 [CPU (After cudaLaunchKernel)]
2331
100000 [CPU (After GPU)]""",
2334
@unittest.skipIf(IS_JETSON, "JSON not behaving as expected on Jetson")
2335
def test_utils_get_optimizable_events(self):
2336
basic_evaluation = _utils.BasicEvaluation(self.load_mock_profile())
2337
optimizable_events = basic_evaluation.get_optimizable_events(
2338
2, print_enable=False
2340
expected_output = "\n".join(
2341
[f"{event_key.event.name}" for event_key in optimizable_events]
2343
self.assertExpectedInline(
2346
<built-in function _cuda_synchronize>
2350
def test_profiler_name_pattern(self):
2351
x = torch.ones((4096, 4096))
2352
with profile() as prof:
2356
matched_events = NamePattern(prof, "aten::mm").matched_events()
2357
output = "\n".join([f"{event.name}" for event in matched_events])
2358
self.assertExpectedInline(
2369
@unittest.skipIf(torch.cuda.is_available(), "Test not working for CUDA")
2370
def test_profiler_pattern_match_helper(self):
2371
x = torch.ones((100, 100))
2372
with profile() as prof:
2376
event_tree = prof.profiler.kineto_results.experimental_event_tree()
2377
pattern = Pattern(prof)
2378
self.assertEqual([], pattern.siblings_of(event_tree[0])[0])
2379
self.assertEqual(event_tree[1:], pattern.siblings_of(event_tree[0])[1])
2380
child_nodes = event_tree[0].children
2381
self.assertEqual([], pattern.siblings_of(child_nodes[0])[0])
2382
self.assertEqual(child_nodes[1:], pattern.siblings_of(child_nodes[0])[1])
2384
event_tree[0], pattern.root_of(event_tree[0].children[0].children[0])
2386
self.assertEqual(None, pattern.next_of(event_tree[-1]))
2387
self.assertEqual(event_tree[1], pattern.next_of(event_tree[0]))
2388
self.assertEqual(event_tree[0], pattern.prev_of(event_tree[1]))
2391
TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite."
2393
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
2394
def test_profiler_extra_cuda_copy_pattern(self):
2396
(0, lambda: torch.ones((100, 100), device="cuda")),
2397
(1, lambda: torch.ones((100, 100)).to("cuda")),
2398
(1, lambda: torch.zeros((100, 100)).to("cuda")),
2399
(1, lambda: torch.empty((100, 100)).fill_(5).to("cuda")),
2400
(1, lambda: torch.ones((100, 100)).cuda()),
2401
(1, lambda: torch.zeros((100, 100)).cuda()),
2402
(1, lambda: torch.empty((100, 100)).fill_(5).cuda()),
2403
(1, lambda: torch.rand((100, 100)).cuda()),
2404
(1, lambda: torch.randn((100, 100)).cuda()),
2405
(1, lambda: torch.full((100, 100), 10).cuda()),
2406
(0, lambda: torch.rand((100, 100)).to(dtype=torch.float16)),
2407
(0, lambda: torch.rand((100, 100)).half()),
2408
(0, lambda: torch.rand((100, 100), device="cuda").half()),
2412
with profile(with_stack=True, record_shapes=True) as prof:
2414
pattern = ExtraCUDACopyPattern(prof)
2415
num_matched.append(len(pattern.matched_events()))
2416
self.assertEqual(num_matched, [i for i, _ in cases])
2419
TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite."
2421
def test_profiler_for_loop_indexing_pattern(self):
2422
x = torch.ones((100, 100))
2425
for i in range(100):
2430
for i in range(100):
2435
for i in range(100):
2440
for _ in range(100):
2444
for i in range(100):
2445
x[i, :] = torch.arange(100) + i
2447
cases = ((1, case1), (1, case2), (1, case3), (0, case4), (1, case5))
2450
with profile(with_stack=True) as prof:
2452
pattern = ForLoopIndexingPattern(prof)
2453
num_matched.append(len(pattern.matched_events()))
2454
self.assertEqual(num_matched, [i for i, _ in cases])
2456
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
2457
def test_profiler_fp32_matmul_pattern(self):
2458
x = torch.ones((100, 100), device="cuda")
2459
with profile(with_stack=True) as prof:
2461
pattern = FP32MatMulPattern(prof)
2462
has_tf32 = 0 if pattern.skip else 1
2463
num_matched = len(pattern.matched_events())
2464
self.assertEqual(num_matched, has_tf32)
2466
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
2467
def test_profiler_extra_cuda_copy_pattern_benchmark(self):
2468
with profile(with_stack=True, record_shapes=True) as prof:
2469
x = torch.ones((100, 100)).to("cuda")
2470
x = torch.ones((50, 50)).to("cuda")
2471
pattern = ExtraCUDACopyPattern(prof)
2472
shapes_factor_map = pattern.benchmark(pattern.matched_events())
2473
self.assertEqual(len(shapes_factor_map), 2)
2475
def test_profiler_optimizer_single_tensor_pattern(self):
2476
x = torch.ones((100, 100))
2478
(1, lambda: torch.optim.Adam(model.parameters())),
2479
(1, lambda: torch.optim.SGD(model.parameters(), lr=0.01)),
2480
(1, lambda: torch.optim.AdamW(model.parameters())),
2481
(0, lambda: torch.optim.Adam(model.parameters(), foreach=True)),
2482
(0, lambda: torch.optim.SGD(model.parameters(), lr=0.01, foreach=True)),
2483
(0, lambda: torch.optim.AdamW(model.parameters(), foreach=True)),
2487
with profile(with_stack=True) as prof:
2488
model = nn.Sequential(
2489
nn.Linear(100, 100),
2494
optimizer.zero_grad()
2496
loss = torch.nn.functional.cross_entropy(
2497
y_hat, torch.randint(0, 10, (100,))
2501
pattern = OptimizerSingleTensorPattern(prof)
2502
num_matched.append(len(pattern.matched_events()))
2503
self.assertEqual(num_matched, [i for i, _ in cases])
2505
def test_profiler_synchronized_dataloader_pattern(self):
2506
dataset = torch.rand((100, 100))
2507
sync_dataloader = torch.utils.data.DataLoader(dataset, batch_size=10)
2508
async_dataloader = torch.utils.data.DataLoader(
2509
dataset, batch_size=10, num_workers=4
2511
with profile(with_stack=True) as prof:
2512
next(iter(sync_dataloader))
2513
next(iter(async_dataloader))
2514
pattern = SynchronizedDataLoaderPattern(prof)
2515
num_matched = len(pattern.matched_events())
2516
self.assertEqual(num_matched, 1)
2519
"pattern checks for aten::_zero op which might not be there with torch.compile'd graph"
2521
def test_profiler_grad_not_set_to_none_pattern(self):
2522
x = torch.ones((100, 100))
2523
model = nn.Sequential(
2524
nn.Linear(100, 100),
2528
optimizer = torch.optim.Adam(model.parameters())
2530
(0, lambda: optimizer.zero_grad()),
2531
(0, lambda: model.zero_grad()),
2532
(1, lambda: optimizer.zero_grad(set_to_none=False)),
2533
(1, lambda: model.zero_grad(set_to_none=False)),
2537
with profile(with_stack=True) as prof:
2539
loss = torch.nn.functional.cross_entropy(
2540
y_hat, torch.randint(0, 10, (100,))
2545
pattern = GradNotSetToNonePattern(prof)
2546
num_matched.append(len(pattern.matched_events()))
2547
self.assertEqual(num_matched, [i for i, _ in cases])
2549
def test_profiler_conv2d_bias_followed_by_batchnorm2d_pattern(self):
2550
x = torch.randn((1, 3, 32, 32))
2552
(1, nn.Sequential(nn.Conv2d(3, 3, 3, 1, 1), nn.BatchNorm2d(3))),
2553
(0, nn.Sequential(nn.Conv2d(3, 3, 3, 1, 1, bias=False), nn.BatchNorm2d(3))),
2554
(0, nn.Sequential(nn.Conv2d(3, 3, 3, 1, 1))),
2557
for _, model in cases:
2558
with profile(with_stack=True, record_shapes=True) as prof:
2560
pattern = Conv2dBiasFollowedByBatchNorm2dPattern(prof)
2561
num_matched.append(len(pattern.matched_events()))
2562
self.assertEqual(num_matched, [i for i, _ in cases])
2564
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
2565
def test_profiler_matmul_dim_fp16_pattern(self):
2567
(1, torch.randn((201, 201), device="cuda", dtype=torch.float16)),
2568
(1, torch.randn((3, 97, 97), device="cuda", dtype=torch.float16)),
2569
(0, torch.randn((200, 200), device="cuda", dtype=torch.float16)),
2570
(0, torch.randn((3, 200, 200), device="cuda", dtype=torch.float16)),
2574
with profile(with_stack=True, record_shapes=True) as prof:
2576
pattern = MatMulDimInFP16Pattern(prof)
2577
num_matched.append(len(pattern.matched_events()))
2578
self.assertEqual(num_matched, [i for i, _ in cases])
2580
@skipIfTorchDynamo("profiler gets ignored if dynamo activated")
2581
def test_profiler_pattern_matcher_json_report(self):
2582
x = torch.ones((100, 100))
2583
model = nn.Sequential(
2584
nn.Linear(100, 100),
2588
optimizer = torch.optim.Adam(model.parameters())
2589
with profile(with_stack=True, record_shapes=True) as prof:
2591
loss = torch.nn.functional.cross_entropy(
2592
y_hat, torch.randint(0, 10, (100,))
2596
optimizer.zero_grad()
2598
with tempfile.TemporaryDirectory() as tmpdir:
2599
report_all_anti_patterns(prof, json_report_dir=tmpdir, print_enable=False)
2601
with open(os.path.join(tmpdir, "torchtidy_report.json")) as f:
2602
report = json.load(f)
2605
keys = [k for k in report.keys() if k.endswith("test_profiler.py")]
2606
self.assertEqual(len(keys), 1, f"{keys}")
2607
entry = report[keys[0]]
2609
self.assertTrue(len(entry) > 0)
2610
expected_fields = sorted(["line_number", "name", "url", "message"])
2612
actual_fields = sorted(event.keys())
2613
self.assertEqual(expected_fields, actual_fields)
2615
@unittest.skipIf(IS_ARM64 or not IS_LINUX, "x86 linux only cpp unwinding")
2616
def test_fuzz_symbolize(self):
2619
def get_text_sections():
2622
for filename in os.listdir("/proc/self/map_files"):
2623
library = os.readlink("/proc/self/map_files/" + filename)
2624
if ".so" not in library or library in seen:
2627
with open(os.path.join("/proc/self/map_files", library), "rb") as f:
2628
mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
2630
def unpack(fmt, offset):
2631
return struct.unpack(
2632
fmt, mm[offset : offset + struct.calcsize(fmt)]
2635
if mm[:4] != b"\x7fELF":
2637
(section_headers_start,) = unpack("Q", 40)
2638
(section_header_size,) = unpack("H", 58)
2639
(num_section_headers,) = unpack("H", 60)
2640
(shstrndx,) = unpack("H", 62)
2641
(shstrtab_offset,) = unpack(
2642
"Q", section_headers_start + shstrndx * section_header_size + 24
2644
for i in range(num_section_headers):
2645
(section_name_offset,) = unpack(
2646
"I", section_headers_start + i * section_header_size
2648
name_start = shstrtab_offset + section_name_offset
2649
section_name = mm[name_start : name_start + 6]
2650
if section_name != b".text\0":
2652
(section_offset,) = unpack(
2653
"Q", section_headers_start + i * section_header_size + 24
2655
(section_size,) = unpack(
2656
"Q", section_headers_start + i * section_header_size + 32
2658
start = int(filename.split("-")[0], 16) + section_offset
2659
text_sections.append((start, section_size))
2662
return text_sections
2666
text_sections = get_text_sections()
2668
for i in range(200):
2669
s = r.randrange(0, len(text_sections))
2670
start, size = text_sections[s]
2671
addr = r.randrange(start, start + size)
2673
fast = torch._C._profiler.symbolize_addresses(addrs, "fast")
2674
dladdr = torch._C._profiler.symbolize_addresses(addrs, "dladdr")
2675
addr2line = torch._C._profiler.symbolize_addresses(addrs, "addr2line")
2676
self.assertEqual(len(fast), len(addrs))
2677
self.assertEqual(len(addr2line), len(fast))
2680
if __name__ == "__main__":