# Owner(s): ["oncall: profiler"]
import functools
import os
import re
import textwrap
import traceback
import unittest

import expecttest

import torch
from torch._C._profiler import _ExtraFields_PyCall, _ExtraFields_PyCCall
from torch.testing._internal.common_utils import (
    IS_ARM64,
    IS_WINDOWS,
    run_tests,
    skipIfTorchDynamo,
    TEST_WITH_CROSSREF,
    TestCase,
)
from torch.utils._pytree import tree_map

# These functions can vary based on platform and build (e.g. with CUDA)
# and generally distract from, rather than add to, the test.
PRUNE_ALL = 1
KEEP_ELLIPSES = 2
KEEP_NAME_AND_ELLIPSES = 3

PRUNE_FUNCTIONS = {
    "torch/utils/_pytree.py(...): tree_map": KEEP_NAME_AND_ELLIPSES,
    "torch/profiler/profiler.py(...): start": KEEP_ELLIPSES,
    "torch/profiler/profiler.py(...): stop_trace": KEEP_ELLIPSES,
    "torch/profiler/profiler.py(...): _transit_action": KEEP_ELLIPSES,
    "<built-in method __exit__ of torch._C.DisableTorchFunctionSubclass object at 0xXXXXXXXXXXXX>": PRUNE_ALL,
    "cudaStreamIsCapturing": PRUNE_ALL,
    # These show up only on CUDA; prune them so the CUDA and CPU expected results can be the same.
    "cudaGetDeviceCount": PRUNE_ALL,
    "cudaGetDeviceProperties_v2": PRUNE_ALL,
}
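
# The prune levels above are consumed by `ProfilerTree.format` below: PRUNE_ALL drops
# the node and all of its children, KEEP_ELLIPSES replaces the node with "...", and
# KEEP_NAME_AND_ELLIPSES keeps the node's name but collapses its children to "...".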

# ROCTracer is currently not producing events that the profiler can extract. We
# should bring it up to parity with the CUPTI Kineto / profiler integration, but in
# the meantime there is still utility in running the tests without checking that
# the values match the expected values.
#   1) We will still catch runtime errors and assert failures
#   2) We can diff the output to see how far we are from parity
#
# TODO: We also fail to capture events for Windows on some platforms.
ALLOW_CUDA_FAILURE = (torch.version.hip is not None) or IS_WINDOWS


class TorchFunctionTensor(torch.Tensor):
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        return super().__torch_function__(func, types, args, kwargs)
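
# TorchFunctionTensor above is a deliberately minimal subclass: its __torch_function__
# simply defers to the default implementation, so the torch_function test below
# exercises the Python __torch_function__ dispatch path without any custom behavior.
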
class TorchDispatchTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, elem):
        t = torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
        t.elem = elem
        return t

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        def unwrap(x):
            return x.elem if isinstance(x, TorchDispatchTensor) else x

        def wrap(x):
            return TorchDispatchTensor(x) if isinstance(x, torch.Tensor) else x

        args = tree_map(unwrap, args)
        kwargs = tree_map(unwrap, kwargs or {})

        return tree_map(wrap, func(*args, **kwargs))
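
# Sketch of how TorchDispatchTensor is exercised (see the torch_dispatch test below):
# every aten-level op on the subclass is routed through __torch_dispatch__, which
# unwraps the inputs to plain Tensors via tree_map, runs the op, and re-wraps the
# result, e.g.:
#
#   x = TorchDispatchTensor(torch.ones((1,)))
#   y = x + torch.ones((1,))  # the add is dispatched through __torch_dispatch__
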
"""Mark unit test that will be using ProfilerTree to test traces.
86
This decorator serves two purposes. First, it provides a method name
87
that `format` can use to tell where the test runner (which is
88
environment specific) ends and the unit test begins. Second, it runs
89
the test with replicates and allows `assertTreesMatch` to adjust
90
based on which replicate is running.
94
def begin_unit_test_marker(self, replicates=3):
96
for i in range(replicates):
97
self.tree_replicate = i
99
if self.tree_replicate is None:
103
delattr(self, "tree_replicate")
105
return begin_unit_test_marker
108
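
    # Typical usage (mirroring the tests below): decorate a TestCase method so it is
    # run several times and `assertTreesMatch` can compare the replicates, e.g.
    #
    #   @ProfilerTree.test
    #   def test_something(self):
    #       ...
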
    @classmethod
    def format(cls, profiler, indent: int = 0):
        def flatten(nodes, depth=0, out=None):
            if out is None:
                out = []

            for node in nodes:
                cls.validate_node(node)
                name = cls.fmt_name(node.name)
                prune_level = PRUNE_FUNCTIONS.get(name.strip(), None)
                if prune_level is None:
                    out.append((depth, name))
                    flatten(node.children, depth + 1, out)
                elif prune_level == KEEP_NAME_AND_ELLIPSES:
                    out.append((depth, name))
                    if node.children:
                        out.append((depth + 1, "..."))
                elif prune_level == KEEP_ELLIPSES:
                    out.append((depth, "..."))
                else:
                    assert prune_level == PRUNE_ALL

            return out

        flat_nodes = flatten(profiler.kineto_results.experimental_event_tree())

        # The profiler inserts a `cudaDeviceSynchronize` at the end of profiling
        # and may also insert a 'Context Sync' CUDA synchronization event.
        if flat_nodes and flat_nodes[-2][1] == "cudaDeviceSynchronize":
            flat_nodes = flat_nodes[:-2]

        if flat_nodes and flat_nodes[-1][1] == "cudaDeviceSynchronize":
            flat_nodes = flat_nodes[:-1]

        # The profiler inserts a `hipDeviceSynchronize` at the end of profiling on ROCm.
        if flat_nodes and flat_nodes[-1][1] == "hipDeviceSynchronize":
            flat_nodes = flat_nodes[:-1]

        min_depth = min(
            [d + 1 for d, name in flat_nodes if "begin_unit_test_marker" in name] or [0]
        )

        return textwrap.indent(
            "\n".join(
                [
                    f"{' ' * (d - min_depth)}{name.rstrip()}"
                    for d, name in flat_nodes
                    if d >= min_depth
                ]
            ),
            " " * indent,
        )
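
    # `format` returns a plain-text rendering of the event tree: one node per line,
    # children indented under their parents, with everything above the
    # `begin_unit_test_marker` wrapper (i.e. the environment-specific test runner
    # frames) trimmed away. The inline expected strings in the tests below are
    # compared against exactly this rendering.
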
    @staticmethod
    def fmt_name(name: str) -> str:
        match = re.match(r"^(.*)\.py\(([0-9]+)\): (.*)$", name)
        if match:
            filename, _, fn = match.groups()

            # This test can appear as `test/profiler/test_profiler_tree.py`
            # depending on where it is run from.
            test_file = os.path.splitext(os.path.split(__file__)[1])[0]
            if filename.endswith(test_file):
                filename = test_file

            # We test against a string literal, so all paths have to look like POSIX paths.
            filename = filename.replace(os.sep, "/")

            # We don't want to have to update this test every time PyTorch changes.
            # At some point we should test some line numbers, but for now it's
            # too brittle.
            lineno = "..."

            return f"{filename}.py({lineno}): {fn}"

        for kernel_pattern in (
            "void at::native::elementwise_kernel",
            "void at::native::reduce_kernel",
            "void at::native::vectorized_elementwise_kernel",
            "void at::native::unrolled_elementwise_kernel",
            r"void [a-zA-Z0-9]+_kernel",  # Nvidia kernels.
        ):
            name = re.sub(
                rf"{kernel_pattern}<.+>\(.+\)$",
                f"{kernel_pattern.replace('[a-zA-Z0-9]+', '...')}<...>(...)",
                name,
            )

        return re.sub("object at 0x[0-9a-fA-F]+>", "object at 0xXXXXXXXXXXXX>", name)
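
    # For example (illustrative inputs): a Python frame such as
    # "torch/_tensor.py(581): backward" is normalized to "torch/_tensor.py(...): backward",
    # and pointer addresses like "object at 0x7f01d5f82a00>" become
    # "object at 0xXXXXXXXXXXXX>" so traces are stable across runs.
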
    @classmethod
    def validate_node(cls, node):
        extra_fields = node.extra_fields
        if isinstance(extra_fields, (_ExtraFields_PyCall, _ExtraFields_PyCCall)):
            # Check that the lineage established by the profiler matches the
            # caller recorded by the Python tracer.
            parent = node.parent
            while parent is not None:
                if isinstance(parent.extra_fields, _ExtraFields_PyCall):
                    break
                parent = parent.parent

            def to_string(frame_state):
                return f"{frame_state.file_name}(...): {frame_state.function_name}"

            if parent is not None:
                parent_name = to_string(parent.extra_fields.callsite)
                caller_name = to_string(extra_fields.caller)
                assert parent_name == caller_name, f"{parent_name} vs. {caller_name}"


@unittest.skipIf(IS_ARM64, "Not working on ARM")
class TestProfilerTree(TestCase):
    def assertTreesMatch(self, actual: str, expected: str, allow_failure: bool = False):
        # Warning: Here be dragons
        #   Different platforms will have subtly different behavior for Python
        #   tracing. Observed differences include:
        #     1) Windows symbolicates names differently from posix
        #     2) The profile callback for c_call does not fire for Tensor.__pow__
        #        on certain platforms. This is not caused by the function tracer,
        #        but by CPython itself.
        #
        #   The purpose of these unit tests is to ensure that the profiler is
        #   doing reasonable things. When these platform dependent variations occur,
        #   simply coerce them into a platform independent form. If you made a
        #   change in the codebase which changes the trace produced, simply use
        #   EXPECTTEST_ACCEPT=1 to update the tests to reflect the new structure.
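
        # For example, one way to regenerate the inline expected strings (the exact
        # invocation may vary with your test runner):
        #   EXPECTTEST_ACCEPT=1 python test/profiler/test_profiler_tree.py -k test_profiler_experimental_tree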

        # expecttest will not show the diff view if `len(actual) < len(expected)`
        if not expecttest.ACCEPT:
            actual = actual.ljust(len(expected))

        replicate = getattr(self, "tree_replicate", None)
        self.assertIsNotNone(
            replicate, "Please annotate test with `@ProfilerTree.test`"
        )

        # The profiler should produce deterministic results and should return
        # to a clean state after each run. As a result, only the first
        # replicate is allowed to update `expected`. If subsequent runs do not
        # match, it is a bug in the profiler.
        if replicate:
            self.assertEqual(actual, expected)
        else:
            try:
                self.assertExpectedInline(actual, expected, skip=1)
            except AssertionError as e:
                if allow_failure:
                    self.tree_replicate = None
                    msg = traceback.format_exception_only(type(e), e)[0]
                    print(msg.split("AssertionError:")[-1])
                else:
                    raise

    # TODO: Add logic for CUDA version of test
    @ProfilerTree.test
    @unittest.skipIf(torch.cuda.is_available(), "Test not working for CUDA")
    def test_profiler_experimental_tree(self):
        t1, t2 = torch.ones(1, requires_grad=True), torch.ones(1, requires_grad=True)
        with torch.profiler.profile() as p:
            z = torch.add(t1, t2)

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            autograd::engine::evaluate_function: PowBackward0
            autograd::engine::evaluate_function: SubBackward0
            autograd::engine::evaluate_function: AddBackward0
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad
            aten::new_empty_strided
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad

    # TODO: Add logic for CUDA version of test
    @ProfilerTree.test
    @unittest.skipIf(torch.cuda.is_available(), "Test not working for CUDA")
    def test_profiler_experimental_tree_with_record_function(self):
        with torch.profiler.profile() as p:
            with torch.autograd.profiler.record_function("Top level Annotation"):
                with torch.autograd.profiler.record_function("First Annotation"):
                    x = torch.ones((1,), requires_grad=True)

                # Check that we correctly handle the case when a user
                # annotation does not call `__exit__`.
                _ = torch.autograd.profiler.record_function(
                    "Second Annotation"
                ).__enter__()

                with torch.autograd.profiler.record_function("Third Annotation"):

        # NB: The `aten::zeros` calls before the record function annotations are due to
        # `at::cpp_custom_type_hack`. When we switch to `torch::CustomClassHolder`
        # they will disappear.
        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            autograd::engine::evaluate_function: AddBackward0
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad
            aten::new_empty_strided

    # TODO: Add logic for CUDA version of test
    @ProfilerTree.test
    @unittest.skipIf(torch.cuda.is_available(), "Test not working for CUDA")
    def test_profiler_experimental_tree_with_memory(self):
        t1, t2 = torch.ones(1, requires_grad=True), torch.ones(1, requires_grad=True)
        with torch.profiler.profile(profile_memory=True) as p:
            z = torch.add(t1, t2)

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            autograd::engine::evaluate_function: PowBackward0
            autograd::engine::evaluate_function: SubBackward0
            autograd::engine::evaluate_function: AddBackward0
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad
            aten::new_empty_strided
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad

    @unittest.skip("https://github.com/pytorch/pytorch/issues/83606")
    @unittest.skipIf(
        TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite."
    )
    @ProfilerTree.test
    def test_profiler_experimental_tree_with_memory_and_stack(self):
        t1, t2 = torch.ones(1, requires_grad=True), torch.ones(1, requires_grad=True)
        with torch.profiler.profile(with_stack=True, profile_memory=True) as p:
            z = torch.add(t1, t2)
            y = torch.ones(1)
            loss = torch.pow(y - z, 2)
            loss.backward()

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            test_profiler_tree.py(...): test_profiler_experimental_tree_with_memory_and_stack
            torch/profiler/profiler.py(...): __enter__
            <built-in method add of type object at 0xXXXXXXXXXXXX>
            <built-in method ones of type object at 0xXXXXXXXXXXXX>
            <built-in method pow of type object at 0xXXXXXXXXXXXX>
            torch/_tensor.py(...): backward
            <built-in function _has_torch_function_unary>
            torch/autograd/__init__.py(...): backward
            <built-in method _are_functorch_transforms_active of PyCapsule object at 0xXXXXXXXXXXXX>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function len>
            torch/autograd/__init__.py(...): _tensor_or_tensors_to_tuple
            torch/autograd/__init__.py(...): _make_grads
            typing.py(...): inner
            typing.py(...): __hash__
            <built-in function hash>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in method numel of Tensor object at 0xXXXXXXXXXXXX>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in method ones_like of type object at 0xXXXXXXXXXXXX>
            <built-in method append of list object at 0xXXXXXXXXXXXX>
            torch/autograd/graph.py(...): _engine_run_backward
            logging/__init__.py(...): getEffectiveLevel
            <built-in method run_backward of torch._C._EngineBase object at 0xXXXXXXXXXXXX>
            autograd::engine::evaluate_function: PowBackward0
            autograd::engine::evaluate_function: SubBackward0
            autograd::engine::evaluate_function: AddBackward0
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad
            aten::new_empty_strided
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad
            torch/profiler/profiler.py(...): __exit__
            torch/profiler/profiler.py(...): stop

    @skipIfTorchDynamo("too slow")
    @unittest.skipIf(
        TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite."
    )
    @ProfilerTree.test
    def test_profiler_experimental_tree_with_stack_and_modules(self):
        class MyModule(torch.nn.Module):
            def __init__(self) -> None:
                super().__init__()
                self.layers = [
                    torch.nn.ReLU(),
                    torch.nn.Linear(1, 1),
                    torch.nn.ReLU(),
                ]

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                for l in self.layers:
                    x = l(x)
                return x

        model = MyModule()
        with torch.profiler.profile(with_stack=True) as p:
            model(torch.ones((1,)))

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            test_profiler_tree.py(...): test_profiler_experimental_tree_with_stack_and_modules
            torch/profiler/profiler.py(...): __enter__
            <built-in method ones of type object at 0xXXXXXXXXXXXX>
            nn.Module: MyModule_0
            torch/nn/modules/module.py(...): _call_impl
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            test_profiler_tree.py(...): forward
            torch/nn/modules/module.py(...): _call_impl
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/nn/modules/activation.py(...): forward
            torch/nn/functional.py(...): relu
            <built-in function _has_torch_function_unary>
            <built-in method relu of type object at 0xXXXXXXXXXXXX>
            torch/nn/modules/module.py(...): _call_impl
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/nn/modules/linear.py(...): forward
            torch/nn/modules/module.py(...): __getattr__
            torch/nn/modules/module.py(...): __getattr__
            <built-in function linear>
            torch/nn/modules/module.py(...): _call_impl
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/nn/modules/activation.py(...): forward
            torch/nn/functional.py(...): relu
            <built-in function _has_torch_function_unary>
            <built-in method relu of type object at 0xXXXXXXXXXXXX>
            <built-in method ones of type object at 0xXXXXXXXXXXXX>
            nn.Module: MyModule_0
            torch/nn/modules/module.py(...): _call_impl
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            test_profiler_tree.py(...): forward
            torch/nn/modules/module.py(...): _call_impl
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/nn/modules/activation.py(...): forward
            torch/nn/functional.py(...): relu
            <built-in function _has_torch_function_unary>
            <built-in method relu of type object at 0xXXXXXXXXXXXX>
            torch/nn/modules/module.py(...): _call_impl
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/nn/modules/linear.py(...): forward
            torch/nn/modules/module.py(...): __getattr__
            torch/nn/modules/module.py(...): __getattr__
            <built-in function linear>
            torch/nn/modules/module.py(...): _call_impl
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/nn/modules/activation.py(...): forward
            torch/nn/functional.py(...): relu
            <built-in function _has_torch_function_unary>
            <built-in method relu of type object at 0xXXXXXXXXXXXX>
            torch/profiler/profiler.py(...): __exit__
            torch/profiler/profiler.py(...): stop

    @unittest.skipIf(
        TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite."
    )
    @ProfilerTree.test
    def test_profiler_experimental_tree_with_stack_and_torch_function(self):
        x = TorchFunctionTensor(torch.ones((1,)))
        y = torch.ones((1,))

        # There's some lazy initialization in __torch_function__. If we don't
        # run this, the first run won't match the replicates.
        torch.add(x, y)

        with torch.profiler.profile(with_stack=True) as p:
            torch.add(x, y)

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            test_profiler_tree.py(...): test_profiler_experimental_tree_with_stack_and_torch_function
            torch/profiler/profiler.py(...): __enter__
            <built-in method add of type object at 0xXXXXXXXXXXXX>
            test_profiler_tree.py(...): __torch_function__
            torch/_tensor.py(...): __torch_function__
            <built-in function all>
            torch/_tensor.py(...): <genexpr>
            <built-in function issubclass>
            torch/_tensor.py(...): <genexpr>
            <built-in method add of type object at 0xXXXXXXXXXXXX>
            torch/_tensor.py(...): _convert
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in method as_subclass of Tensor object at 0xXXXXXXXXXXXX>
            <built-in function isinstance>
            torch/profiler/profiler.py(...): __exit__
            torch/profiler/profiler.py(...): stop

    @unittest.skipIf(
        TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite."
    )
    @ProfilerTree.test
    def test_profiler_experimental_tree_with_stack_and_torch_dispatch(self):
        x = TorchDispatchTensor(torch.ones((1,)))
        y = torch.ones((1,))

        with torch.profiler.profile(with_stack=True):
            x + y

        with torch.profiler.profile(with_stack=True) as p:
            x + y

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            test_profiler_tree.py(...): test_profiler_experimental_tree_with_stack_and_torch_dispatch
            torch/profiler/profiler.py(...): __enter__
            torch/_library/simple_registry.py(...): find_torch_dispatch_rule
            torch/_library/simple_registry.py(...): find
            torch/_library/simple_registry.py(...): find
            <built-in method get of dict object at 0xXXXXXXXXXXXX>
            test_profiler_tree.py(...): __torch_dispatch__
            torch/utils/_pytree.py(...): tree_map
            torch/utils/_pytree.py(...): tree_map
            torch/_ops.py(...): __call__
            <built-in method of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/utils/_pytree.py(...): tree_map
            torch/profiler/profiler.py(...): __exit__
            torch/profiler/profiler.py(...): stop

    @unittest.skip("https://github.com/pytorch/pytorch/issues/83606")
    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
    @ProfilerTree.test
    def test_profiler_experimental_tree_cuda(self):
        with torch.profiler.profile(profile_memory=True) as p:
            weight = torch.ones(1, device="cuda", requires_grad=True)
            x = torch.ones(1, device="cuda")
            y = torch.add(weight, x)
            loss = torch.pow(y, 2)
            loss.backward()
            torch.optim.SGD([weight], lr=0.01, momentum=0.9).step()

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            autograd::engine::evaluate_function: PowBackward0
            Memcpy DtoD (Device -> Device)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            autograd::engine::evaluate_function: AddBackward0
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad
            Optimizer.step#SGD.step
            Memcpy DtoD (Device -> Device)
            void at::native::vectorized_elementwise_kernel<...>(...)
            [memory]""",  # noqa: B950
            allow_failure=ALLOW_CUDA_FAILURE,
        )

    @unittest.skip("https://github.com/pytorch/pytorch/issues/83606")
    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
    @ProfilerTree.test
    def test_profiler_experimental_tree_cuda_with_stream(self):
        streams = [torch.cuda.Stream() for _ in range(3)]
        results = []
        with torch.profiler.profile(profile_memory=True) as p:
            x = torch.ones((4, 4), device="cuda")
            for stream in streams:
                with torch.cuda.stream(stream):
                    results.append(torch.tanh(x) - x)
            for s in streams:
                torch.cuda.current_stream().wait_stream(s)

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            void at::native::vectorized_elementwise_kernel<...>(...)
            allow_failure=ALLOW_CUDA_FAILURE,

    @unittest.skip("https://github.com/pytorch/pytorch/issues/83606")
    @unittest.skipIf(
        TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite."
    )
    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
    @ProfilerTree.test
    def test_profiler_experimental_tree_cuda_detailed(self):
        # Do lazy imports ahead of time to avoid them showing up in the tree
        import torch.nested._internal.nested_tensor

        model = torch.nn.modules.Linear(1, 1, device="cuda")
        opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

        def step():
            x = torch.ones((1, 1), device="cuda")

        with torch.profiler.profile(profile_memory=True, with_stack=True) as p:
            step()

        self.assertTreesMatch(
            ProfilerTree.format(p.profiler, 12),
            test_profiler_tree.py(...): test_profiler_experimental_tree_cuda_detailed
            torch/profiler/profiler.py(...): __enter__
            test_profiler_tree.py(...): step
            <built-in method ones of type object at 0xXXXXXXXXXXXX>
            void at::native::vectorized_elementwise_kernel<...>(...)
            <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/nn/modules/linear.py(...): forward
            torch/nn/modules/module.py(...): __getattr__
            torch/nn/modules/module.py(...): __getattr__
            <built-in function linear>
            Memcpy DtoD (Device -> Device)
            void ..._kernel<...>(...)
            torch/_tensor.py(...): backward
            <built-in function _has_torch_function_unary>
            torch/autograd/__init__.py(...): backward
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function len>
            torch/autograd/__init__.py(...): _tensor_or_tensors_to_tuple
            torch/autograd/__init__.py(...): _make_grads
            typing.py(...): inner
            typing.py(...): __hash__
            <built-in function hash>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in method numel of Tensor object at 0xXXXXXXXXXXXX>
            <built-in function isinstance>
            <built-in function isinstance>
            <built-in method ones_like of type object at 0xXXXXXXXXXXXX>
            void at::native::vectorized_elementwise_kernel<...>(...)
            <built-in method append of list object at 0xXXXXXXXXXXXX>
            <built-in method run_backward of torch._C._EngineBase object at 0xXXXXXXXXXXXX>
            autograd::engine::evaluate_function: AddmmBackward0
            void ..._kernel<...>(...)
            void at::native::reduce_kernel<...>(...)
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad
            void at::native::vectorized_elementwise_kernel<...>(...)
            autograd::engine::evaluate_function: TBackward0
            autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
            torch::autograd::AccumulateGrad
            void at::native::vectorized_elementwise_kernel<...>(...)
            torch/optim/optimizer.py(...): wrapper
            <built-in method format of str object at 0xXXXXXXXXXXXX>
            torch/autograd/profiler.py(...): __init__
            <built-in method zeros of type object at 0xXXXXXXXXXXXX>
            torch/autograd/profiler.py(...): __enter__
            torch/_ops.py(...): __call__
            <built-in method _record_function_enter of PyCapsule object at 0xXXXXXXXXXXXX>
            Optimizer.step#SGD.step
            torch/optim/optimizer.py(...): _use_grad
            <built-in function is_grad_enabled>
            torch/autograd/grad_mode.py(...): __init__
            <built-in function is_grad_enabled>
            <built-in function _set_grad_enabled>
            torch/optim/sgd.py(...): step
            <built-in method append of list object at 0xXXXXXXXXXXXX>
            <built-in method append of list object at 0xXXXXXXXXXXXX>
            torch/_tensor.py(...): __hash__
            <built-in function id>
            <built-in method append of list object at 0xXXXXXXXXXXXX>
            <built-in method append of list object at 0xXXXXXXXXXXXX>
            <built-in method append of list object at 0xXXXXXXXXXXXX>
            torch/_tensor.py(...): __hash__
            <built-in function id>
            <built-in method append of list object at 0xXXXXXXXXXXXX>
            torch/optim/sgd.py(...): sgd
            torch/optim/sgd.py(...): _single_tensor_sgd
            <built-in method mul_ of Tensor object at 0xXXXXXXXXXXXX>
            void at::native::vectorized_elementwise_kernel<...>(...)
            <built-in method add_ of Tensor object at 0xXXXXXXXXXXXX>
            void at::native::vectorized_elementwise_kernel<...>(...)
            <built-in method add_ of Tensor object at 0xXXXXXXXXXXXX>
            void at::native::vectorized_elementwise_kernel<...>(...)
            <built-in method mul_ of Tensor object at 0xXXXXXXXXXXXX>
            void at::native::vectorized_elementwise_kernel<...>(...)
            <built-in method add_ of Tensor object at 0xXXXXXXXXXXXX>
            void at::native::vectorized_elementwise_kernel<...>(...)
            <built-in method add_ of Tensor object at 0xXXXXXXXXXXXX>
            void at::native::vectorized_elementwise_kernel<...>(...)
            torch/_tensor.py(...): __hash__
            <built-in function id>
            torch/_tensor.py(...): __hash__
            <built-in function id>
            torch/autograd/grad_mode.py(...): __init__
            <built-in function is_grad_enabled>
            <built-in function _set_grad_enabled>
            torch/autograd/profiler.py(...): __exit__
            torch/_ops.py(...): __call__
            <built-in method _record_function_exit of PyCapsule object at 0xXXXXXXXXXXXX>
            torch/profiler/profiler.py(...): __exit__
            torch/profiler/profiler.py(...): stop
            torch/profiler/profiler.py(...): _transit_action
            <built-in method get of dict object at 0xXXXXXXXXXXXX>
            enum.py(...): __hash__
            <built-in function hash>
            ...""",  # noqa: B950
            allow_failure=ALLOW_CUDA_FAILURE,
        )


if __name__ == "__main__":