5
from collections import defaultdict, namedtuple
6
from operator import attrgetter
7
from typing import Any, Dict, List, Optional, Tuple
8
from typing_extensions import deprecated
11
from torch.autograd import DeviceType
16
"FormattedTimesMixin",
27
"""A list of Events (for pretty printing)."""
29
def __init__(self, *args, **kwargs):
    """Build the event list, consuming profiler options from kwargs.

    The keyword options ``use_device``, ``profile_memory`` and
    ``with_flops`` are popped here before the remaining arguments are
    forwarded to the superclass constructor.
    """
    device = kwargs.pop("use_device", None)
    track_memory = kwargs.pop("profile_memory", False)
    track_flops = kwargs.pop("with_flops", False)
    super().__init__(*args, **kwargs)
    self._use_device = device
    self._profile_memory = track_memory
    self._tree_built = False
    self._with_flops = track_flops
39
def _build_tree(self):
    """Post-process the flat event list into a parent/child tree.

    Populates CPU children, removes duplicate nodes, resolves
    backward-pass stack traces, then sets the ``_tree_built`` flag
    (which ``key_averages`` asserts on before grouping).
    """
    # NOTE: these steps are order-dependent; each relies on the
    # structure produced by the previous one.
    self._populate_cpu_children()
    self._remove_dup_nodes()
    self._set_backward_stacktraces()
    self._tree_built = True
48
def _remove_dup_nodes(self):
51
for idx in range(len(self)):
53
self[idx].cpu_parent is not None
54
and self[idx].cpu_parent.name == self[idx].name
55
and len(self[idx].cpu_parent.cpu_children) == 1
57
self[idx].cpu_parent.cpu_children = self[idx].cpu_children
58
self[idx].cpu_parent.kernels = self[idx].kernels
59
for ch in self[idx].cpu_children:
60
ch.cpu_parent = self[idx].cpu_parent
62
if len(to_delete) == 0:
64
new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
68
def _populate_cpu_children(self):
69
"""Populate child events into each underlying FunctionEvent object.
71
One event is a child of another if [s1, e1) is inside [s2, e2). Where
72
s1 and e1 would be start and end of the child event's interval. And
73
s2 and e2 start and end of the parent event's interval
75
Example: In event list [[0, 10], [1, 3], [3, 4]] would have make [0, 10]
76
be a parent of two other intervals.
78
If for any reason two intervals intersect only partially, this function
79
will not record a parent child relationship between then.
87
if not evt.is_async and evt.device_type == DeviceType.CPU
91
key=attrgetter("thread"),
96
threads = itertools.groupby(
97
events, key=lambda event: (event.thread, event.node_id)
112
for thread_id, thread_events in threads:
113
thread_events_ = sorted(
115
key=lambda event: [event.time_range.start, -event.time_range.end],
117
current_events: List[FunctionEvent] = []
119
for event in thread_events_:
120
while len(current_events) > 0:
121
parent = current_events[-1]
123
event.time_range.start >= parent.time_range.end
124
or event.time_range.end > parent.time_range.end
129
parent.append_cpu_child(event)
131
event.cpu_parent is None
132
), f"There is already a CPU parent event for {event.key}"
133
event.set_cpu_parent(parent)
136
current_events.append(event)
138
def _set_backward_stacktraces(self):
145
return bw_parent(evt.cpu_parent)
149
if bw_parent(evt) is None and evt.stack is not None:
150
t = (evt.sequence_nr, evt.thread)
151
if t not in fwd_stacks:
152
fwd_stacks[t] = evt.stack
157
assert p.fwd_thread is not None
158
t = (p.sequence_nr, p.fwd_thread)
160
evt.stack = fwd_stacks[t]
165
def self_cpu_time_total(self):
166
return sum(event.self_cpu_time_total for event in self)
172
max_src_column_width=75,
173
max_name_column_width=55,
174
max_shapes_column_width=80,
176
top_level_events_only=False,
178
"""Print an EventList as a nicely formatted table.
181
sort_by (str, optional): Attribute used to sort entries. By default
182
they are printed in the same order as they were registered.
183
Valid keys include: ``cpu_time``, ``cuda_time``, ``xpu_time``,
184
``cpu_time_total``, ``cuda_time_total``, ``xpu_time_total``,
185
``cpu_memory_usage``, ``cuda_memory_usage``, ``xpu_memory_usage``,
186
``self_cpu_memory_usage``, ``self_cuda_memory_usage``,
187
``self_xpu_memory_usage``, ``count``.
188
top_level_events_only(bool, optional): Boolean flag to determine the
189
selection of events to display. If true, the profiler will only
190
display events at top level like top-level invocation of python
191
`lstm`, python `add` or other functions, nested events like low-level
192
cpu/cuda/xpu ops events are omitted for profiler result readability.
195
A string containing the table.
201
max_src_column_width=max_src_column_width,
202
max_name_column_width=max_name_column_width,
203
max_shapes_column_width=max_shapes_column_width,
205
profile_memory=self._profile_memory,
206
with_flops=self._with_flops,
207
top_level_events_only=top_level_events_only,
210
def export_chrome_trace(self, path):
211
"""Export an EventList as a Chrome tracing tools file.
213
The checkpoint can be later loaded and inspected under ``chrome://tracing`` URL.
216
path (str): Path where the trace will be written.
220
device_name = "cuda" if not self._use_device else self._use_device
221
with open(path, "w") as f:
228
if evt.trace_name is None:
236
'"pid": "CPU functions", '
237
'"args": {{}}}}, '.format(
239
evt.time_range.start,
240
evt.time_range.elapsed_us(),
243
else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "',
246
for k in evt.kernels:
250
f'{{"name": "{evt.trace_name}", '
252
f'"ts": {evt.time_range.start}, '
253
f'"tid": {evt.thread}, '
254
'"pid": "CPU functions", '
256
f'"cat": "cpu_to_{device_name}", '
263
f.seek(f.tell() - 2, os.SEEK_SET)
267
def supported_export_stacks_metrics(self):
    """Return the metric names accepted by ``export_stacks``.

    Reconstructed: the extraction dropped the ``return [`` / ``]``
    lines around the (fully visible) list of metric names.
    """
    return [
        "self_cpu_time_total",
        "self_cuda_time_total",
        "self_xpu_time_total",
        "self_privateuse1_time_total",
    ]
275
def export_stacks(self, path: str, metric: str):
276
if metric not in self.supported_export_stacks_metrics():
278
"metric should be one of: "
279
+ str(self.supported_export_stacks_metrics())
281
translate_table = str.maketrans(" ;\t\n", "____")
282
with open(path, "w") as f:
284
if evt.stack and len(evt.stack) > 0:
285
metric_value = getattr(
287
metric.replace("cuda", "device")
288
.replace("xpu", "device")
289
.replace("privateuse1", "device"),
291
if int(metric_value) > 0:
293
for entry in reversed(evt.stack):
294
stack_str += entry.translate(translate_table)
296
stack_str = stack_str[:-1] + " " + str(int(metric_value))
297
f.write(stack_str + "\n")
299
def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
300
"""Averages all function events over their keys.
303
group_by_input_shapes: group entries by
304
(event name, input shapes) rather than just event name.
305
This is useful to see which input shapes contribute to the runtime
306
the most and may help with size-specific optimizations or
307
choosing the best candidates for quantization (aka fitting a roof line)
309
group_by_stack_n: group by top n stack trace entries
312
An EventList containing FunctionEventAvg objects.
314
assert self._tree_built
315
stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)
317
def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
321
str(event.device_type),
322
str(event.is_legacy),
323
str(event.is_user_annotation),
325
if group_by_input_shapes:
326
key.append(str(event.input_shapes))
327
if group_by_stack_n > 0:
328
key += event.stack[:group_by_stack_n]
332
stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt)
334
avg_list = EventList(
336
use_device=self._use_device,
337
profile_memory=self._profile_memory,
338
with_flops=self._with_flops,
341
evt.stack = evt.stack[:group_by_stack_n]
342
if not group_by_input_shapes:
343
evt.input_shapes = ""
346
def total_average(self):
347
"""Averages all events.
350
A FunctionEventAvg object.
352
total_stat = FunctionEventAvg()
355
total_stat.key = None
356
total_stat.key = "Total"
360
def _format_time(time_us):
361
"""Define how to format time in FunctionEvent."""
362
US_IN_SECOND = 1000.0 * 1000.0
364
if time_us >= US_IN_SECOND:
365
return f"{time_us / US_IN_SECOND:.3f}s"
366
if time_us >= US_IN_MS:
367
return f"{time_us / US_IN_MS:.3f}ms"
368
return f"{time_us:.3f}us"
371
def _format_time_share(time_us, total_time_us):
372
"""Define how to format time in FunctionEvent."""
373
if total_time_us == 0:
374
assert time_us == 0, f"Expected time_us == 0 but got {time_us}"
376
return f"{time_us * 100.0 / total_time_us:.2f}%"
379
def _format_memory(nbytes):
380
"""Return a formatted memory size string."""
384
if abs(nbytes) >= GB:
385
return f"{nbytes * 1.0 / GB:.2f} Gb"
386
elif abs(nbytes) >= MB:
387
return f"{nbytes * 1.0 / MB:.2f} Mb"
388
elif abs(nbytes) >= KB:
389
return f"{nbytes * 1.0 / KB:.2f} Kb"
391
return str(nbytes) + " b"
394
def _attr_formatter(name):
395
return property(lambda self: _format_time(getattr(self, name)))
398
class FormattedTimesMixin:
399
"""Helpers for FunctionEvent and FunctionEventAvg.
401
The subclass should define `*_time_total` and `count` attributes.
404
cpu_time_str = _attr_formatter("cpu_time")
405
device_time_str = _attr_formatter("device_time")
406
cpu_time_total_str = _attr_formatter("cpu_time_total")
407
device_time_total_str = _attr_formatter("device_time_total")
408
self_cpu_time_total_str = _attr_formatter("self_cpu_time_total")
409
self_device_time_total_str = _attr_formatter("self_device_time_total")
413
return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count
416
def device_time(self):
    """Average device time per call; 0.0 when nothing was recorded."""
    if self.count == 0:
        return 0.0
    return self.device_time_total / self.count
421
"`cuda_time` is deprecated, please use `device_time` instead.",
422
category=FutureWarning,
425
return self.device_time
429
def __init__(self, start, end):
433
def elapsed_us(self):
    """Return the length of the interval (``end - start``)."""
    return self.end - self.start
440
# Lightweight record of a single device kernel: its name, the device it
# ran on, and its duration.
Kernel = namedtuple("Kernel", ["name", "device", "duration"])
443
class FunctionEvent(FormattedTimesMixin):
444
"""Profiling information about a single function."""
459
device_memory_usage=0,
464
device_type=DeviceType.CPU,
466
device_resource_id=None,
470
concrete_inputs=None,
472
is_user_annotation=False,
475
self.node_id: int = node_id
476
self.name: str = name
477
self.trace_name: str = trace_name
478
self.time_range: Interval = Interval(start_us, end_us)
479
self.thread: int = thread
480
self.fwd_thread: Optional[int] = fwd_thread
481
self.kernels: List[Kernel] = []
483
self.cpu_children: List[FunctionEvent] = []
484
self.cpu_parent: Optional[FunctionEvent] = None
485
self.input_shapes: Tuple[int, ...] = input_shapes
486
self.concrete_inputs: List[Any] = concrete_inputs
487
self.kwinputs: Dict[str, Any] = kwinputs
488
self.stack: List = stack
489
self.scope: int = scope
490
self.use_device: Optional[str] = use_device
491
self.cpu_memory_usage: int = cpu_memory_usage
492
self.device_memory_usage: int = device_memory_usage
493
self.is_async: bool = is_async
494
self.is_remote: bool = is_remote
495
self.sequence_nr: int = sequence_nr
496
self.device_type: DeviceType = device_type
497
self.device_index: int = device_index
498
self.device_resource_id: int = (
499
thread if device_resource_id is None else device_resource_id
501
self.is_legacy: bool = is_legacy
502
self.flops: Optional[int] = flops
503
self.is_user_annotation: Optional[bool] = is_user_annotation
505
def append_kernel(self, name, device, duration):
    """Attach a ``Kernel(name, device, duration)`` record to this event."""
    # Kernels may only be recorded under CPU-side events.
    assert self.device_type == DeviceType.CPU
    self.kernels.append(Kernel(name, device, duration))
509
def append_cpu_child(self, child):
    """Register *child* as a direct CPU child of this event.

    Only direct children should be appended, otherwise the self cpu
    time reported for this event would be wrong.
    """
    assert self.device_type == DeviceType.CPU
    assert isinstance(child, FunctionEvent)
    assert child.device_type == DeviceType.CPU
    self.cpu_children.append(child)
520
def set_cpu_parent(self, parent):
    """Record *parent* as this event's single immediate CPU parent.

    An event has at most one CPU parent, whose time range fully
    contains the child's; this link is used to tell top-level ops
    apart from nested ones.
    """
    assert self.device_type == DeviceType.CPU
    assert isinstance(parent, FunctionEvent)
    assert parent.device_type == DeviceType.CPU
    self.cpu_parent = parent
535
def self_cpu_memory_usage(self):
536
if self.is_async or self.device_type != DeviceType.CPU:
538
return self.cpu_memory_usage - sum(
539
child.cpu_memory_usage for child in self.cpu_children
543
def self_device_memory_usage(self):
544
if self.is_async or self.device_type != DeviceType.CPU:
546
return self.device_memory_usage - sum(
547
child.device_memory_usage for child in self.cpu_children
552
"`self_cuda_memory_usage` is deprecated. Use `self_device_memory_usage` instead.",
553
category=FutureWarning,
555
def self_cuda_memory_usage(self):
556
return self.self_device_memory_usage
559
def cpu_time_total(self):
560
if self.device_type == DeviceType.CPU:
561
return self.time_range.elapsed_us()
566
def self_cpu_time_total(self):
567
if self.is_async or self.device_type != DeviceType.CPU:
569
return self.cpu_time_total - sum(
570
child.cpu_time_total for child in self.cpu_children
574
def device_time_total(self):
575
if self.is_async or not self.use_device:
577
if self.device_type == DeviceType.CPU:
578
if not self.is_legacy:
580
return sum(kinfo.duration for kinfo in self.kernels) + sum(
581
ch.device_time_total for ch in self.cpu_children
585
return sum(kinfo.duration for kinfo in self.kernels)
587
assert self.device_type in [
589
DeviceType.PrivateUse1,
592
return self.time_range.elapsed_us()
596
"`cuda_time_total` is deprecated. Use `device_time_total` instead.",
597
category=FutureWarning,
599
def cuda_time_total(self):
600
return self.device_time_total
603
def self_device_time_total(self):
604
if self.is_async or not self.use_device:
606
if self.device_type == DeviceType.CPU:
607
return self.device_time_total - sum(
608
child.device_time_total for child in self.cpu_children
611
assert self.device_type in [
613
DeviceType.PrivateUse1,
616
return self.device_time_total
620
"`self_cuda_time_total` is deprecated. Use `self_device_time_total` instead.",
621
category=FutureWarning,
623
def self_cuda_time_total(self):
624
return self.self_device_time_total
631
device_name = self.use_device
632
device_time = self.device_time_str
633
device_memory_usage = self.device_memory_usage
635
f"<FunctionEvent id={self.id} name={self.name} device_type={self.device_type} node_id={self.node_id} "
636
f"cpu_time={self.cpu_time_str} start_us={self.time_range.start} end_us={self.time_range.end} "
637
f"cpu_children={str([child.id for child in self.cpu_children])} {device_name}_time={device_time} "
638
f"name={self.name} thread={self.thread} input_shapes={str(self.input_shapes)} "
639
f"cpu_memory_usage={self.cpu_memory_usage} {device_name}_memory_usage={device_memory_usage} "
640
f"is_async={self.is_async} is_remote={self.is_remote} seq_nr={self.sequence_nr} is_legacy={self.is_legacy}>"
644
class FunctionEventAvg(FormattedTimesMixin):
645
"""Used to average stats over multiple FunctionEvent objects."""
647
def __init__(self) -> None:
648
self.key: Optional[str] = None
650
self.node_id: int = 0
651
self.is_async: bool = False
652
self.is_remote: bool = False
653
self.use_device: Optional[str] = None
654
self.cpu_time_total: int = 0
655
self.device_time_total: int = 0
656
self.self_cpu_time_total: int = 0
657
self.self_device_time_total: int = 0
658
self.input_shapes: Optional[List[List[int]]] = None
659
self.stack: Optional[List] = None
660
self.scope: Optional[int] = None
661
self.cpu_memory_usage: int = 0
662
self.device_memory_usage: int = 0
663
self.self_cpu_memory_usage: int = 0
664
self.self_device_memory_usage: int = 0
665
self.cpu_children: Optional[List[FunctionEvent]] = None
666
self.cpu_parent: Optional[FunctionEvent] = None
667
self.device_type: DeviceType = DeviceType.CPU
668
self.is_legacy: bool = False
671
def add(self, other):
676
self.node_id = other.node_id
677
self.is_async = other.is_async
678
self.is_remote = other.is_remote
679
self.cpu_parent = other.cpu_parent
680
self.cpu_children = other.cpu_children
682
self.input_shapes = other.input_shapes
683
self.stack = other.stack
684
self.scope = other.scope
685
self.device_type = other.device_type
686
self.is_legacy = other.is_legacy
687
self.use_device = other.use_device
688
self.is_user_annotation = other.is_user_annotation
690
assert isinstance(other, (FunctionEvent, FunctionEventAvg))
691
assert other.key == self.key
692
self.cpu_time_total += other.cpu_time_total
693
self.device_time_total += other.device_time_total
694
self.self_cpu_time_total += other.self_cpu_time_total
695
self.self_device_time_total += other.self_device_time_total
696
self.cpu_memory_usage += other.cpu_memory_usage
697
self.device_memory_usage += other.device_memory_usage
698
self.self_cpu_memory_usage += other.self_cpu_memory_usage
699
self.self_device_memory_usage += other.self_device_memory_usage
700
self.count += other.count
701
if self.flops is None:
702
self.flops = other.flops
703
elif other.flops is not None:
704
self.flops += other.flops
707
def __iadd__(self, other):
    # Supports ``avg += evt`` by delegating to add().
    return self.add(other)
711
device_name = "cuda" if not self.use_device else self.use_device
712
self_device_time = self.self_device_time_total_str
713
device_time = self.device_time_str
714
device_memory = self.device_memory_usage
716
f"<FunctionEventAvg key={self.key} self_cpu_time={self.self_cpu_time_total_str} cpu_time={self.cpu_time_str} "
717
f" self_{device_name}_time={self_device_time} {device_name}_time={device_time} input_shapes={str(self.input_shapes)} "
718
f"cpu_memory_usage={self.cpu_memory_usage} {device_name}_memory_usage={device_memory}>"
722
class StringTable(defaultdict):
723
def __missing__(self, key):
727
self[key] = torch._C._demangle(key) if len(key) > 1 else key
732
"""Acceleration structure for accessing mem_records in interval."""
734
def __init__(self, mem_records):
735
self._mem_records = mem_records
736
self._start_nses: List[int] = []
737
self._indices: List[int] = []
738
if len(mem_records) > 0:
739
tmp = sorted([(r[0].start_ns(), i) for i, r in enumerate(mem_records)])
740
self._start_nses, self._indices = zip(*tmp)
742
def in_interval(self, start_us, end_us):
    """Yield every record whose start time lies in the given interval.

    Bounds are taken in microseconds and converted to nanoseconds
    here, to keep the interface backward compatible.
    """
    lo = bisect.bisect_left(self._start_nses, start_us * 1000)
    hi = bisect.bisect_right(self._start_nses, end_us * 1000)
    for idx in range(lo, hi):
        yield self._mem_records[self._indices[idx]]
753
def _filter_stack_entry(entry):
755
("autograd/__init__", "_make_grads"),
756
("autograd/__init__", "backward"),
757
("torch/tensor", "backward"),
758
("_internal/common_utils", "prof_callable"),
759
("_internal/common_utils", "prof_func_call"),
760
("_internal/common_utils", "prof_meth_call"),
762
return all(not (f[0] in entry and f[1] in entry) for f in filtered_entries)
765
# Synthetic event label used for memory allocation records.
MEMORY_EVENT_NAME = "[memory]"
766
# Synthetic event label used for out-of-memory records.
OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]"
769
def _filter_name(name):
771
filtered_out_names = [
773
OUT_OF_MEMORY_EVENT_NAME,
774
"profiler::_record_function_enter",
775
"profiler::_record_function_enter_new",
776
"profiler::_record_function_exit",
781
return name in filtered_out_names
788
def _rewrite_name(name, with_wildcard=False):
789
string_table = StringTable()
790
name = string_table[name]
792
if name.startswith("ProfilerStep#"):
793
name = "ProfilerStep*"
802
max_src_column_width=75,
803
max_name_column_width=55,
804
max_shapes_column_width=80,
806
profile_memory=False,
807
top_level_events_only=False,
809
"""Print a summary of events (which can be a list of FunctionEvent or FunctionEventAvg)."""
813
has_device_time = any(event.self_device_time_total > 0 for event in events)
814
has_device_mem = any(event.self_device_memory_usage > 0 for event in events)
815
use_device = events[0].use_device
819
if not use_device and has_device_time:
820
raise RuntimeError("use_device is None, but there is device performance data.")
822
has_input_shapes = any(
823
(event.input_shapes is not None and len(event.input_shapes) > 0)
827
if sort_by is not None:
831
key=lambda evt: getattr(
833
sort_by.replace("cuda", "device")
834
.replace("xpu", "device")
835
.replace("privateuse1", "device"),
839
use_device=use_device,
840
profile_memory=profile_memory,
841
with_flops=with_flops,
844
name_column_width = max(len(evt.key) for evt in events) + 4
845
if max_name_column_width is not None:
846
name_column_width = min(name_column_width, max_name_column_width)
848
shapes_column_width = max(len(str(evt.input_shapes)) for evt in events) + 4
849
if max_shapes_column_width is not None:
850
shapes_column_width = min(shapes_column_width, max_shapes_column_width)
852
DEFAULT_COLUMN_WIDTH = 12
853
flops_column_width = DEFAULT_COLUMN_WIDTH
855
src_column_width = None
858
if evt.stack is not None and len(evt.stack) > 0:
859
stacks.append(evt.stack)
860
has_stack = len(stacks) > 0
863
max(max(len(entry) for entry in stack) for stack in stacks) + 4
865
if max_src_column_width is not None:
866
src_column_width = min(src_column_width, max_src_column_width)
876
device_name = use_device.upper() if use_device is not None else "None"
880
f"Self {device_name}",
881
f"Self {device_name} %",
882
f"{device_name} total",
883
f"{device_name} time avg",
893
if use_device and has_device_mem:
896
f"{device_name} Mem",
897
f"Self {device_name} Mem",
900
headers.append("# of Calls")
902
append_node_id = any(evt.node_id != -1 for evt in events)
904
headers.append("Node ID")
908
row_format_lst = [""]
909
header_sep_lst = [""]
910
line_length_lst = [-SPACING_SIZE]
912
def add_column(padding, text_dir=">"):
913
row_format_lst[0] += (
914
"{: " + text_dir + str(padding) + "}" + (" " * SPACING_SIZE)
916
header_sep_lst[0] += "-" * padding + (" " * SPACING_SIZE)
917
line_length_lst[0] += padding + SPACING_SIZE
919
def auto_scale_flops(flops):
929
log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
930
assert log_flops >= 0 and log_flops < len(flop_headers)
931
return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])
933
add_column(name_column_width)
934
for _ in headers[1:]:
935
add_column(DEFAULT_COLUMN_WIDTH)
938
headers.append("Input Shapes")
939
add_column(shapes_column_width)
942
headers.append("Source Location")
943
add_column(src_column_width, text_dir="<")
950
raw_flops.append(evt.flops)
951
if len(raw_flops) != 0:
952
(flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
953
headers.append(f"Total {flops_header}")
954
add_column(flops_column_width)
958
row_format = row_format_lst[0]
959
header_sep = header_sep_lst[0]
960
line_length = line_length_lst[0]
970
sum_self_cpu_time_total = 0
971
sum_self_device_time_total = 0
973
sum_self_cpu_time_total += evt.self_cpu_time_total
974
if evt.device_type == DeviceType.CPU and evt.is_legacy:
976
sum_self_device_time_total += evt.self_device_time_total
981
DeviceType.PrivateUse1,
984
and not evt.is_user_annotation
987
sum_self_device_time_total += evt.self_device_time_total
990
if header is not None:
991
append("=" * line_length)
993
if top_level_events_only:
994
append("=" * line_length)
995
append("This report only display top-level ops statistics")
997
append(row_format.format(*headers))
1001
def trim_path(path, src_column_width):
1002
if len(path) > src_column_width:
1003
offset = len(path) - src_column_width
1004
path = path[offset:]
1006
path = "..." + path[3:]
1011
if event_limit == row_limit:
1013
if top_level_events_only and evt.cpu_parent is not None:
1018
if max_name_column_width is not None and len(name) >= max_name_column_width - 3:
1019
name = name[: (max_name_column_width - 3)] + "..."
1023
_format_time_share(evt.self_cpu_time_total, sum_self_cpu_time_total),
1024
evt.self_cpu_time_total_str,
1026
_format_time_share(evt.cpu_time_total, sum_self_cpu_time_total)
1029
evt.cpu_time_total_str,
1035
evt.self_device_time_total_str,
1038
evt.self_device_time_total, sum_self_device_time_total
1040
evt.device_time_total_str,
1041
evt.device_time_str,
1048
_format_memory(evt.cpu_memory_usage),
1050
_format_memory(evt.self_cpu_memory_usage),
1053
if use_device and has_device_mem:
1057
_format_memory(evt.device_memory_usage),
1059
_format_memory(evt.self_device_memory_usage),
1067
row_values.append(evt.node_id)
1068
if has_input_shapes:
1069
row_values.append(str(evt.input_shapes)[:shapes_column_width])
1072
row_values.append("--")
1074
row_values.append(f"{evt.flops * flops_scale:8.3f}")
1077
if len(evt.stack) > 0:
1078
src_field = trim_path(evt.stack[0], src_column_width)
1079
row_values.append(src_field)
1080
append(row_format.format(*row_values))
1083
empty_headers = [""] * (len(headers) - 1)
1084
for entry in evt.stack[1:]:
1087
*(empty_headers + [trim_path(entry, src_column_width)])
1090
empty_headers.append("")
1091
append(row_format.format(*empty_headers))
1094
append(f"Self CPU time total: {_format_time(sum_self_cpu_time_total)}")
1097
f"Self {use_device.upper() if use_device is not None else 'None'} "
1098
f"time total: {_format_time(sum_self_device_time_total)}"
1100
return "".join(result)