CSS-LM
871 строка · 36.2 Кб
1"""
2Utilities for working with the local dataset cache.
3This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
4Copyright by the AllenNLP authors.
5"""
6
7import copy8import csv9import linecache10import logging11import os12import platform13import sys14from abc import ABC, abstractmethod15from collections import defaultdict, namedtuple16from datetime import datetime17from multiprocessing import Pipe, Process, Queue18from multiprocessing.connection import Connection19from typing import Callable, Iterable, List, NamedTuple, Optional, Union20
21from transformers import AutoConfig, PretrainedConfig22from transformers import __version__ as version23
24from ..file_utils import is_psutil_available, is_py3nvml_available, is_tf_available, is_torch_available25from .benchmark_args_utils import BenchmarkArguments26
27
28if is_torch_available():29from torch.cuda import empty_cache as torch_empty_cache30
31if is_tf_available():32from tensorflow.python.eager import context as tf_context33
34if is_psutil_available():35import psutil36
37if is_py3nvml_available():38import py3nvml.py3nvml as nvml39
40if platform.system() == "Windows":41from signal import CTRL_C_EVENT as SIGKILL42else:43from signal import SIGKILL44
45
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


# Global switch flipped on by `start_memory_tracing` and off by `stop_memory_tracing`;
# read by the trace function installed via `sys.settrace` to decide whether to record.
_is_memory_tracing_enabled = False

# Aggregate result returned by `Benchmark.run`: per-model timing/memory result dicts
# plus the line-by-line `MemorySummary` objects (or None) for inference and training.
BenchmarkOutput = namedtuple(
    "BenchmarkOutput",
    [
        "time_inference_result",
        "memory_inference_result",
        "time_train_result",
        "memory_train_result",
        "inference_summary",
        "train_summary",
    ],
)
62
63
def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
    """
    Optionally wrap `func` so that it runs in its own separate process.

    Running in a fresh process is important for accurate memory measurements,
    since the child starts from a clean memory state.

    Args:
        - `func`: (`callable`): function() -> ...
            generic function which will be executed in its own separate process
        - `do_multi_processing`: (`bool`)
            Whether to run function on separate process or not

    Returns:
        `func` unchanged when `do_multi_processing` is False, otherwise a wrapper
        that executes `func` in a child process and returns its result (or "N/A"
        if the child raised).
    """

    def multi_process_func(*args, **kwargs):
        # run function in an individual
        # process to get correct memory
        def wrapper_func(queue: Queue, *args):
            try:
                result = func(*args)
            except Exception as e:
                logger.error(e)
                print(e)
                # sentinel result so the parent's queue.get() never blocks forever
                result = "N/A"
            queue.put(result)

        queue = Queue()
        p = Process(target=wrapper_func, args=[queue] + list(args))
        p.start()
        # Fetch the result BEFORE joining: avoids a deadlock if the child's
        # queue buffer is full while the parent waits on join().
        result = queue.get()
        p.join()
        return result

    if do_multi_processing:
        # BUGFIX: the original wrote "fFunction {func}..." — the `f` prefix was
        # inside the string literal, so `{func}` was never interpolated.
        logging.info(f"Function {func} is executed in its own process...")
        return multi_process_func
    else:
        return func
102
def is_memory_tracing_enabled():
    """Return True while line-by-line memory tracing is active."""
    # Reading a module-level name needs no `global` declaration.
    return _is_memory_tracing_enabled
107
class Frame(NamedTuple):
    """`Frame` is a NamedTuple used to gather the current frame state.

    `Frame` has the following fields:
        - 'filename' (string): Name of the file currently executed
        - 'module' (string): Name of the module currently executed
        - 'line_number' (int): Number of the line currently executed
        - 'event' (string): Event that triggered the tracing (default will be "line")
        - 'line_text' (string): Text of the line in the python script
    """

    filename: str
    module: str
    line_number: int
    event: str
    line_text: str
124
class UsedMemoryState(NamedTuple):
    """`UsedMemoryState` are named tuples with the following fields:

    - 'frame': a `Frame` namedtuple storing information on the current tracing frame
      (current file, location in current file)
    - 'cpu_memory': CPU RSS memory state *before* executing the line
    - 'gpu_memory': GPU used memory *before* executing the line
      (sum for all GPUs or for only `gpus_to_trace` if provided)
    """

    frame: Frame
    cpu_memory: int
    gpu_memory: int
136
class Memory(NamedTuple):
    """A raw byte count whose `repr` renders it as whole megabytes.

    Fields:
        - `bytes` (integer): number of bytes
    """

    bytes: int

    def __repr__(self) -> str:
        # Same conversion as `bytes_to_mega_bytes`: floor-divide by 2**20 via a shift.
        return str(self.bytes >> 20)
148
class MemoryState(NamedTuple):
    """`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:

    - `frame` (`Frame`): the current frame (see above)
    - `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
    - `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
    - `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
    """

    frame: Frame
    cpu: Memory
    gpu: Memory
    cpu_gpu: Memory
162
class MemorySummary(NamedTuple):
    """`MemorySummary` namedtuple with the fields:

    - `sequential`: a list of `MemoryState` namedtuples computed from the provided `memory_trace`
      by subtracting the memory after executing each line from the memory before executing said line.
    - `cumulative`: a list of `MemoryState` namedtuples with cumulative increase in memory for each line,
      obtained by summing repeated memory increases for a line if it's executed several times.
      The list is sorted from the frame with the largest memory consumption to the frame with the
      smallest (can be negative if memory is released).
    - `current`: a list of `MemoryState` namedtuples holding the *absolute* memory observed after each
      line, sorted from largest to smallest CPU+GPU usage.
    - `total`: total memory increase during the full tracing as a `Memory` named tuple.
      Lines with memory release (negative consumption) are ignored if `ignore_released_memory`
      is `True` (default).
    """

    sequential: List[MemoryState]
    cumulative: List[MemoryState]
    current: List[MemoryState]
    total: Memory
179
# A full memory trace: one `UsedMemoryState` per traced event (by default, per executed line).
MemoryTrace = List[UsedMemoryState]
182
def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
    """
    Measures peak CPU memory consumption of a given `function`,
    running the function for at least `interval` seconds
    and at most 20 * `interval` seconds.
    This function is heavily inspired by: `memory_usage`
    of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239

    Args:
        - `function`: (`callable`): function() -> ...
            function without any arguments for which to measure the peak memory

        - `interval`: (`float`, `optional`, defaults to `0.5`)
            interval in seconds at which to sample memory usage

        - `device_idx`: (`int`, `optional`, defaults to `None`)
            device id for which to measure gpu usage
            (NOTE(review): currently unused in this implementation — confirm against callers)

    Returns:
        - `max_memory`: (`int`)
            consumed memory peak in Bytes (the string "N/A" if psutil is unavailable)
    """

    def get_cpu_memory(process_id: int) -> int:
        """
        Measures current cpu memory usage (RSS, in bytes) of a given `process_id`.

        Args:
            - `process_id`: (`int`)
                process_id for which to measure memory

        Returns:
            - `memory`: (`int`)
                consumed memory in Bytes
        """
        process = psutil.Process(process_id)
        try:
            # Older psutil versions expose `get_memory_info` instead of `memory_info`.
            meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info"
            # Index 0 of the memory-info tuple is the RSS value.
            memory = getattr(process, meminfo_attr)()[0]
        except psutil.AccessDenied:
            raise ValueError("Error with Psutil.")
        return memory

    if not is_psutil_available():
        logger.warning(
            "Psutil not installed, we won't log CPU memory usage. "
            "Install Psutil (pip install psutil) to use CPU memory tracing."
        )
        max_memory = "N/A"
    else:

        class MemoryMeasureProcess(Process):

            """
            `MemoryMeasureProcess` inherits from `Process` and overwrites
            its `run()` method. Used to measure the memory usage of a process
            """

            def __init__(self, process_id: int, child_connection: Connection, interval: float):
                super().__init__()
                self.process_id = process_id
                self.interval = interval
                self.connection = child_connection
                self.num_measurements = 1
                # Take a first sample immediately so mem_usage is never empty.
                self.mem_usage = get_cpu_memory(self.process_id)

            def run(self):
                # Signal the parent that sampling has started.
                self.connection.send(0)
                stop = False
                while True:
                    # Track the running maximum of the parent's RSS.
                    self.mem_usage = max(self.mem_usage, get_cpu_memory(self.process_id))
                    self.num_measurements += 1

                    if stop:
                        break

                    # poll() doubles as the sampling sleep; it returns True once
                    # the parent sends its "done" message, triggering one last sample.
                    stop = self.connection.poll(self.interval)

                # send results to parent pipe
                self.connection.send(self.mem_usage)
                self.connection.send(self.num_measurements)

        # Retry with a 10x smaller interval until we collected enough samples
        # (> 4) or the interval is already tiny (< 1e-6 s).
        while True:
            # create child, parent connection
            child_connection, parent_connection = Pipe()

            # instantiate process
            mem_process = MemoryMeasureProcess(os.getpid(), child_connection, interval)
            mem_process.start()

            # wait until we get memory
            parent_connection.recv()

            try:
                # execute function
                function()

                # start parent connection
                parent_connection.send(0)

                # receive memory and num measurements
                max_memory = parent_connection.recv()
                num_measurements = parent_connection.recv()
            except Exception:
                # kill process in a clean way
                parent = psutil.Process(os.getpid())
                for child in parent.children(recursive=True):
                    os.kill(child.pid, SIGKILL)
                mem_process.join(0)
                raise RuntimeError("Process killed. Error in Process")

            # run process at least 20 * interval or until it finishes
            mem_process.join(20 * interval)

            if (num_measurements > 4) or (interval < 1e-6):
                break

            # reduce interval
            interval /= 10

    return max_memory
305
def start_memory_tracing(
    modules_to_trace: Optional[Union[str, Iterable[str]]] = None,
    modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None,
    events_to_trace: str = "line",
    gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
    """Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
    See `./benchmark.py` for usage examples.
    Current memory consumption is returned using psutil and in particular is the RSS memory
    "Resident Set Size" (the non-swapped physical memory the process is using).
    See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info

    Args:
        - `modules_to_trace`: (None, string, list/tuple of string)
            if None, all events are recorded
            if string or list of strings: only events from the listed module/sub-module will be
            recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
        - `modules_not_to_trace`: (None, string, list/tuple of string)
            if None, no module is avoided
            if string or list of strings: events from the listed module/sub-module will not be
            recorded (e.g. 'torch')
        - `events_to_trace`: string or list of string of events to be recorded
            (see official python doc for `sys.settrace` for the list of events), default to line
        - `gpus_to_trace`: (optional list, default None) list of GPUs to trace.
            Default to tracing all GPUs

    Return:
        - `memory_trace` is a list of `UsedMemoryState` for each event
          (default each line of the traced script); the list is returned immediately and
          filled in-place while tracing runs, until `stop_memory_tracing` is called.
    """
    if is_psutil_available():
        process = psutil.Process(os.getpid())
    else:
        logger.warning(
            "Psutil not installed, we won't log CPU memory usage. "
            "Install psutil (pip install psutil) to use CPU memory tracing."
        )
        process = None

    if is_py3nvml_available():
        try:
            # Init/shutdown here only to validate GPU access and enumerate devices;
            # nvml is re-initialized on every traced line below.
            nvml.nvmlInit()
            devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
            nvml.nvmlShutdown()
        except (OSError, nvml.NVMLError):
            logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
            log_gpu = False
        else:
            log_gpu = is_torch_available() or is_tf_available()
    else:
        logger.warning(
            "py3nvml not installed, we won't log GPU memory usage. "
            "Install py3nvml (pip install py3nvml) to use GPU memory tracing."
        )
        log_gpu = False

    memory_trace = []

    def traceit(frame, event, args):
        """Tracing method executed before running each line in a module or sub-module.
        Records memory allocated in a list with debugging information.
        Always returns itself so tracing continues into nested scopes.
        """
        global _is_memory_tracing_enabled

        if not _is_memory_tracing_enabled:
            return traceit

        # Filter events
        if events_to_trace is not None:
            if isinstance(events_to_trace, str) and event != events_to_trace:
                return traceit
            elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
                return traceit

        if "__name__" not in frame.f_globals:
            return traceit

        # Filter modules
        name = frame.f_globals["__name__"]
        if not isinstance(name, str):
            return traceit
        else:
            # Filter whitelist of modules to trace (substring match on module name)
            if modules_to_trace is not None:
                if isinstance(modules_to_trace, str) and modules_to_trace not in name:
                    return traceit
                elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace):
                    return traceit

            # Filter blacklist of modules not to trace
            if modules_not_to_trace is not None:
                if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
                    return traceit
                elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace):
                    return traceit

        # Record current tracing state (file, location in file...)
        lineno = frame.f_lineno
        filename = frame.f_globals["__file__"]
        # Map compiled .pyc/.pyo files back to their .py source for linecache.
        if filename.endswith(".pyc") or filename.endswith(".pyo"):
            filename = filename[:-1]
        line = linecache.getline(filename, lineno).rstrip()
        traced_state = Frame(filename, name, lineno, event, line)

        # Record current memory state (rss memory) and compute difference with previous memory state
        cpu_mem = 0
        if process is not None:
            mem = process.memory_info()
            cpu_mem = mem.rss

        gpu_mem = 0
        if log_gpu:
            # Clear GPU caches so cached-but-free memory is not counted as used
            if is_torch_available():
                torch_empty_cache()
            if is_tf_available():
                tf_context.context()._clear_caches()  # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802

            # Sum used memory for all GPUs
            nvml.nvmlInit()

            for i in devices:
                handle = nvml.nvmlDeviceGetHandleByIndex(i)
                meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_mem += meminfo.used

            nvml.nvmlShutdown()

        mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
        memory_trace.append(mem_state)

        return traceit

    sys.settrace(traceit)

    global _is_memory_tracing_enabled
    _is_memory_tracing_enabled = True

    return memory_trace
455
def stop_memory_tracing(
    memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
) -> Optional[MemorySummary]:
    """Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.

    Args:
        - `memory_trace` (optional output of start_memory_tracing, default: None):
            memory trace to convert in summary
        - `ignore_released_memory` (boolean, default: True):
            if True we only sum memory increases to compute total memory

    Return:
        - None if `memory_trace` is None or holds fewer than two samples
        - `MemorySummary` namedtuple otherwise, with `sequential` (per-line deltas),
          `cumulative` (per-frame summed deltas, sorted descending by CPU+GPU increase),
          `current` (absolute memory after each line, sorted descending) and
          `total` (overall increase as a `Memory`).
    """
    global _is_memory_tracing_enabled
    _is_memory_tracing_enabled = False

    # Need at least two samples to compute a delta.
    if memory_trace is not None and len(memory_trace) > 1:
        memory_diff_trace = []
        memory_curr_trace = []

        # Per-frame accumulator: [cpu_inc, gpu_inc, cpu_gpu_inc]
        cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])

        # Pairwise walk: the delta for a line is (memory before next line) - (memory before it).
        for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
            memory_trace[:-1], memory_trace[1:]
        ):
            cpu_mem_inc = next_cpu_mem - cpu_mem
            gpu_mem_inc = next_gpu_mem - gpu_mem
            cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
            memory_diff_trace.append(
                MemoryState(
                    frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
                )
            )

            # Absolute memory observed after executing this line.
            memory_curr_trace.append(
                MemoryState(
                    frame=frame,
                    cpu=Memory(next_cpu_mem),
                    gpu=Memory(next_gpu_mem),
                    cpu_gpu=Memory(next_gpu_mem + next_cpu_mem),
                )
            )

            # A frame executed several times accumulates all its increases
            # (Frame is a NamedTuple, hence hashable and usable as dict key).
            cumulative_memory_dict[frame][0] += cpu_mem_inc
            cumulative_memory_dict[frame][1] += gpu_mem_inc
            cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc

        cumulative_memory = sorted(
            list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True
        )  # order by the total CPU + GPU memory increase
        cumulative_memory = list(
            MemoryState(
                frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
            )
            for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
        )

        memory_curr_trace = sorted(memory_curr_trace, key=lambda x: x.cpu_gpu.bytes, reverse=True)

        if ignore_released_memory:
            # Clamp negative deltas (released memory) to zero before summing.
            total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
        else:
            total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)

        total_memory = Memory(total_memory)

        return MemorySummary(
            sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
        )

    return None
552
def bytes_to_mega_bytes(memory_amount: int) -> int:
    """Convert a byte count (int) into whole mebibytes (int), flooring."""
    # Floor division by 2**20 — equivalent to the arithmetic right shift `>> 20`.
    return memory_amount // (1 << 20)
558
class Benchmark(ABC):
    """
    Benchmarks is a simple but feature-complete benchmarking script
    to compare memory and time performance of models in Transformers.

    Subclasses implement the framework-specific `_inference_*` / `_train_*`
    hooks; `run()` drives the full benchmark over all configured models,
    batch sizes and sequence lengths, printing and optionally saving results.
    """

    args: BenchmarkArguments
    configs: PretrainedConfig
    framework: str

    def __init__(self, args: BenchmarkArguments = None, configs: PretrainedConfig = None):
        """Store the benchmark arguments and build a per-model config dict.

        If `configs` is None, configs are downloaded via `AutoConfig.from_pretrained`;
        otherwise `configs` is zipped against `args.model_names`.
        """
        self.args = args
        if configs is None:
            self.config_dict = {
                model_name: AutoConfig.from_pretrained(model_name) for model_name in self.args.model_names
            }
        else:
            self.config_dict = {model_name: config for model_name, config in zip(self.args.model_names, configs)}

        # BUGFIX: os.getenv returns a string (or None), so the original
        # comparison `== 0` (int) was always False and the warning never fired.
        if not self.args.no_memory and os.getenv("TRANSFORMERS_USE_MULTIPROCESSING") == "0":
            logger.warning(
                "Memory consumption will not be measured accurately if `args.no_multi_process` is set to `True.` The flag 'TRANSFORMERS_USE_MULTIPROCESSING' should only be disabled for debugging / testing."
            )

        # Lazily-initialized caches (see `print_fn` / `environment_info` properties).
        self._print_fn = None
        self._framework_version = None
        self._environment_info = None

    @property
    def print_fn(self):
        """Printing function: plain `print`, or print + append to `args.log_filename`
        when `args.log_print` is set. Built once and cached."""
        if self._print_fn is None:
            if self.args.log_print:

                def print_and_log(*args):
                    with open(self.args.log_filename, "a") as log_file:
                        log_file.write("".join(args) + "\n")
                    print(*args)

                self._print_fn = print_and_log
            else:
                self._print_fn = print
        return self._print_fn

    @property
    @abstractmethod
    def framework_version(self):
        """Version string of the underlying framework (implemented by subclasses)."""
        pass

    @abstractmethod
    def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
        """Measure inference time in seconds for one (model, bs, seq_len) combination."""
        pass

    @abstractmethod
    def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
        """Measure training time in seconds for one (model, bs, seq_len) combination."""
        pass

    @abstractmethod
    def _inference_memory(
        self, model_name: str, batch_size: int, sequence_length: int
    ) -> [Memory, Optional[MemorySummary]]:
        """Measure inference memory; returns (peak `Memory`, optional line-by-line summary)."""
        pass

    @abstractmethod
    def _train_memory(
        self, model_name: str, batch_size: int, sequence_length: int
    ) -> [Memory, Optional[MemorySummary]]:
        """Measure training memory; returns (peak `Memory`, optional line-by-line summary)."""
        pass

    # Public wrappers: run the abstract hooks in a separate process when
    # `args.do_multi_processing` is set, for accurate memory measurement.
    def inference_speed(self, *args, **kwargs) -> float:
        return separate_process_wrapper_fn(self._inference_speed, self.args.do_multi_processing)(*args, **kwargs)

    def train_speed(self, *args, **kwargs) -> float:
        return separate_process_wrapper_fn(self._train_speed, self.args.do_multi_processing)(*args, **kwargs)

    def inference_memory(self, *args, **kwargs) -> [Memory, Optional[MemorySummary]]:
        return separate_process_wrapper_fn(self._inference_memory, self.args.do_multi_processing)(*args, **kwargs)

    def train_memory(self, *args, **kwargs) -> [Memory, Optional[MemorySummary]]:
        return separate_process_wrapper_fn(self._train_memory, self.args.do_multi_processing)(*args, **kwargs)

    def run(self):
        """Run the configured benchmarks for every model / batch size / sequence length,
        print (and optionally save) the results, and return a `BenchmarkOutput`."""
        result_dict = {model_name: {} for model_name in self.args.model_names}
        inference_result_time = copy.deepcopy(result_dict)
        inference_result_memory = copy.deepcopy(result_dict)
        train_result_time = copy.deepcopy(result_dict)
        train_result_memory = copy.deepcopy(result_dict)

        for c, model_name in enumerate(self.args.model_names):
            self.print_fn(f"{c + 1} / {len(self.args.model_names)}")

            model_dict = {
                "bs": self.args.batch_sizes,
                "ss": self.args.sequence_lengths,
                "result": {i: {} for i in self.args.batch_sizes},
            }
            inference_result_time[model_name] = copy.deepcopy(model_dict)
            inference_result_memory[model_name] = copy.deepcopy(model_dict)
            train_result_time[model_name] = copy.deepcopy(model_dict)
            train_result_memory[model_name] = copy.deepcopy(model_dict)

            # NOTE: summaries keep only the values from the last (model, bs, seq_len)
            # combination; they are what gets printed in the line-by-line sections below.
            inference_summary = train_summary = None

            for batch_size in self.args.batch_sizes:
                for sequence_length in self.args.sequence_lengths:
                    if not self.args.no_inference:
                        if not self.args.no_memory:
                            memory, inference_summary = self.inference_memory(model_name, batch_size, sequence_length)
                            inference_result_memory[model_name]["result"][batch_size][sequence_length] = memory
                        if not self.args.no_speed:
                            time = self.inference_speed(model_name, batch_size, sequence_length)
                            inference_result_time[model_name]["result"][batch_size][sequence_length] = time

                    if self.args.training:
                        if not self.args.no_memory:
                            memory, train_summary = self.train_memory(model_name, batch_size, sequence_length)
                            train_result_memory[model_name]["result"][batch_size][sequence_length] = memory
                        if not self.args.no_speed:
                            time = self.train_speed(model_name, batch_size, sequence_length)
                            train_result_time[model_name]["result"][batch_size][sequence_length] = time

        if not self.args.no_inference:
            if not self.args.no_speed:
                self.print_fn("\n" + 20 * "=" + ("INFERENCE - SPEED - RESULT").center(40) + 20 * "=")
                self.print_results(inference_result_time, type_label="Time in s")
                self.save_to_csv(inference_result_time, self.args.inference_time_csv_file)
                if self.args.is_tpu:
                    self.print_fn(
                        "TPU was used for inference. Note that the time after compilation stabilized (after ~10 inferences model.forward(..) calls) was measured."
                    )

            if not self.args.no_memory:
                self.print_fn("\n" + 20 * "=" + ("INFERENCE - MEMORY - RESULT").center(40) + 20 * "=")
                self.print_results(inference_result_memory, type_label="Memory in MB")
                self.save_to_csv(inference_result_memory, self.args.inference_memory_csv_file)

            if self.args.trace_memory_line_by_line:
                # BUGFIX: header typo "MEMOMRY" -> "MEMORY"
                self.print_fn("\n" + 20 * "=" + ("INFERENCE - MEMORY - LINE BY LINE - SUMMARY").center(40) + 20 * "=")
                self.print_memory_trace_statistics(inference_summary)

        if self.args.training:
            if not self.args.no_speed:
                self.print_fn("\n" + 20 * "=" + ("TRAIN - SPEED - RESULTS").center(40) + 20 * "=")
                self.print_results(train_result_time, "Time in s")
                self.save_to_csv(train_result_time, self.args.train_time_csv_file)
                if self.args.is_tpu:
                    self.print_fn(
                        "TPU was used for training. Note that the time after compilation stabilized (after ~10 train loss=model.forward(...) + loss.backward() calls) was measured."
                    )

            if not self.args.no_memory:
                self.print_fn("\n" + 20 * "=" + ("TRAIN - MEMORY - RESULTS").center(40) + 20 * "=")
                self.print_results(train_result_memory, type_label="Memory in MB")
                self.save_to_csv(train_result_memory, self.args.train_memory_csv_file)

            if self.args.trace_memory_line_by_line:
                # BUGFIX: header typo "MEMOMRY" -> "MEMORY"
                self.print_fn("\n" + 20 * "=" + ("TRAIN - MEMORY - LINE BY LINE - SUMMARY").center(40) + 20 * "=")
                self.print_memory_trace_statistics(train_summary)

        if not self.args.no_env_print:
            self.print_fn("\n" + 20 * "=" + ("ENVIRONMENT INFORMATION").center(40) + 20 * "=")
            self.print_fn(
                "\n".join(["- {}: {}".format(prop, val) for prop, val in self.environment_info.items()]) + "\n"
            )

        if self.args.save_to_csv:
            with open(self.args.env_info_csv_file, mode="w", newline="") as csv_file:
                writer = csv.writer(csv_file)
                for key, value in self.environment_info.items():
                    writer.writerow([key, value])

        return BenchmarkOutput(
            inference_result_time,
            inference_result_memory,
            train_result_time,
            train_result_memory,
            inference_summary,
            train_summary,
        )

    @property
    def environment_info(self):
        """Dict describing the execution environment (framework, OS, CPU/GPU/TPU, RAM...).
        Built once on first access and cached."""
        if self._environment_info is None:
            info = {}
            info["transformers_version"] = version
            info["framework"] = self.framework
            if self.framework == "PyTorch":
                info["use_torchscript"] = self.args.torchscript
            if self.framework == "TensorFlow":
                info["eager_mode"] = self.args.eager_mode
                info["use_xla"] = self.args.use_xla
            info["framework_version"] = self.framework_version
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self.args.fp16
            info["use_multiprocessing"] = self.args.do_multi_processing
            info["only_pretrain_model"] = self.args.only_pretrain_model

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory."
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self.args.is_gpu
            if self.args.is_gpu:
                info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"

            info["use_tpu"] = self.args.is_tpu
            # TODO(PVP): See if we can add more information about TPU
            # see: https://github.com/pytorch/xla/issues/2180

            self._environment_info = info
        return self._environment_info

    def print_results(self, result_dict, type_label):
        """Pretty-print a result dict as a fixed-width table via `print_fn`."""
        self.print_fn(80 * "-")
        self.print_fn(
            "Model Name".center(30) + "Batch Size".center(15) + "Seq Length".center(15) + type_label.center(15)
        )
        self.print_fn(80 * "-")
        for model_name in self.args.model_names:
            for batch_size in result_dict[model_name]["bs"]:
                for sequence_length in result_dict[model_name]["ss"]:
                    result = result_dict[model_name]["result"][batch_size][sequence_length]
                    if isinstance(result, float):
                        # Round to 3 decimals; show exact zeros as "< 0.001".
                        result = round(1000 * result) / 1000
                        result = "< 0.001" if result == 0.0 else str(result)
                    else:
                        result = str(result)
                    self.print_fn(
                        model_name[:30].center(30) + str(batch_size).center(15),
                        str(sequence_length).center(15),
                        result.center(15),
                    )
        self.print_fn(80 * "-")

    def print_memory_trace_statistics(self, summary: MemorySummary):
        """Print per-line, top/bottom cumulative, and total memory statistics of a `MemorySummary`."""
        self.print_fn(
            "\nLine by line memory consumption:\n"
            + "\n".join(
                f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
                for state in summary.sequential
            )
        )
        self.print_fn(
            "\nLines with top memory consumption:\n"
            + "\n".join(
                f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
                for state in summary.cumulative[:6]
            )
        )
        self.print_fn(
            "\nLines with lowest memory consumption:\n"
            + "\n".join(
                f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
                for state in summary.cumulative[-6:]
            )
        )
        self.print_fn(f"\nTotal memory increase: {summary.total}")

    def save_to_csv(self, result_dict, filename):
        """Write a result dict to `filename` as CSV (no-op unless `args.save_to_csv`)."""
        if not self.args.save_to_csv:
            return
        self.print_fn("Saving results to csv.")
        with open(filename, mode="w") as csv_file:

            # BUGFIX: the assert message referenced `self.model_names` (nonexistent
            # attribute) — it would have raised AttributeError instead of the assert.
            assert len(self.args.model_names) > 0, "At least 1 model should be defined, but got {}".format(
                self.args.model_names
            )

            fieldnames = ["model", "batch_size", "sequence_length"]
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames + ["result"])
            writer.writeheader()

            for model_name in self.args.model_names:
                result_dict_model = result_dict[model_name]["result"]
                for bs in result_dict_model:
                    for ss in result_dict_model[bs]:
                        result_model = result_dict_model[bs][ss]
                        writer.writerow(
                            {
                                "model": model_name,
                                "batch_size": bs,
                                "sequence_length": ss,
                                "result": ("{}" if not isinstance(result_model, float) else "{:.4f}").format(
                                    result_model
                                ),
                            }
                        )