import dataclasses
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Any, Dict, Optional, Tuple

from .file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required


if is_torch_available():
    import torch

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm


logger = logging.getLogger(__name__)


def default_logdir() -> str:
    """
    Same default as PyTorch's TensorBoard ``SummaryWriter``.
    """
    import socket
    from datetime import datetime

    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
    return os.path.join("runs", current_time + "_" + socket.gethostname())


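# For reference, default_logdir() yields paths of the form
# "runs/Jun15_13-45-30_myhostname" (timestamp and hostname vary per run/machine).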
@dataclass
class TrainingArguments:
    """
    TrainingArguments is the subset of the arguments we use in our example scripts
    **which relate to the training loop itself**.

    Using :class:`~transformers.HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on the command line.

    Parameters:
        output_dir (:obj:`str`):
            The output directory where the model predictions and checkpoints will be written.
        overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
            :obj:`output_dir` points to a checkpoint directory.
        do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run training or not.
        do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run evaluation on the dev set or not.
        do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run predictions on the test set or not.
        evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run evaluation during training at each logging step or not.
        per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
            The batch size per GPU/TPU core/CPU for training.
        per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
            The batch size per GPU/TPU core/CPU for evaluation.
        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
            Number of update steps to accumulate the gradients for, before performing a backward/update pass.
        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
            The initial learning rate for Adam.
        weight_decay (:obj:`float`, `optional`, defaults to 0):
            The weight decay to apply (if not zero).
        adam_beta1 (:obj:`float`, `optional`, defaults to 0.9):
            Beta1 for the Adam optimizer.
        adam_beta2 (:obj:`float`, `optional`, defaults to 0.999):
            Beta2 for the Adam optimizer.
        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
            Epsilon for the Adam optimizer.
        max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
            Maximum gradient norm (for gradient clipping).
        num_train_epochs (:obj:`float`, `optional`, defaults to 3.0):
            Total number of training epochs to perform.
        max_steps (:obj:`int`, `optional`, defaults to -1):
            If set to a positive number, the total number of training steps to perform. Overrides
            :obj:`num_train_epochs`.
        warmup_steps (:obj:`int`, `optional`, defaults to 0):
            Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
        logging_dir (:obj:`str`, `optional`):
            TensorBoard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
        logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to log and evaluate the first :obj:`global_step` or not.
        logging_steps (:obj:`int`, `optional`, defaults to 500):
            Number of update steps between two logs.
        save_steps (:obj:`int`, `optional`, defaults to 500):
            Number of update steps between two checkpoint saves.
        save_total_limit (:obj:`int`, `optional`):
            If a value is passed, will limit the total number of checkpoints. Deletes the older checkpoints in
            :obj:`output_dir`.
        no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to avoid using CUDA even when it is available.
        seed (:obj:`int`, `optional`, defaults to 42):
            Random seed for initialization.
        fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
        fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
            For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
            on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
        local_rank (:obj:`int`, `optional`, defaults to -1):
            During distributed training, the rank of the process.
        tpu_num_cores (:obj:`int`, `optional`):
            When training on TPU, the number of TPU cores (automatically passed by launcher script).
        debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
            When training on TPU, whether to print debug metrics or not.
        dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch
            size) or not.
        eval_steps (:obj:`int`, `optional`, defaults to 1000):
            Number of update steps between two evaluations.
        past_index (:obj:`int`, `optional`, defaults to -1):
            Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc:`XLNet <../model_doc/xlnet>` can
            make use of the past hidden states for their predictions. If this argument is set to a positive int, the
            ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model
            at the next training step under the keyword argument ``mems``.
    """

    output_dir: str = field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory. "
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )

    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: bool = field(
        default=False, metadata={"help": "Run evaluation during training at each logging step."},
    )

    per_device_train_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
    )
    per_device_eval_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
    )

    per_gpu_train_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for training."
        },
    )
    per_gpu_eval_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for evaluation."
        },
    )

    gradient_accumulation_steps: int = field(
        default=1,
        metadata={"help": "Number of update steps to accumulate before performing a backward/update pass."},
    )

    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."})
    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay if we apply some."})
    adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for the Adam optimizer."})
    adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for the Adam optimizer."})
    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for the Adam optimizer."})
    max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})

    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
    max_steps: int = field(
        default=-1,
        metadata={"help": "If > 0: set total number of training steps to perform. Overrides num_train_epochs."},
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})

    logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "TensorBoard log dir."})
    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step."})
    logging_steps: int = field(default=500, metadata={"help": "Log every X update steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X update steps."})
    save_total_limit: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Limit the total number of checkpoints. "
                "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints."
            )
        },
    )
    no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available."})
    seed: int = field(default=42, metadata={"help": "Random seed for initialization."})

    fp16: bool = field(
        default=False,
        metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit."},
    )
    fp16_opt_level: str = field(
        default="O1",
        metadata={
            "help": (
                "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                "See details at https://nvidia.github.io/apex/amp.html"
            )
        },
    )
    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})

    tpu_num_cores: Optional[int] = field(
        default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
    )
    tpu_metrics_debug: bool = field(
        default=False,
        metadata={"help": "Deprecated, the use of `--debug` is preferred. TPU: Whether to print debug metrics"},
    )
    debug: bool = field(default=False, metadata={"help": "Whether to print debug metrics on TPU"})

    dataloader_drop_last: bool = field(
        default=False,
        metadata={"help": "Drop the last incomplete batch if the dataset size is not divisible by the batch size."},
    )
    eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."})

    past_index: int = field(
        default=-1,
        metadata={"help": "If >=0, uses the corresponding part of the output as the past state for the next step."},
    )

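    # A minimal usage sketch (illustrative values; any output_dir path works):
    #
    #     args = TrainingArguments(
    #         output_dir="./checkpoints",
    #         per_device_train_batch_size=16,
    #         num_train_epochs=3.0,
    #     )
    #     assert args.logging_steps == 500  # unspecified fields keep their defaults
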
    @property
    def train_batch_size(self) -> int:
        """
        The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
        """
        if self.per_gpu_train_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_train_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

    @property
    def eval_batch_size(self) -> int:
        """
        The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
        """
        if self.per_gpu_eval_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_eval_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

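    # For example, with per_device_train_batch_size=8 on a single machine with
    # 4 GPUs (local_rank == -1, so nn.DataParallel is used), train_batch_size
    # evaluates to 8 * 4 = 32; under distributed training n_gpu is 1, so it stays 8.
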
    @cached_property
    @torch_required
    def _setup_devices(self) -> Tuple["torch.device", int]:
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            n_gpu = 0
        elif is_torch_tpu_available():
            device = xm.xla_device()
            n_gpu = 0
        elif self.local_rank == -1:
            # If n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`.
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1.
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            n_gpu = torch.cuda.device_count()
        else:
            # Here we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs.
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device, n_gpu

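    # Quick reference for the (device, n_gpu) pairs produced above:
    #   no_cuda=True      -> (cpu, 0)
    #   TPU available     -> (XLA device, 0)
    #   local_rank == -1  -> (cuda:0 or cpu, torch.cuda.device_count())
    #   distributed       -> (cuda:<local_rank>, 1)
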
    @property
    @torch_required
    def device(self) -> "torch.device":
        """
        The device used by this process.
        """
        return self._setup_devices[0]

    @property
    @torch_required
    def n_gpu(self):
        """
        The number of GPUs used by this process.

        Note:
            This will only be greater than one when you have multiple GPUs available but are not using distributed
            training. For distributed training, it will always be 1.
        """
        return self._setup_devices[1]

    def to_json_string(self):
        """
        Serializes this instance to a JSON string.
        """
        return json.dumps(dataclasses.asdict(self), indent=2)

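    # Since every field is JSON-serializable, the string round-trips; a small
    # sketch (the variable name `args` is illustrative):
    #
    #     args = TrainingArguments(output_dir="./out")
    #     restored = TrainingArguments(**json.loads(args.to_json_string()))
    #     assert restored == args
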
    def to_sanitized_dict(self) -> Dict[str, Any]:
        """
        Sanitized serialization to use with TensorBoard's hparams.
        """
        d = dataclasses.asdict(self)
        valid_types = [bool, int, float, str]
        if is_torch_available():
            valid_types.append(torch.Tensor)
        return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
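

# A minimal end-to-end sketch of how these arguments are typically consumed
# (HfArgumentParser converts the dataclass fields into argparse arguments):
#
#     from transformers import HfArgumentParser
#
#     parser = HfArgumentParser(TrainingArguments)
#     (training_args,) = parser.parse_args_into_dataclasses()
#     print(training_args.to_json_string())
#
# Run e.g. as: python train.py --output_dir ./out --per_device_train_batch_size 16
# (the script name `train.py` is illustrative).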