import dataclasses
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Any, Dict, Optional, Tuple

from .file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required


if is_torch_available():
    import torch

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm


logger = logging.getLogger(__name__)


def default_logdir() -> str:
    """
    Same default as PyTorch's TensorBoard ``SummaryWriter``.
    """
    import socket
    from datetime import datetime

    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
    return os.path.join("runs", current_time + "_" + socket.gethostname())


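# For reference, default_logdir() yields paths of the form
# "runs/Jun15_13-45-30_myhostname" (timestamp and hostname vary per run/machine).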
@dataclass
class TrainingArguments:
    """
    TrainingArguments is the subset of the arguments we use in our example scripts
    **which relate to the training loop itself**.

    Using :class:`~transformers.HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on the command line.

    Parameters:
        output_dir (:obj:`str`):
            The output directory where the model predictions and checkpoints will be written.
        overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
            :obj:`output_dir` points to a checkpoint directory.
        do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run training or not.
        do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run evaluation on the dev set or not.
        do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run predictions on the test set or not.
        evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run evaluation during training at each logging step or not.
        per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
            The batch size per GPU/TPU core/CPU for training.
        per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
            The batch size per GPU/TPU core/CPU for evaluation.
        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
            Number of update steps to accumulate the gradients for, before performing a backward/update pass.
        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
            The initial learning rate for Adam.
        weight_decay (:obj:`float`, `optional`, defaults to 0):
            The weight decay to apply (if not zero).
        adam_beta1 (:obj:`float`, `optional`, defaults to 0.9):
            Beta1 for the Adam optimizer.
        adam_beta2 (:obj:`float`, `optional`, defaults to 0.999):
            Beta2 for the Adam optimizer.
        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
            Epsilon for the Adam optimizer.
        max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
            Maximum gradient norm (for gradient clipping).
        num_train_epochs (:obj:`float`, `optional`, defaults to 3.0):
            Total number of training epochs to perform.
        max_steps (:obj:`int`, `optional`, defaults to -1):
            If set to a positive number, the total number of training steps to perform. Overrides
            :obj:`num_train_epochs`.
        warmup_steps (:obj:`int`, `optional`, defaults to 0):
            Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
        logging_dir (:obj:`str`, `optional`):
            TensorBoard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
        logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to log and evaluate the first :obj:`global_step` or not.
        logging_steps (:obj:`int`, `optional`, defaults to 500):
            Number of update steps between two logs.
        save_steps (:obj:`int`, `optional`, defaults to 500):
            Number of update steps between two checkpoint saves.
        save_total_limit (:obj:`int`, `optional`):
            If a value is passed, will limit the total number of checkpoints. Deletes the older checkpoints in
            :obj:`output_dir`.
        no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to avoid using CUDA even when it is available.
        seed (:obj:`int`, `optional`, defaults to 42):
            Random seed for initialization.
        fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
        fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
            For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
            on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
        local_rank (:obj:`int`, `optional`, defaults to -1):
            During distributed training, the rank of the process.
        tpu_num_cores (:obj:`int`, `optional`):
            When training on TPU, the number of TPU cores (automatically passed by launcher script).
        debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
            When training on TPU, whether to print debug metrics or not.
        dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch
            size) or not.
        eval_steps (:obj:`int`, `optional`, defaults to 1000):
            Number of update steps between two evaluations.
        past_index (:obj:`int`, `optional`, defaults to -1):
            Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc:`XLNet <../model_doc/xlnet>` can
            make use of the past hidden states for their predictions. If this argument is set to a positive int, the
            ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model
            at the next training step under the keyword argument ``mems``.
    """

    output_dir: str = field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory. "
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )

    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: bool = field(
        default=False, metadata={"help": "Run evaluation during training at each logging step."},
    )

    per_device_train_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
    )
    per_device_eval_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
    )

    per_gpu_train_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for training."
        },
    )
    per_gpu_eval_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for evaluation."
        },
    )

    gradient_accumulation_steps: int = field(
        default=1,
        metadata={"help": "Number of update steps to accumulate before performing a backward/update pass."},
    )

    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."})
    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay if we apply some."})
    adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for the Adam optimizer."})
    adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for the Adam optimizer."})
    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for the Adam optimizer."})
    max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})

    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
    max_steps: int = field(
        default=-1,
        metadata={"help": "If > 0: set total number of training steps to perform. Overrides num_train_epochs."},
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})

    logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "TensorBoard log dir."})
    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step."})
    logging_steps: int = field(default=500, metadata={"help": "Log every X update steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X update steps."})
    save_total_limit: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Limit the total number of checkpoints. "
                "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints."
            )
        },
    )
    no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available."})
    seed: int = field(default=42, metadata={"help": "Random seed for initialization."})

    fp16: bool = field(
        default=False,
        metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit."},
    )
    fp16_opt_level: str = field(
        default="O1",
        metadata={
            "help": (
                "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                "See details at https://nvidia.github.io/apex/amp.html"
            )
        },
    )
    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})

    tpu_num_cores: Optional[int] = field(
        default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
    )
    tpu_metrics_debug: bool = field(
        default=False,
        metadata={"help": "Deprecated, the use of `--debug` is preferred. TPU: Whether to print debug metrics"},
    )
    debug: bool = field(default=False, metadata={"help": "Whether to print debug metrics on TPU"})

    dataloader_drop_last: bool = field(
        default=False,
        metadata={"help": "Drop the last incomplete batch if the dataset size is not divisible by the batch size."},
    )
    eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."})

    past_index: int = field(
        default=-1,
        metadata={"help": "If >=0, uses the corresponding part of the output as the past state for the next step."},
    )

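    # A minimal usage sketch (illustrative values; any output_dir path works):
    #
    #     args = TrainingArguments(
    #         output_dir="./checkpoints",
    #         per_device_train_batch_size=16,
    #         num_train_epochs=3.0,
    #     )
    #     assert args.logging_steps == 500  # unspecified fields keep their defaults
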
    @property
    def train_batch_size(self) -> int:
        """
        The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
        """
        if self.per_gpu_train_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_train_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

    @property
    def eval_batch_size(self) -> int:
        """
        The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
        """
        if self.per_gpu_eval_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_eval_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

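    # For example, with per_device_train_batch_size=8 on a single machine with
    # 4 GPUs (local_rank == -1, so nn.DataParallel is used), train_batch_size
    # evaluates to 8 * 4 = 32; under distributed training n_gpu is 1, so it stays 8.
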
    @cached_property
    @torch_required
    def _setup_devices(self) -> Tuple["torch.device", int]:
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            n_gpu = 0
        elif is_torch_tpu_available():
            device = xm.xla_device()
            n_gpu = 0
        elif self.local_rank == -1:
            # If n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`.
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1.
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            n_gpu = torch.cuda.device_count()
        else:
            # Here we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs.
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device, n_gpu

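    # Quick reference for the (device, n_gpu) pairs produced above:
    #   no_cuda=True      -> (cpu, 0)
    #   TPU available     -> (XLA device, 0)
    #   local_rank == -1  -> (cuda:0 or cpu, torch.cuda.device_count())
    #   distributed       -> (cuda:<local_rank>, 1)
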
    @property
    @torch_required
    def device(self) -> "torch.device":
        """
        The device used by this process.
        """
        return self._setup_devices[0]

    @property
    @torch_required
    def n_gpu(self):
        """
        The number of GPUs used by this process.

        Note:
            This will only be greater than one when you have multiple GPUs available but are not using distributed
            training. For distributed training, it will always be 1.
        """
        return self._setup_devices[1]

    def to_json_string(self):
        """
        Serializes this instance to a JSON string.
        """
        return json.dumps(dataclasses.asdict(self), indent=2)

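    # Since every field is JSON-serializable, the string round-trips; a small
    # sketch (the variable name `args` is illustrative):
    #
    #     args = TrainingArguments(output_dir="./out")
    #     restored = TrainingArguments(**json.loads(args.to_json_string()))
    #     assert restored == args
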
    def to_sanitized_dict(self) -> Dict[str, Any]:
        """
        Sanitized serialization to use with TensorBoard's hparams.
        """
        d = dataclasses.asdict(self)
        valid_types = [bool, int, float, str]
        if is_torch_available():
            valid_types.append(torch.Tensor)
        return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
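

# A minimal end-to-end sketch of how these arguments are typically consumed
# (HfArgumentParser converts the dataclass fields into argparse arguments):
#
#     from transformers import HfArgumentParser
#
#     parser = HfArgumentParser(TrainingArguments)
#     (training_args,) = parser.parse_args_into_dataclasses()
#     print(training_args.to_json_string())
#
# Run e.g. as: python train.py --output_dir ./out --per_device_train_batch_size 16
# (the script name `train.py` is illustrative).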