import logging
import os
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional, Union

import torch
from filelock import FileLock
from torch.utils.data.dataset import Dataset

from ...tokenization_bart import BartTokenizer, BartTokenizerFast
from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_xlm_roberta import XLMRobertaTokenizer
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
from ..processors.utils import InputFeatures


logger = logging.getLogger(__name__)


@dataclass
class GlueDataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
    data_dir: str = field(
        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )

    def __post_init__(self):
        self.task_name = self.task_name.lower()
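

# A minimal usage sketch (not part of the original file): as the docstring above notes,
# `HfArgumentParser` can expose the fields of this dataclass as command-line flags. The
# import path is shown as in stock transformers and may differ inside this fork; the
# task name and data dir are illustrative:
#
#     from transformers import HfArgumentParser
#
#     parser = HfArgumentParser(GlueDataTrainingArguments)
#     (data_args,) = parser.parse_args_into_dataclasses(
#         ["--task_name", "mrpc", "--data_dir", "./glue_data/MRPC"]
#     )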


class Split(Enum):
    train = "train"
    dev = "dev"
    test = "test"


class GlueDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """

    args: GlueDataTrainingArguments
    output_mode: str
    features: List[InputFeatures]

    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
    ):
        self.args = args
        self.processor = glue_processors[args.task_name]()
        self.output_mode = glue_output_modes[args.task_name]
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            "cached_{}_{}_{}_{}".format(
                mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
            ),
        )
        label_list = self.processor.get_labels()
        if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
            RobertaTokenizer,
            RobertaTokenizerFast,
            XLMRobertaTokenizer,
            BartTokenizer,
            BartTokenizerFast,
        ):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        self.label_list = label_list

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
            else:
                logger.info("Creating features from dataset file at %s", args.data_dir)

                if mode == Split.dev:
                    examples = self.processor.get_dev_examples(args.data_dir)
                elif mode == Split.test:
                    examples = self.processor.get_test_examples(args.data_dir)
                else:
                    examples = self.processor.get_train_examples(args.data_dir)
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        return self.features[i]

    def get_labels(self):
        return self.label_list
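

# A minimal end-to-end sketch (not part of the original file): building the dataset and
# batching it for training. It assumes a local MRPC data dir, the standard transformers
# tokenizer API, and `default_data_collator`, the library helper that batches
# `InputFeatures`; the checkpoint, paths, and batch size are illustrative:
#
#     from torch.utils.data import DataLoader
#     from transformers import AutoTokenizer, default_data_collator
#
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     args = GlueDataTrainingArguments(task_name="mrpc", data_dir="./glue_data/MRPC")
#     train_dataset = GlueDataset(args, tokenizer=tokenizer, mode="train")
#     loader = DataLoader(train_dataset, batch_size=32, collate_fn=default_data_collator)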