CSS-LM

tokenization_albert.py · 342 lines · 13.8 KB
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""


import logging
import os
import unicodedata
from shutil import copyfile
from typing import List, Optional

from .tokenization_utils import PreTrainedTokenizer


logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-spiece.model",
        "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-spiece.model",
        "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-spiece.model",
        "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-spiece.model",
        "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
        "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
        "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
        "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "albert-base-v1": 512,
    "albert-large-v1": 512,
    "albert-xlarge-v1": 512,
    "albert-xxlarge-v1": 512,
    "albert-base-v2": 512,
    "albert-large-v2": 512,
    "albert-xlarge-v2": 512,
    "albert-xxlarge-v2": 512,
}

SPIECE_UNDERLINE = "▁"


class AlbertTokenizer(PreTrainedTokenizer):
    """
    Constructs an ALBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
    Users should refer to the superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`string`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to lowercase the input when tokenizing.
        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to keep accents when tokenizing.
        bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier
            token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning
                of sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.

    Attributes:
        sp_model (:obj:`SentencePieceProcessor`):
            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
            raise

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.sp_model)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
            raise
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs
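
    # Illustrative example of preprocess_text with the default flags
    # (remove_space=True, keep_accents=False, do_lower_case=True); the input
    # string below is hypothetical, not taken from the original code:
    #
    #   tokenizer.preprocess_text("  Héllo   ``World''  ")
    #   # -> 'hello "world"'
    #
    # Extra whitespace is collapsed, LaTeX-style quotes are normalized to '"',
    # accents are removed via NFKD decomposition, and the text is lowercased.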

    def _tokenize(self, text, sample=False):
        """ Tokenize a string. """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces
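
    # Illustrative sketch of the digit/comma handling above. The exact
    # sub-pieces depend on the loaded SentencePiece model, so the outputs
    # shown here are hypothetical:
    #
    #   "▁1924,"  ->  ["▁1924", ","]   # trailing comma split off
    #   "1924,"   ->  ["1924", ","]    # re-encoded piece loses the "▁" that
    #                                  # EncodeAsPieces would prepend
    #
    # Pieces that do not end in a comma preceded by a digit pass through
    # unchanged.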

    def _convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        An ALBERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep
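
    # Illustrative example (the token ids below are placeholders, not real
    # vocabulary entries; cls/sep stand for self.cls_token_id and
    # self.sep_token_id):
    #
    #   build_inputs_with_special_tokens([5, 6])       -> [cls, 5, 6, sep]
    #   build_inputs_with_special_tokens([5, 6], [7])  -> [cls, 5, 6, sep, 7, sep]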

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]
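
    # Illustrative example (placeholder ids, special tokens not yet added):
    #
    #   get_special_tokens_mask([5, 6], [7])
    #   # -> [1, 0, 0, 1, 0, 1]   the [CLS] and both [SEP] positions are 1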

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        An ALBERT sequence pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If :obj:`token_ids_1` is :obj:`None`, only the first portion of the mask (0s) is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """
        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
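
# Minimal usage sketch (not part of the original file). The path below is a
# placeholder; any trained SentencePiece model for ALBERT would do, e.g. the
# spiece.model shipped with a pretrained checkpoint:
#
#   tokenizer = AlbertTokenizer("spiece.model")
#   tokens = tokenizer.tokenize("Hello, world!")        # SentencePiece pieces
#   ids = tokenizer.encode("Hello, world!")             # adds [CLS] ... [SEP]
#   text = tokenizer.convert_tokens_to_string(tokens)   # back to a string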