CSS-LM / tokenization_utils_fast.py
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library).
    For slow (python) tokenizers see tokenization_utils.py
"""

import logging
import os
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

from tokenizers import Encoding as EncodingFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast

from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PaddingStrategy,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)


logger = logging.getLogger(__name__)


@add_end_docstrings(
    INIT_TOKENIZER_DOCSTRING,
    """
    .. automethod:: __call__
    """,
)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't
    have to handle the specific vocabulary augmentation methods of the various underlying
    dictionary structures (BPE, sentencepiece...).
    """

    def __init__(self, tokenizer: BaseTokenizerFast, **kwargs):
        if not isinstance(tokenizer, BaseTokenizerFast):
            raise ValueError(
                "Tokenizer should be an instance of a BaseTokenizer " "provided by HuggingFace tokenizers library."
            )
        self._tokenizer: BaseTokenizerFast = tokenizer

        # We call this after having initialized the backend tokenizer because we update it.
        super().__init__(**kwargs)

    @property
    def is_fast(self) -> bool:
        return True

    @property
    def vocab_size(self) -> int:
        """
        :obj:`int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
        :obj:`token` is in the vocab.

        Returns:
            :obj:`Dict[str, int]`: The vocabulary.
        """
        return self._tokenizer.get_vocab(with_added_tokens=True)

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            :obj:`Dict[str, int]`: The added tokens.
        """
        base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
        full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
        added_vocab = dict((tok, index) for tok, index in full_vocab.items() if tok not in base_vocab)
        return added_vocab
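
    # Illustrative usage sketch (editor's note, not part of the original file): with a
    # concrete fast tokenizer instance `tok` (hypothetical name), the vocabulary helpers
    # above relate as follows:
    #
    #     tok.add_tokens(["<new_token>"])
    #     tok.get_added_vocab()                    # e.g. {"<new_token>": 30522}
    #     assert "<new_token>" in tok.get_vocab()
    #     assert len(tok) == tok.vocab_size + len(tok.get_added_vocab())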

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> BaseTokenizerFast:
        """
        :obj:`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        :obj:`tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer._tokenizer.decoder

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Dict[str, Any]:
        """ Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict.

            Overflowing tokens are converted to additional examples (like batches) so the output values of
            the dict are lists (overflows) of lists (tokens).

            Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)

            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict
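
    # Shape sketch (editor's note, assumed from the logic above, not in the original file):
    # for an encoding that truncated into one extra overflow window, the returned dict
    # would look roughly like
    #
    #     {
    #         "input_ids":      [[101, 2023, ..., 102], [101, 2062, ..., 102]],
    #         "attention_mask": [[1, 1, ..., 1],        [1, 1, ..., 1]],
    #     }
    #
    # i.e. one inner list per overflow window: (overflows, sequence length).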

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) to a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        .. note::
            This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
            put this inside your training loop.

        Args:
            pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            :obj:`int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)
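
    # Illustrative sketch (editor's note, not part of the original file): for a
    # hypothetical BERT-like fast tokenizer `tok`, this typically reports
    #
    #     tok.num_special_tokens_to_add(pair=False)  # e.g. 2  ([CLS] ... [SEP])
    #     tok.num_special_tokens_to_add(pair=True)   # e.g. 3  ([CLS] ... [SEP] ... [SEP])
    #
    # which is useful for budgeting max_length before calling the encoding methods.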

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices to a token or a sequence of tokens, using the vocabulary
        and added tokens.

        Args:
            ids (:obj:`int` or :obj:`List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            :obj:`str` or :obj:`List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens
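
    # Round-trip sketch (editor's note, not part of the original file), with a
    # hypothetical tokenizer instance `tok`:
    #
    #     ids = tok.convert_tokens_to_ids(["hello", "world"])
    #     tok.convert_ids_to_tokens(ids)                            # ["hello", "world"]
    #     tok.convert_ids_to_tokens(ids, skip_special_tokens=True)  # special ids are dropped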

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False) -> List[str]:
        """
        Converts a string to a sequence of tokens, using the backend Rust tokenizer.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.
            pair (:obj:`str`, `optional`):
                A second sequence to be encoded with the first.
            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to add the special tokens associated with the corresponding model.

        Returns:
            :obj:`List[str]`: The list of tokens.
        """
        return self._tokenizer.encode(text, pair, add_special_tokens=add_special_tokens).tokens
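
    # Illustrative sketch (editor's note, not part of the original file): tokenize() only
    # returns token strings; it does not pad, truncate, or build model inputs. For a
    # hypothetical WordPiece-based fast tokenizer `tok`:
    #
    #     tok.tokenize("Hello world!")                           # e.g. ["hello", "world", "!"]
    #     tok.tokenize("Hello world!", add_special_tokens=True)  # e.g. ["[CLS]", "hello", ..., "[SEP]"]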

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer had a
        padding / truncation strategy set before, it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy (:class:`~transformers.tokenization_utils_base.PaddingStrategy`):
                The kind of padding that will be applied to the input
            truncation_strategy (:class:`~transformers.tokenization_utils_base.TruncationStrategy`):
                The kind of truncation that will be applied to the input
            max_length (:obj:`int`):
                The maximum size of a sequence.
            stride (:obj:`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        """
        # Set truncation and padding on the backend tokenizer
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
            self._tokenizer.enable_truncation(max_length, stride=stride, strategy=truncation_strategy.value)
        else:
            self._tokenizer.no_truncation()

        if padding_strategy != PaddingStrategy.DO_NOT_PAD:
            self._tokenizer.enable_padding(
                length=max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None,
                direction=self.padding_side,
                pad_id=self.pad_token_id,
                pad_type_id=self.pad_token_type_id,
                pad_token=self.pad_token,
                pad_to_multiple_of=pad_to_multiple_of,
            )
        else:
            self._tokenizer.no_padding()
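
    # Illustrative sketch (editor's note, not part of the original file): this is how the
    # batching methods below configure the backend before encoding, e.g.
    #
    #     self.set_truncation_and_padding(
    #         padding_strategy=PaddingStrategy.MAX_LENGTH,
    #         truncation_strategy=TruncationStrategy.LONGEST_FIRST,
    #         max_length=128,
    #         stride=0,
    #         pad_to_multiple_of=8,  # e.g. to keep shapes Tensor Core friendly
    #     )
    #
    # after which every encode/encode_batch call pads to 128 and truncates longest-first.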

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:

        if not isinstance(batch_text_or_text_pairs, list):
            raise ValueError(
                "batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs))
            )

        if kwargs:
            raise ValueError(f"Keyword arguments {kwargs} not recognized.")

        # Set the truncation and padding strategy and restore the initial configuration
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
        )

        # Avoid thread overhead if only one example.
        if len(batch_text_or_text_pairs) == 1:
            if isinstance(batch_text_or_text_pairs[0], tuple):
                # We got a Tuple with a pair of sequences
                encodings = self._tokenizer.encode(
                    *batch_text_or_text_pairs[0],
                    add_special_tokens=add_special_tokens,
                    is_pretokenized=is_pretokenized,
                )
            else:
                # We got a single sequence
                encodings = self._tokenizer.encode(
                    batch_text_or_text_pairs[0],
                    add_special_tokens=add_special_tokens,
                    is_pretokenized=is_pretokenized,
                )
            encodings = [encodings]
        else:
            encodings = self._tokenizer.encode_batch(
                batch_text_or_text_pairs, add_special_tokens=add_special_tokens, is_pretokenized=is_pretokenized
            )

        # Convert encoding to dict
        # `Tokens` has type: List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]]
        # with nested dimensions corresponding to batch, overflows, sequence length
        tokens = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Convert the output to have dict[list] from list[dict]
        sanitized = {}
        for key in tokens[0].keys():
            # To List[List[List[int]]] of shape (batch, overflows, sequence length)
            stack = [e for item in tokens for e in item[key]]
            sanitized[key] = stack

        # If returning overflowing tokens, we need to return a mapping
        # from the batch idx to the original sample
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, enc in enumerate(tokens):
                overflow_to_sample_mapping += [i] * len(enc["input_ids"])
            sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        return BatchEncoding(sanitized, encodings, tensor_type=return_tensors)
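
    # Overflow mapping sketch (editor's note, not part of the original file): if two input
    # texts each split into overflow windows (say 2 and 1), the flattened batch has 3 rows
    # and, with return_overflowing_tokens=True,
    #
    #     sanitized["overflow_to_sample_mapping"] == [0, 0, 1]
    #
    # so each output row can be traced back to the sample it came from.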

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:

        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_pretokenized=is_pretokenized,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # If return_tensors is None, we can remove the leading batch axis
        # Overflowing tokens are returned as a batch of outputs, so we keep them in that case
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        return batched_output

    def decode(
        self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
    ) -> str:
        """
        Converts a sequence of ids to a string, using the tokenizer and vocabulary
        with options to remove special tokens and clean up tokenization spaces.

        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
            token_ids (:obj:`List[int]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.

        Returns:
            :obj:`str`: The decoded sentence.
        """
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
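
    # Illustrative sketch (editor's note, not part of the original file), again with a
    # hypothetical instance `tok`:
    #
    #     ids = tok("Hello world!")["input_ids"]
    #     tok.decode(ids)                            # e.g. "[CLS] hello world! [SEP]"
    #     tok.decode(ids, skip_special_tokens=True)  # e.g. "hello world!"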

    def save_vocabulary(self, save_directory: str) -> Tuple[str]:
        """
        Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
        and special token mappings.

        .. warning::
            Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
            you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.

        Args:
            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.

        Returns:
            A tuple of :obj:`str`: The files saved.
        """
        if os.path.isdir(save_directory):
            files = self._tokenizer.save_model(save_directory)
        else:
            folder, file = os.path.split(os.path.abspath(save_directory))
            files = self._tokenizer.save_model(folder, name=file)

        return tuple(files)
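

# Illustrative end-to-end sketch (editor's note, not part of the original file). With a
# concrete subclass such as BertTokenizerFast (assumed available in this package), the
# class above is typically used as:
#
#     tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
#     enc = tokenizer("Hello world!", padding="max_length", max_length=16)
#     enc["input_ids"], enc["attention_mask"]
#     tokenizer.save_vocabulary("./vocab_out")   # vocabulary only; prefer save_pretrained()
#     tokenizer.save_pretrained("./tok_out")     # full tokenizer state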