CSS-LM

tokenization_utils_base.py
2692 строки · 124.7 Кб
Перенос по словам
1
# coding=utf-8
2
# Copyright 2020 The HuggingFace Inc. team.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
""" Base classes common to both the slow and the fast tokenization classes:
16
    PreTrainedTokenizerBase (host all the user fronting encoding methodes)
17
    Special token mixing (host the special tokens logic) and
18
    BatchEncoding (wrap the dictionnary of output with special method for the Fast tokenizers)
19
"""
20

21
import copy
22
import json
23
import logging
24
import os
25
import warnings
26
from collections import UserDict
27
from enum import Enum
28
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
29

30
import numpy as np
31
from tokenizers import AddedToken
32
from tokenizers import Encoding as EncodingFast
33

34
from .file_utils import (
35
    add_end_docstrings,
36
    cached_path,
37
    hf_bucket_url,
38
    is_remote_url,
39
    is_tf_available,
40
    is_torch_available,
41
    torch_required,
42
)
43

44

45
if is_tf_available():
46
    import tensorflow as tf
47
if is_torch_available():
48
    import torch
49

50

51
logger = logging.getLogger(__name__)
52

53
VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
54
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER
55

56
# Define type aliases and NamedTuples
57
TextInput = str
58
PreTokenizedInput = List[str]
59
EncodedInput = List[int]
60
TextInputPair = Tuple[str, str]
61
PreTokenizedInputPair = Tuple[List[str], List[str]]
62
EncodedInputPair = Tuple[List[int], List[int]]
63

64

65
# Slow tokenizers used to be saved in three separated files
66
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
67
ADDED_TOKENS_FILE = "added_tokens.json"
68
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
69

70
# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
71
FULL_TOKENIZER_FILE = "tokenizer.json"
72

73

74
class ExplicitEnum(Enum):
75
    """
76
    Enum with more explicit error message for missing values.
77
    """
78

79
    @classmethod
80
    def _missing_(cls, value):
81
        raise ValueError(
82
            "%r is not a valid %s, please select one of %s"
83
            % (value, cls.__name__, str(list(cls._value2member_map_.keys())))
84
        )
85

86

87
class TruncationStrategy(ExplicitEnum):
88
    """
89
    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
90
    Useful for tab-completion in an IDE.
91
    """
92

93
    ONLY_FIRST = "only_first"
94
    ONLY_SECOND = "only_second"
95
    LONGEST_FIRST = "longest_first"
96
    DO_NOT_TRUNCATE = "do_not_truncate"
97

98

99
class PaddingStrategy(ExplicitEnum):
100
    """
101
    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
102
    Useful for tab-completion in an IDE.
103
    """
104

105
    LONGEST = "longest"
106
    MAX_LENGTH = "max_length"
107
    DO_NOT_PAD = "do_not_pad"
108

109

110
class TensorType(ExplicitEnum):
111
    """
112
    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
113
    Useful for tab-completion in an IDE.
114
    """
115

116
    PYTORCH = "pt"
117
    TENSORFLOW = "tf"
118
    NUMPY = "np"
119

120

121
class CharSpan(NamedTuple):
122
    """
123
    Character span in the original string.
124

125
    Args:
126
        start (:obj:`int`): Index of the first character in the original string.
127
        end (:obj:`int`): Index of the character following the last character in the original string.
128
    """
129

130
    start: int
131
    end: int
132

133

134
class TokenSpan(NamedTuple):
135
    """
136
    Token span in an encoded string (list of tokens).
137

138
    Args:
139
        start (:obj:`int`): Index of the first token in the span.
140
        end (:obj:`int`): Index of the token following the last token in the span.
141
    """
142

143
    start: int
144
    end: int
145

146

147
class BatchEncoding(UserDict):
148
    """
149
    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`
150
    and :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
151
    attention_masks, etc).
152

153
    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
154
    utility methods to map from word/character space to token space.
155

156
    Args:
157
        data (:obj:`dict`):
158
            Dictionary of lists/arrays/tensors returned by the encode/batch_encode methods ('input_ids',
159
            'attention_mask', etc.).
160
        encoding (:obj:`tokenizers.Encoding` or :obj:`Sequence[tokenizers.Encoding]`, `optional`):
161
            If the tokenizer is a fast tokenizer which outputs additional informations like mapping from word/character
162
            space to token space the :obj:`tokenizers.Encoding` instance or list of instance (for batches) hold these
163
            informations.
164
        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
165
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
166
            initialization.
167
        prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
168
            Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
169
    """
170

171
    def __init__(
172
        self,
173
        data: Optional[Dict[str, Any]] = None,
174
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
175
        tensor_type: Union[None, str, TensorType] = None,
176
        prepend_batch_axis: bool = False,
177
    ):
178
        super().__init__(data)
179

180
        if isinstance(encoding, EncodingFast):
181
            encoding = [encoding]
182

183
        self._encodings = encoding
184

185
        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
186

187
    @property
188
    def is_fast(self) -> bool:
189
        """
190
        :obj:`bool`: Indicate whether this :class:`~transformers.BatchEncoding` was generated from the result of a
191
        :class:`~transformers.PreTrainedTokenizerFast` or not.
192
        """
193
        return self._encodings is not None
194

195
    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
196
        """
197
        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids',
198
        'attention_mask', etc.).
199

200
        If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
201
        """
202
        if isinstance(item, str):
203
            return self.data[item]
204
        elif self._encodings is not None:
205
            return self._encodings[item]
206
        else:
207
            raise KeyError(
208
                "Indexing with integers (to access backend Encoding for a given batch index) "
209
                "is not available when using Python based tokenizers"
210
            )
211

212
    def __getattr__(self, item: str):
213
        try:
214
            return self.data[item]
215
        except KeyError:
216
            raise AttributeError
217

218
    def __getstate__(self):
219
        return {"data": self.data, "encodings": self._encodings}
220

221
    def __setstate__(self, state):
222
        if "data" in state:
223
            self.data = state["data"]
224

225
        if "encodings" in state:
226
            self._encodings = state["encodings"]
227

228
    def keys(self):
229
        return self.data.keys()
230

231
    def values(self):
232
        return self.data.values()
233

234
    def items(self):
235
        return self.data.items()
236

237
    # After this point:
238
    # Extended properties and methods only available for fast (Rust-based) tokenizers
239
    # provided by HuggingFace tokenizers library.
240

241
    @property
242
    def encodings(self) -> Optional[List[EncodingFast]]:
243
        """
244
        :obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process.
245
        Returns :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
246
        """
247
        return self._encodings
248

249
    def tokens(self, batch_index: int = 0) -> List[str]:
250
        """
251
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before converstion
252
        to integer indices) at a given batch index (only works for the output of a fast tokenizer).
253

254
        Args:
255
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
256

257
        Returns:
258
            :obj:`List[str]`: The list of tokens at that index.
259
        """
260
        if not self._encodings:
261
            raise ValueError("tokens() is not available when using Python-based tokenizers")
262
        return self._encodings[batch_index].tokens
263

264
    def words(self, batch_index: int = 0) -> List[Optional[int]]:
265
        """
266
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
267

268
        Args:
269
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
270

271
        Returns:
272
            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
273
            the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
274
            word (several tokens will be mapped to the same word index if they are parts of that word).
275
        """
276
        if not self._encodings:
277
            raise ValueError("words() is not available when using Python-based tokenizers")
278
        return self._encodings[batch_index].words
279

280
    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
281
        """
282
        Get the index of the word corresponding (i.e. comprising) to an encoded token
283
        in a sequence of the batch.
284

285
        Can be called as:
286

287
        - ``self.token_to_word(token_index)`` if batch size is 1
288
        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1
289

290
        This method is particularly suited when the input sequences are provided as
291
        pre-tokenized sequences (i.e., words are defined by the user). In this case it allows
292
        to easily associate encoded tokens with provided tokenized words.
293

294
        Args:
295
            batch_or_token_index (:obj:`int`):
296
                Index of the sequence in the batch. If the batch only comprise one sequence,
297
                this can be the index of the token in the sequence.
298
            token_index (:obj:`int`, `optional`):
299
                If a batch index is provided in `batch_or_token_index`, this can be the index
300
                of the token in the sequence.
301

302
        Returns:
303
            :obj:`int`: Index of the word in the input sequence.
304
        """
305

306
        if not self._encodings:
307
            raise ValueError("token_to_word() is not available when using Python based tokenizers")
308
        if token_index is not None:
309
            batch_index = batch_or_token_index
310
        else:
311
            batch_index = 0
312
            token_index = batch_or_token_index
313
        if batch_index < 0:
314
            batch_index = self._batch_size + batch_index
315
        if token_index < 0:
316
            token_index = self._seq_len + token_index
317
        return self._encodings[batch_index].token_to_word(token_index)
318

319
    def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan:
320
        """
321
        Get the encoded token span corresponding to a word in the sequence of the batch.
322

323
        Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:
324

325
        - **start** -- Index of the first token.
326
        - **end** -- Index of the token following the last token.
327

328
        Can be called as:
329

330
        - ``self.word_to_tokens(word_index)`` if batch size is 1
331
        - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1
332

333
        This method is particularly suited when the input sequences are provided as
334
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
335
        to easily associate encoded tokens with provided tokenized words.
336

337
        Args:
338
            batch_or_word_index (:obj:`int`):
339
                Index of the sequence in the batch. If the batch only comprises one sequence,
340
                this can be the index of the word in the sequence.
341
            word_index (:obj:`int`, `optional`):
342
                If a batch index is provided in `batch_or_token_index`, this can be the index
343
                of the word in the sequence.
344

345
        Returns:
346
            :class:`~transformers.tokenization_utils_base.TokenSpan`
347
            Span of tokens in the encoded sequence.
348
        """
349

350
        if not self._encodings:
351
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
352
        if word_index is not None:
353
            batch_index = batch_or_word_index
354
        else:
355
            batch_index = 0
356
            word_index = batch_or_word_index
357
        if batch_index < 0:
358
            batch_index = self._batch_size + batch_index
359
        if word_index < 0:
360
            word_index = self._seq_len + word_index
361
        return TokenSpan(*(self._encodings[batch_index].word_to_tokens(word_index)))
362

363
    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
364
        """
365
        Get the character span corresponding to an encoded token in a sequence of the batch.
366

367
        Character spans are returned as a :class:`~transformers.tokenization_utils_base.CharSpan` with:
368

369
        - **start** -- Index of the first character in the original string associated to the token.
370
        - **end** -- Index of the character following the last character in the original string associated to the
371
          token.
372

373
        Can be called as:
374

375
        - ``self.token_to_chars(token_index)`` if batch size is 1
376
        - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1
377

378
        Args:
379
            batch_or_token_index (:obj:`int`):
380
                Index of the sequence in the batch. If the batch only comprise one sequence,
381
                this can be the index of the token in the sequence.
382
            token_index (:obj:`int`, `optional`):
383
                If a batch index is provided in `batch_or_token_index`, this can be the index
384
                of the token or tokens in the sequence.
385

386
        Returns:
387
            :class:`~transformers.tokenization_utils_base.CharSpan`:
388
            Span of characters in the original string.
389
        """
390

391
        if not self._encodings:
392
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
393
        if token_index is not None:
394
            batch_index = batch_or_token_index
395
        else:
396
            batch_index = 0
397
            token_index = batch_or_token_index
398
        return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))
399

400
    def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
401
        """
402
        Get the index of the token in the encoded output comprising a character
403
        in the original string for a sequence of the batch.
404

405
        Can be called as:
406

407
        - ``self.char_to_token(char_index)`` if batch size is 1
408
        - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1
409

410
        This method is particularly suited when the input sequences are provided as
411
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
412
        to easily associate encoded tokens with provided tokenized words.
413

414
        Args:
415
            batch_or_char_index (:obj:`int`):
416
                Index of the sequence in the batch. If the batch only comprise one sequence,
417
                this can be the index of the word in the sequence
418
            char_index (:obj:`int`, `optional`):
419
                If a batch index is provided in `batch_or_token_index`, this can be the index
420
                of the word in the sequence.
421

422

423
        Returns:
424
            :obj:`int`: Index of the token.
425
        """
426

427
        if not self._encodings:
428
            raise ValueError("char_to_token() is not available when using Python based tokenizers")
429
        if char_index is not None:
430
            batch_index = batch_or_char_index
431
        else:
432
            batch_index = 0
433
            char_index = batch_or_char_index
434
        return self._encodings[batch_index].char_to_token(char_index)
435

436
    def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
437
        """
438
        Get the character span in the original string corresponding to given word in a sequence
439
        of the batch.
440

441
        Character spans are returned as a CharSpan NamedTuple with:
442

443
        - start: index of the first character in the original string
444
        - end: index of the character following the last character in the original string
445

446
        Can be called as:
447

448
        - ``self.word_to_chars(word_index)`` if batch size is 1
449
        - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1
450

451
        Args:
452
            batch_or_word_index (:obj:`int`):
453
                Index of the sequence in the batch. If the batch only comprise one sequence,
454
                this can be the index of the word in the sequence
455
            word_index (:obj:`int`, `optional`):
456
                If a batch index is provided in `batch_or_token_index`, this can be the index
457
                of the word in the sequence.
458

459
        Returns:
460
            :obj:`CharSpan` or :obj:`List[CharSpan]`:
461
                Span(s) of the associated character or characters in the string.
462
                CharSpan are NamedTuple with:
463

464
                - start: index of the first character associated to the token in the original string
465
                - end: index of the character following the last character associated to the token in the original string
466
        """
467

468
        if not self._encodings:
469
            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
470
        if word_index is not None:
471
            batch_index = batch_or_word_index
472
        else:
473
            batch_index = 0
474
            word_index = batch_or_word_index
475
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index)))
476

477
    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
478
        """
479
        Get the word in the original string corresponding to a character in the original string of
480
        a sequence of the batch.
481

482
        Can be called as:
483

484
        - ``self.char_to_word(char_index)`` if batch size is 1
485
        - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1
486

487
        This method is particularly suited when the input sequences are provided as
488
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
489
        to easily associate encoded tokens with provided tokenized words.
490

491
        Args:
492
            batch_or_char_index (:obj:`int`):
493
                Index of the sequence in the batch. If the batch only comprise one sequence,
494
                this can be the index of the character in the orginal string.
495
            char_index (:obj:`int`, `optional`):
496
                If a batch index is provided in `batch_or_token_index`, this can be the index
497
                of the character in the orginal string.
498

499

500
        Returns:
501
            :obj:`int` or :obj:`List[int]`:
502
                Index or indices of the associated encoded token(s).
503
        """
504

505
        if not self._encodings:
506
            raise ValueError("char_to_word() is not available when using Python based tokenizers")
507
        if char_index is not None:
508
            batch_index = batch_or_char_index
509
        else:
510
            batch_index = 0
511
            char_index = batch_or_char_index
512
        return self._encodings[batch_index].char_to_word(char_index)
513

514
    def convert_to_tensors(
515
        self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
516
    ):
517
        """
518
        Convert the inner content to tensors.
519

520
        Args:
521
            tensor_type (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
522
                The type of tensors to use. If :obj:`str`, should be one of the values of the enum
523
                :class:`~transformers.tokenization_utils_base.TensorType`. If :obj:`None`, no modification is done.
524
            prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`):
525
                Whether or not to add the batch dimension during the conversion.
526
        """
527
        if tensor_type is None:
528
            return self
529

530
        # Convert to TensorType
531
        if not isinstance(tensor_type, TensorType):
532
            tensor_type = TensorType(tensor_type)
533

534
        # Get a function reference for the correct framework
535
        if tensor_type == TensorType.TENSORFLOW and is_tf_available():
536
            as_tensor = tf.constant
537
        elif tensor_type == TensorType.PYTORCH and is_torch_available():
538
            as_tensor = torch.tensor
539
        elif tensor_type == TensorType.NUMPY:
540
            as_tensor = np.asarray
541
        else:
542
            raise ImportError(
543
                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
544
                    tensor_type
545
                )
546
            )
547

548
        # Do the tensor conversion in batch
549
        for key, value in self.items():
550
            try:
551
                if prepend_batch_axis:
552
                    value = [value]
553

554
                tensor = as_tensor(value)
555

556
                # at-least2d
557
                if tensor.ndim > 2:
558
                    tensor = tensor.squeeze(0)
559
                elif tensor.ndim < 2:
560
                    tensor = tensor[None, :]
561

562
                self[key] = tensor
563
            except:  # noqa E722
564
                if key == "overflowing_tokens":
565
                    raise ValueError(
566
                        "Unable to create tensor returning overflowing tokens of different lengths. "
567
                        "Please see if a fast version of this tokenizer is available to have this feature available."
568
                    )
569
                raise ValueError(
570
                    "Unable to create tensor, you should probably activate truncation and/or padding "
571
                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
572
                )
573

574
        return self
575

576
    @torch_required
577
    def to(self, device: str) -> "BatchEncoding":
578
        """
579
        Send all values to device by calling :obj:`v.to(device)` (PyTorch only).
580

581
        Args:
582
            device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.
583

584
        Returns:
585
            :class:`~transformers.BatchEncoding`:
586
            The same instance of :class:`~transformers.BatchEncoding` after modification.
587
        """
588
        self.data = {k: v.to(device) for k, v in self.data.items()}
589
        return self
590

591

592
# class AddedToken(UserString):
593
#     """ AddedToken represents a token to be added to a Tokenizer
594

595
#         An AddedToken can have special options defining the way it should behave.
596

597
#         Args:
598
#             content: str:
599
#                 The content of the token
600

601
#             single_word: bool
602
#                 Whether this token should only match against single word. If True,
603
#                 this token will never match inside of a word.
604

605
#             lstrip: bool
606
#                 Whether this token should strip all potential whitespaces on the left side.
607
#                 If True, this token will greedily match any whitespace on the left and then strip
608
#                 them out.
609

610
#             rstrip: bool
611
#                 Whether this token should strip all potential whitespaces on the right side.
612
#                 If True, this token will greedily match any whitespace on the right and then strip
613
#                 them out.
614
#     """
615

616
#     def __init__(
617
#         self, data: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
618
#     ):
619
#         super().__init__(data)
620

621
#         self._single_word = single_word
622
#         self._lstrip = lstrip
623
#         self._rstrip = rstrip
624

625
#     def lower(self):
626
#         return AddedToken(self.data.lower(), self._single_word, self._lstrip, self._rstrip)
627

628

629
class SpecialTokensMixin:
630
    """
631
    A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
632
    to handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be
633
    used to directly access these special tokens in a model-independant manner and allow to set and update the special
634
    tokens.
635

636
    Args:
637
        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
638
            A special token representing the beginning of a sentence.
639
        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
640
            A special token representing the end of a sentence.
641
        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
642
            A special token representing an out-of-vocabulary token.
643
        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
644
            A special token separating two different sentences in the same input (used by BERT for instance).
645
        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
646
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
647
            attention mechanisms or loss computation.
648
        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
649
            A special token representing the class of the input (used by BERT for instance).
650
        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
651
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
652
            BERT).
653
        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
654
            A tuple or a list of additional special tokens.
655
    """
656

657
    SPECIAL_TOKENS_ATTRIBUTES = [
658
        "bos_token",
659
        "eos_token",
660
        "unk_token",
661
        "sep_token",
662
        "pad_token",
663
        "cls_token",
664
        "mask_token",
665
        "additional_special_tokens",
666
    ]
667

668
    def __init__(self, verbose=True, **kwargs):
669
        self._bos_token = None
670
        self._eos_token = None
671
        self._unk_token = None
672
        self._sep_token = None
673
        self._pad_token = None
674
        self._cls_token = None
675
        self._mask_token = None
676
        self._pad_token_type_id = 0
677
        self._additional_special_tokens = []
678
        self.verbose = verbose
679

680
        # We directly set the hidden value to allow initialization with special tokens
681
        # which are not yet in the vocabulary. Necesssary for serialization/de-serialization
682
        # TODO clean this up at some point (probably by sitching to fast tokenizers)
683
        for key, value in kwargs.items():
684
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
685
                if key == "additional_special_tokens":
686
                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
687
                    setattr(self, key, value)
688
                elif isinstance(value, (str, AddedToken)):
689
                    setattr(self, key, value)
690
                else:
691
                    raise TypeError(
692
                        "special token {} has to be either str or AddedToken but got: {}".format(key, type(value))
693
                    )
694

695
    def sanitize_special_tokens(self) -> int:
696
        """
697
        Make sure that all the special tokens attributes of the tokenizer (:obj:`tokenizer.mask_token`,
698
        :obj:`tokenizer.cls_token`, etc.) are in the vocabulary.
699

700
        Add the missing ones to the vocabulary if needed.
701

702
        Return:
703
            :obj:`int`: The number of tokens added in the vocaulary during the operation.
704
        """
705
        return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
706

707
    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
708
        """
709
        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
710
        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
711
        current vocabulary).
712

713
        Using : obj:`add_special_tokens` will ensure your special tokens can be used in several ways:
714

715
        - Special tokens are carefully handled by the tokenizer (they are never split).
716
        - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`. This
717
          makes it easy to develop model-agnostic training and fine-tuning scripts.
718

719
        When possible, special tokens are already registered for provided pretrained models (for instance
720
        :class:`~transformers.BertTokenizer` :obj:`cls_token` is already registered to be :obj`'[CLS]'` and XLM's one
721
        is also registered to be :obj:`'</s>'`).
722

723
        Args:
724
            special_tokens_dict (dictionary `str` to `str` or :obj:`tokenizers.AddedToken`):
725
                Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``,
726
                ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
727
                ``additional_special_tokens``].
728

729
                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
730
                assign the index of the ``unk_token`` to them).
731

732
        Returns:
733
            :obj:`int`: Number of tokens added to the vocabulary.
734

735
        Examples::
736

737
            # Let's see how to add a new classification token to GPT-2
738
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
739
            model = GPT2Model.from_pretrained('gpt2')
740

741
            special_tokens_dict = {'cls_token': '<CLS>'}
742

743
            num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
744
            print('We have added', num_added_toks, 'tokens')
745
            # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
746
            model.resize_token_embeddings(len(tokenizer))
747

748
            assert tokenizer.cls_token == '<CLS>'
749
        """
750
        if not special_tokens_dict:
751
            return 0
752

753
        added_tokens = 0
754
        for key, value in special_tokens_dict.items():
755
            assert key in self.SPECIAL_TOKENS_ATTRIBUTES
756

757
            if self.verbose:
758
                logger.info("Assigning %s to the %s key of the tokenizer", value, key)
759
            setattr(self, key, value)
760

761
            if key == "additional_special_tokens":
762
                assert isinstance(value, (list, tuple)) and all(
763
                    isinstance(t, (str, AddedToken)) for t in value
764
                ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
765
                added_tokens += self.add_tokens(value, special_tokens=True)
766
            else:
767
                assert isinstance(
768
                    value, (str, AddedToken)
769
                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
770
                added_tokens += self.add_tokens([value], special_tokens=True)
771

772
        return added_tokens
773

774
    def add_tokens(
775
        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
776
    ) -> int:
777
        """
778
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
779
        it with indices starting from length of the current vocabulary.
780

781
        Args:
782
            new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`):
783
                Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a
784
                string token to let you personalize its behavior: whether this token should only match against a single
785
                word, whether this token should strip all potential whitespaces on the left side, whether this token
786
                should strip all potential whitespaces on the right side, etc.
787
            special_token (:obj:`bool`, `optional`, defaults to :obj:`False`):
788
                Can be used to specify if the token is a special token. This mostly change the normalization behavior
789
                (special tokens like CLS or [MASK] are usually not lower-cased for instance).
790

791
                See details for :obj:`tokenizers.AddedToken` in HuggingFace tokenizers library.
792

793
        Returns:
794
            :obj:`int`: Number of tokens added to the vocabulary.
795

796
        Examples::
797

798
            # Let's see how to increase the vocabulary of Bert model and tokenizer
799
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
800
            model = BertModel.from_pretrained('bert-base-uncased')
801

802
            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
803
            print('We have added', num_added_toks, 'tokens')
804
             # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
805
            model.resize_token_embeddings(len(tokenizer))
806
        """
807
        if not new_tokens:
808
            return 0
809

810
        if not isinstance(new_tokens, (list, tuple)):
811
            new_tokens = [new_tokens]
812

813
        return self._add_tokens(new_tokens, special_tokens=special_tokens)
814

815
    @property
816
    def bos_token(self) -> str:
817
        """
818
        :obj:`str`: Beginning of sentence token. Log an error if used while not having been set.
819
        """
820
        if self._bos_token is None and self.verbose:
821
            logger.error("Using bos_token, but it is not set yet.")
822
            return None
823
        return str(self._bos_token)
824

825
    @property
826
    def eos_token(self) -> str:
827
        """
828
        :obj:`str`: End of sentence token. Log an error if used while not having been set.
829
        """
830
        if self._eos_token is None and self.verbose:
831
            logger.error("Using eos_token, but it is not set yet.")
832
            return None
833
        return str(self._eos_token)
834

835
    @property
836
    def unk_token(self) -> str:
837
        """
838
        :obj:`str`: Unknown token. Log an error if used while not having been set.
839
        """
840
        if self._unk_token is None and self.verbose:
841
            logger.error("Using unk_token, but it is not set yet.")
842
            return None
843
        return str(self._unk_token)
844

845
    @property
846
    def sep_token(self) -> str:
847
        """
848
        :obj:`str`: Separation token, to separate context and query in an input sequence.
849
        Log an error if used while not having been set.
850
        """
851
        if self._sep_token is None and self.verbose:
852
            logger.error("Using sep_token, but it is not set yet.")
853
            return None
854
        return str(self._sep_token)
855

856
    @property
857
    def pad_token(self) -> str:
858
        """
859
        :obj:`str`: Padding token. Log an error if used while not having been set.
860
        """
861
        if self._pad_token is None and self.verbose:
862
            logger.error("Using pad_token, but it is not set yet.")
863
            return None
864
        return str(self._pad_token)
865

866
    @property
867
    def cls_token(self) -> str:
868
        """
869
        :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along
870
        the full depth of the model. Log an error if used while not having been set.
871
        """
872
        if self._cls_token is None and self.verbose:
873
            logger.error("Using cls_token, but it is not set yet.")
874
            return None
875
        return str(self._cls_token)
876

877
    @property
878
    def mask_token(self) -> str:
879
        """
880
        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
881
        not having been set.
882
        """
883
        if self._mask_token is None and self.verbose:
884
            logger.error("Using mask_token, but it is not set yet.")
885
            return None
886
        return str(self._mask_token)
887

888
    @property
889
    def additional_special_tokens(self) -> List[str]:
890
        """
891
        :obj:`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having
892
        been set.
893
        """
894
        if self._additional_special_tokens is None and self.verbose:
895
            logger.error("Using additional_special_tokens, but it is not set yet.")
896
            return None
897
        return [str(tok) for tok in self._additional_special_tokens]
898

899
    @bos_token.setter
900
    def bos_token(self, value):
901
        self._bos_token = value
902

903
    @eos_token.setter
904
    def eos_token(self, value):
905
        self._eos_token = value
906

907
    @unk_token.setter
908
    def unk_token(self, value):
909
        self._unk_token = value
910

911
    @sep_token.setter
912
    def sep_token(self, value):
913
        self._sep_token = value
914

915
    @pad_token.setter
916
    def pad_token(self, value):
917
        self._pad_token = value
918

919
    @cls_token.setter
920
    def cls_token(self, value):
921
        self._cls_token = value
922

923
    @mask_token.setter
924
    def mask_token(self, value):
925
        self._mask_token = value
926

927
    @additional_special_tokens.setter
928
    def additional_special_tokens(self, value):
929
        self._additional_special_tokens = value
930

931
    @property
932
    def bos_token_id(self) -> Optional[int]:
933
        """
934
        :obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns :obj:`None` if the token
935
        has not been set.
936
        """
937
        if self._bos_token is None:
938
            return None
939
        return self.convert_tokens_to_ids(self.bos_token)
940

941
    @property
942
    def eos_token_id(self) -> Optional[int]:
943
        """
944
        :obj:`Optional[int]`: Id of the end of sentence token in the vocabulary. Returns :obj:`None` if the token has
945
        not been set.
946
        """
947
        if self._eos_token is None:
948
            return None
949
        return self.convert_tokens_to_ids(self.eos_token)
950

951
    @property
952
    def unk_token_id(self) -> Optional[int]:
953
        """
954
        :obj:`Optional[int]`: Id of the unknown token in the vocabulary. Returns :obj:`None` if the token has not been
955
        set.
956
        """
957
        if self._unk_token is None:
958
            return None
959
        return self.convert_tokens_to_ids(self.unk_token)
960

961
    @property
962
    def sep_token_id(self) -> Optional[int]:
963
        """
964
        :obj:`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
965
        sequence. Returns :obj:`None` if the token has not been set.
966
        """
967
        if self._sep_token is None:
968
            return None
969
        return self.convert_tokens_to_ids(self.sep_token)
970

971
    @property
972
    def pad_token_id(self) -> Optional[int]:
973
        """
974
        :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
975
        set.
976
        """
977
        if self._pad_token is None:
978
            return None
979
        return self.convert_tokens_to_ids(self.pad_token)
980

981
    @property
982
    def pad_token_type_id(self) -> int:
983
        """
984
        :obj:`int`: Id of the padding token type in the vocabulary.
985
        """
986
        return self._pad_token_type_id
987

988
    @property
989
    def cls_token_id(self) -> Optional[int]:
990
        """
991
        :obj:`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input
992
        sequence leveraging self-attention along the full depth of the model.
993

994
        Returns :obj:`None` if the token has not been set.
995
        """
996
        if self._cls_token is None:
997
            return None
998
        return self.convert_tokens_to_ids(self.cls_token)
999

1000
    @property
1001
    def mask_token_id(self) -> Optional[int]:
1002
        """
1003
        :obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
1004
        modeling. Returns :obj:`None` if the token has not been set.
1005
        """
1006
        if self._mask_token is None:
1007
            return None
1008
        return self.convert_tokens_to_ids(self.mask_token)
1009

1010
    @property
1011
    def additional_special_tokens_ids(self) -> List[int]:
1012
        """
1013
        :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary.
1014
        Log an error if used while not having been set.
1015
        """
1016
        return self.convert_tokens_to_ids(self.additional_special_tokens)
1017

1018
    @property
1019
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
1020
        """
1021
        :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes
1022
        (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
1023

1024
        Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
1025
        """
1026
        set_attr = {}
1027
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
1028
            attr_value = getattr(self, "_" + attr)
1029
            if attr_value:
1030
                set_attr[attr] = str(attr_value)
1031
        return set_attr
1032

1033
    @property
1034
    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
1035
        """
1036
        :obj:`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary
1037
        mapping special token class attributes (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values
1038
        (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
1039

1040
        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
1041
        how special tokens are tokenized.
1042
        """
1043
        set_attr = {}
1044
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
1045
            attr_value = getattr(self, "_" + attr)
1046
            if attr_value:
1047
                set_attr[attr] = attr_value
1048
        return set_attr
1049

1050
    @property
1051
    def all_special_tokens(self) -> List[str]:
1052
        """
1053
        :obj:`List[str]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.
1054

1055
        Convert tokens of :obj:`tokenizers.AddedToken` type to string.
1056
        """
1057
        all_toks = [str(s) for s in self.all_special_tokens_extended]
1058
        return all_toks
1059

1060
    @property
1061
    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
1062
        """
1063
        :obj:`List[Union[str, tokenizers.AddedToken]]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.)
1064
        mapped to class attributes.
1065

1066
        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
1067
        how special tokens are tokenized.
1068
        """
1069
        all_toks = []
1070
        set_attr = self.special_tokens_map_extended
1071
        for attr_value in set_attr.values():
1072
            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
1073
        all_toks = list(set(all_toks))
1074
        return all_toks
1075

1076
    @property
1077
    def all_special_ids(self) -> List[int]:
1078
        """
1079
        :obj:`List[int]`: List the ids of the special tokens(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class
1080
        attributes.
1081
        """
1082
        all_toks = self.all_special_tokens
1083
        all_ids = self.convert_tokens_to_ids(all_toks)
1084
        return all_ids
1085

1086

1087
ENCODE_KWARGS_DOCSTRING = r"""
1088
            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
1089
                Whether or not to encode the sequences with the special tokens relative to their model.
1090
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
1091
                Activates and controls padding. Accepts the following values:
1092

1093
                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
1094
                  single sequence if provided).
1095
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
1096
                  maximum acceptable input length for the model if that argument is not provided.
1097
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
1098
                  different lengths).
1099
            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
1100
                Activates and controls truncation. Accepts the following values:
1101

1102
                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
1103
                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
1104
                  provided. This will truncate token by token, removing a token from the longest sequence in the pair
1105
                  if a pair of sequences (or a batch of pairs) is provided.
1106
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
1107
                  the maximum acceptable input length for the model if that argument is not provided. This will only
1108
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
1109
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
1110
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
1111
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
1112
                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
1113
                  sequence lengths greater than the model maximum admissible input size).
1114
            max_length (:obj:`int`, `optional`):
1115
                Controls the maximum length to use by one of the truncation/padding parameters.
1116

1117
                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
1118
                length is required by one of the truncation/padding parameters. If the model has no specific maximum
1119
                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
1120
            stride (:obj:`int`, `optional`, defaults to 0):
1121
                If set to a number along with :obj:`max_length`, the overflowing tokens returned when
1122
                :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
1123
                returned to provide some overlap between truncated and overflowing sequences. The value of this
1124
                argument defines the number of overlapping tokens.
1125
            is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
1126
                Whether or not the input is already tokenized.
1127
            pad_to_multiple_of (:obj:`int`, `optional`):
1128
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
1129
                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
1130
            return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
1131
                If set, will return tensors instead of list of python integers. Acceptable values are:
1132

1133
                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
1134
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
1135
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
1136
"""
1137

1138
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
1139
            return_token_type_ids (:obj:`bool`, `optional`):
1140
                Whether to return token type IDs. If left to the default, will return the token type IDs according
1141
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
1142

1143
                `What are token type IDs? <../glossary.html#token-type-ids>`__
1144
            return_attention_mask (:obj:`bool`, `optional`):
1145
                Whether to return the attention mask. If left to the default, will return the attention mask according
1146
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
1147

1148
                `What are attention masks? <../glossary.html#attention-mask>`__
1149
            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
1150
                Whether or not to return overflowing token sequences.
1151
            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
1152
                Wheter or not to return special tokens mask information.
1153
            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
1154
                Whether or not to return :obj:`(char_start, char_end)` for each token.
1155

1156
                This is only available on fast tokenizers inheriting from
1157
                :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise
1158
                :obj:`NotImplementedError`.
1159
            return_length  (:obj:`bool`, `optional`, defaults to :obj:`False`):
1160
                Whether or not to return the lengths of the encoded inputs.
1161
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
1162
                Whether or not to print informations and warnings.
1163
            **kwargs: passed to the :obj:`self.tokenize()` method
1164

1165
        Return:
1166
            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
1167

1168
            - **input_ids** -- List of token ids to be fed to a model.
1169

1170
              `What are input IDs? <../glossary.html#input-ids>`__
1171
            - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
1172
              or if `"token_type_ids"` is in :obj:`self.model_input_names`).
1173

1174
              `What are token type IDs? <../glossary.html#token-type-ids>`__
1175
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
1176
              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
1177

1178
              `What are attention masks? <../glossary.html#attention-mask>`__
1179
            - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
1180
              :obj:`return_overflowing_tokens=True`).
1181
            - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
1182
              :obj:`return_overflowing_tokens=True`).
1183
            - **special_tokens_mask** -- List of 0s and 1s, with 0 specifying added special tokens and 1 specifying
1184
              regual sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
1185
            - **length** -- The length of the inputs (when :obj:`return_length=True`)
1186
"""
1187

1188
INIT_TOKENIZER_DOCSTRING = r"""
1189
    Class attributes (overridden by derived classes)
1190
        - **vocab_files_names** (:obj:`Dict[str, str]`) -- A ditionary with, as keys, the ``__init__`` keyword name of
1191
          each vocabulary file required by the model, and as associated values, the filename for saving the associated
1192
          file (string).
1193
        - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
1194
          high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the
1195
          low-level being the :obj:`short-cut-names` of the pretrained models with, as associated values, the
1196
          :obj:`url` to the associated pretrained vocabulary file.
1197
        - **max_model_input_sizes** (:obj:`Dict[str, Optinal[int]]`) -- A dictionary with, as keys, the
1198
          :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
1199
          inputs of this model, or :obj:`None` if the model has no maximum input size.
1200
        - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
1201
          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionnary of specific
1202
          arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
1203
          tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
1204
          method.
1205
        - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
1206
        - **padding_side** (:obj:`str`) -- The default value for the side on which the model should have padding
1207
          applied. Should be :obj:`'right'` or :obj:`'left'`.
1208

1209
    Args:
1210
        model_max_length (:obj:`int`, `optional`):
1211
            The maximum length (in number of tokens) for the inputs to the transformer model.
1212
            When the tokenizer is loaded with
1213
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this will be set to
1214
            the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is
1215
            provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
1216
        padding_side: (:obj:`str`, `optional`):
1217
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
1218
            Default value is picked from the class attribute of the same name.
1219
        model_input_names (:obj:`List[string]`, `optional`):
1220
            The list of inputs accepted by the forward pass of the model (like :obj:`"token_type_ids"` or
1221
            :obj:`"attention_mask"`). Default value is picked from the class attribute of the same name.
1222
        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
1223
            A special token representing the beginning of a sentence. Will be associated to ``self.bos_token`` and
1224
            ``self.bos_token_id``.
1225
        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
1226
            A special token representing the end of a sentence. Will be associated to ``self.eos_token`` and
1227
            ``self.eos_token_id``.
1228
        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
1229
            A special token representing an out-of-vocabulary token. Will be associated to ``self.unk_token`` and
1230
            ``self.unk_token_id``.
1231
        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
1232
            A special token separating two different sentences in the same input (used by BERT for instance). Will be
1233
            associated to ``self.sep_token`` and ``self.sep_token_id``.
1234
        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
1235
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
1236
            attention mechanisms or loss computation. Will be associated to ``self.pad_token`` and
1237
            ``self.pad_token_id``.
1238
        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
1239
            A special token representing the class of the input (used by BERT for instance). Will be associated to
1240
            ``self.cls_token`` and ``self.cls_token_id``.
1241
        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
1242
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
1243
            BERT). Will be associated to ``self.mask_token`` and ``self.mask_token_id``.
1244
        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
1245
            A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
1246
            tokenization process. Will be associated to ``self.additional_special_tokens`` and
1247
            ``self.additional_special_tokens_ids``.
1248
"""
1249

1250

1251
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
1252
class PreTrainedTokenizerBase(SpecialTokensMixin):
1253
    """
1254
    Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`.
1255

1256
    Handles shared (mostly boiler plate) methods for those two classes.
1257
    """
1258

1259
    vocab_files_names: Dict[str, str] = {}
1260
    pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
1261
    pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
1262
    max_model_input_sizes: Dict[str, Optional[int]] = {}
1263
    model_input_names: List[str] = ["token_type_ids", "attention_mask"]
1264
    padding_side: str = "right"
1265

1266
    def __init__(self, **kwargs):
1267
        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
1268
        self.init_inputs = ()
1269
        self.init_kwargs = kwargs
1270

1271
        # For backward compatibility we fallback to set model_max_length from max_len if provided
1272
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
1273
        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER
1274

1275
        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
1276
        self.padding_side = kwargs.pop("padding_side", self.padding_side)
1277
        assert self.padding_side in [
1278
            "right",
1279
            "left",
1280
        ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
1281
        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
1282

1283
        super().__init__(**kwargs)
1284

1285
    @property
1286
    def max_len(self) -> int:
1287
        """
1288
        :obj:`int`: **Deprecated** Kept here for backward compatibility. Now renamed to :obj:`model_max_length` to
1289
        avoid ambiguity.
1290
        """
1291
        warnings.warn(
1292
            "The `max_len` attribute has been deprecated and will be removed in a future version, use `model_max_length` instead.",
1293
            FutureWarning,
1294
        )
1295
        return self.model_max_length
1296

1297
    @property
1298
    def max_len_single_sentence(self) -> int:
1299
        """
1300
        :obj:`int`: The maximum length of a sentence that can be fed to the model.
1301
        """
1302
        return self.model_max_length - self.num_special_tokens_to_add(pair=False)
1303

1304
    @property
1305
    def max_len_sentences_pair(self) -> int:
1306
        """
1307
        :obj:`int`: The maximum combined length of a pair of sentences that can be fed to the model.
1308
        """
1309
        return self.model_max_length - self.num_special_tokens_to_add(pair=True)
1310

1311
    @max_len_single_sentence.setter
1312
    def max_len_single_sentence(self, value) -> int:
1313
        # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
1314
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
1315
            logger.warning(
1316
                "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
1317
            )
1318
        else:
1319
            raise ValueError(
1320
                "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
1321
            )
1322

1323
    @max_len_sentences_pair.setter
1324
    def max_len_sentences_pair(self, value) -> int:
1325
        # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
1326
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
1327
            logger.warning(
1328
                "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
1329
            )
1330
        else:
1331
            raise ValueError(
1332
                "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
1333
            )
1334

1335
    @classmethod
1336
    def from_pretrained(cls, *inputs, **kwargs):
1337
        r"""
1338
        Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
1339
        a predefined tokenizer.
1340

1341
        Args:
1342
            pretrained_model_name_or_path (:obj:`str`):
1343
                Can be either:
1344

1345
                - A string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.,
1346
                  ``bert-base-uncased``.
1347
                - A string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.,
1348
                  ``dbmdz/bert-base-german-cased``.
1349
                - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
1350
                  using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`
1351
                  method, e.g., ``./my_model_directory/``.
1352
                - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
1353
                  file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
1354
                  ``./my_model_directory/vocab.txt``.
1355
            cache_dir (:obj:`str`, `optional`):
1356
                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
1357
                standard cache should not be used.
1358
            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
1359
                Whether or not to force the (re-)download the vocabulary files and override the cached versions if they
1360
                exist.
1361
            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
1362
                Whether or not to delete incompletely received files. Attempt to resume the download if such a file
1363
                exists.
1364
            proxies (:obj:`Dict[str, str], `optional`):
1365
                A dictionary of proxy servers to use by protocol or endpoint, e.g.,
1366
                :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
1367
                request.
1368
            inputs (additional positional arguments, `optional`):
1369
                Will be passed along to the Tokenizer ``__init__`` method.
1370
            kwargs (additional keyword arguments, `optional`):
1371
                Will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like
1372
                ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
1373
                ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__`` for more details.
1374

1375
        Examples::
1376

1377
            # We can't instantiate directly the base class `PreTrainedTokenizerBase` so let's show our examples on a derived class: BertTokenizer
1378
            # Download vocabulary from S3 and cache.
1379
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1380

1381
            # Download vocabulary from S3 (user-uploaded) and cache.
1382
            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
1383

1384
            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
1385
            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
1386

1387
            # If the tokenizer uses a single vocabulary file, you can point directly to this file
1388
            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
1389

1390
            # You can link tokens to special vocabulary when instantiating
1391
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
1392
            # You should be sure '<unk>' is in the vocabulary when doing that.
1393
            # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
1394
            assert tokenizer.unk_token == '<unk>'
1395

1396
        """
1397
        return cls._from_pretrained(*inputs, **kwargs)
1398

1399
    @classmethod
1400
    def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
1401
        cache_dir = kwargs.pop("cache_dir", None)
1402
        force_download = kwargs.pop("force_download", False)
1403
        resume_download = kwargs.pop("resume_download", False)
1404
        proxies = kwargs.pop("proxies", None)
1405
        local_files_only = kwargs.pop("local_files_only", False)
1406

1407
        s3_models = list(cls.max_model_input_sizes.keys())
1408
        vocab_files = {}
1409
        init_configuration = {}
1410
        if pretrained_model_name_or_path in s3_models:
1411
            # Get the vocabulary from AWS S3 bucket
1412
            for file_id, map_list in cls.pretrained_vocab_files_map.items():
1413
                vocab_files[file_id] = map_list[pretrained_model_name_or_path]
1414
            if (
1415
                cls.pretrained_init_configuration
1416
                and pretrained_model_name_or_path in cls.pretrained_init_configuration
1417
            ):
1418
                init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
1419
        else:
1420
            # Get the vocabulary from local files
1421
            logger.info(
1422
                "Model name '{}' not found in model shortcut name list ({}). "
1423
                "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format(
1424
                    pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
1425
                )
1426
            )
1427

1428
            if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
1429
                if len(cls.vocab_files_names) > 1:
1430
                    raise ValueError(
1431
                        "Calling {}.from_pretrained() with the path to a single file or url is not supported."
1432
                        "Use a model identifier or the path to a directory instead.".format(cls.__name__)
1433
                    )
1434
                logger.warning(
1435
                    "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format(
1436
                        cls.__name__
1437
                    )
1438
                )
1439
                file_id = list(cls.vocab_files_names.keys())[0]
1440
                vocab_files[file_id] = pretrained_model_name_or_path
1441
            else:
1442
                # At this point pretrained_model_name_or_path is either a directory or a model identifier name
1443
                additional_files_names = {
1444
                    "added_tokens_file": ADDED_TOKENS_FILE,
1445
                    "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
1446
                    "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
1447
                    "full_tokenizer_file": FULL_TOKENIZER_FILE,
1448
                }
1449
                # Look for the tokenizer files
1450
                for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
1451
                    if os.path.isdir(pretrained_model_name_or_path):
1452
                        full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
1453
                        if not os.path.exists(full_file_name):
1454
                            logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
1455
                            full_file_name = None
1456
                    else:
1457
                        full_file_name = hf_bucket_url(
1458
                            pretrained_model_name_or_path, filename=file_name, use_cdn=False
1459
                        )
1460

1461
                    vocab_files[file_id] = full_file_name
1462

1463
        # Get files from url, cache, or disk depending on the case
1464
        try:
1465
            resolved_vocab_files = {}
1466
            for file_id, file_path in vocab_files.items():
1467
                if file_path is None:
1468
                    resolved_vocab_files[file_id] = None
1469
                else:
1470
                    resolved_vocab_files[file_id] = cached_path(
1471
                        file_path,
1472
                        cache_dir=cache_dir,
1473
                        force_download=force_download,
1474
                        proxies=proxies,
1475
                        resume_download=resume_download,
1476
                        local_files_only=local_files_only,
1477
                    )
1478
        except EnvironmentError:
1479
            if pretrained_model_name_or_path in s3_models:
1480
                msg = "Couldn't reach server at '{}' to download vocabulary files."
1481
            else:
1482
                msg = (
1483
                    "Model name '{}' was not found in tokenizers model name list ({}). "
1484
                    "We assumed '{}' was a path or url to a directory containing vocabulary files "
1485
                    "named {}, but couldn't find such vocabulary files at this path or url.".format(
1486
                        pretrained_model_name_or_path,
1487
                        ", ".join(s3_models),
1488
                        pretrained_model_name_or_path,
1489
                        list(cls.vocab_files_names.values()),
1490
                    )
1491
                )
1492

1493
            raise EnvironmentError(msg)
1494

1495
        if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
1496
            raise EnvironmentError(
1497
                "Model name '{}' was not found in tokenizers model name list ({}). "
1498
                "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files "
1499
                "named {} but couldn't find such vocabulary files at this path or url.".format(
1500
                    pretrained_model_name_or_path,
1501
                    ", ".join(s3_models),
1502
                    pretrained_model_name_or_path,
1503
                    list(cls.vocab_files_names.values()),
1504
                )
1505
            )
1506

1507
        for file_id, file_path in vocab_files.items():
1508
            if file_path == resolved_vocab_files[file_id]:
1509
                logger.info("loading file {}".format(file_path))
1510
            else:
1511
                logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
1512

1513
        # Prepare tokenizer initialization kwargs
1514
        # Did we saved some inputs and kwargs to reload ?
1515
        tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
1516
        if tokenizer_config_file is not None:
1517
            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
1518
                init_kwargs = json.load(tokenizer_config_handle)
1519
            saved_init_inputs = init_kwargs.pop("init_inputs", ())
1520
            if not init_inputs:
1521
                init_inputs = saved_init_inputs
1522
        else:
1523
            init_kwargs = init_configuration
1524

1525
        # Update with newly provided kwargs
1526
        init_kwargs.update(kwargs)
1527

1528
        # Set max length if needed
1529
        if pretrained_model_name_or_path in cls.max_model_input_sizes:
1530
            # if we're using a pretrained model, ensure the tokenizer
1531
            # wont index sequences longer than the number of positional embeddings
1532
            model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
1533
            if model_max_length is not None and isinstance(model_max_length, (int, float)):
1534
                init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
1535

1536
        # Merge resolved_vocab_files arguments in init_kwargs.
1537
        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
1538
        for args_name, file_path in resolved_vocab_files.items():
1539
            if args_name not in init_kwargs:
1540
                init_kwargs[args_name] = file_path
1541

1542
        # Instantiate tokenizer.
1543
        try:
1544
            tokenizer = cls(*init_inputs, **init_kwargs)
1545
        except OSError:
1546
            raise OSError(
1547
                "Unable to load vocabulary from file. "
1548
                "Please check that the provided vocabulary is accessible and not corrupted."
1549
            )
1550

1551
        # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
1552
        tokenizer.init_inputs = init_inputs
1553
        tokenizer.init_kwargs = init_kwargs
1554

1555
        # If there is a complementary special token map, load it
1556
        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
1557
        if special_tokens_map_file is not None:
1558
            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
1559
                special_tokens_map = json.load(special_tokens_map_handle)
1560

1561
            for key, value in special_tokens_map.items():
1562
                if isinstance(value, dict):
1563
                    value = AddedToken(**value)
1564
                setattr(tokenizer, key, value)
1565

1566
        # Add supplementary tokens.
1567
        special_tokens = tokenizer.all_special_tokens
1568
        if added_tokens_file is not None:
1569
            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
1570
                added_tok_encoder = json.load(added_tokens_handle)
1571

1572
            # Sort added tokens by index
1573
            added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
1574

1575
            for token, index in added_tok_encoder_sorted:
1576
                assert index == len(tokenizer), (
1577
                    f"Non-consecutive added token '{token}' found. "
1578
                    f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
1579
                )
1580
                tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))
1581

1582
        # Check all our special tokens are registrered as "no split" token (we don't cut them) and are in the vocab
1583
        added_tokens = tokenizer.sanitize_special_tokens()
1584
        if added_tokens:
1585
            logger.warning(
1586
                "Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained."
1587
            )
1588

1589
        return tokenizer
1590

1591
    def save_pretrained(self, save_directory: str) -> Tuple[str]:
1592
        """
1593
        Save the tokenizer vocabulary files together with:
1594

1595
            - added tokens,
1596
            - special tokens to class attributes mapping,
1597
            - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
1598

1599
        This method make sure the full tokenizer can then be re-loaded using the
1600
        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` class method.
1601

1602
        .. Warning::
1603
           This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
1604
           modifying :obj:`tokenizer.do_lower_case` after creation).
1605

1606
        Args:
1607
            save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
1608

1609
        Returns:
1610
            A tuple of :obj:`str`: The files saved.
1611
        """
1612
        if os.path.isfile(save_directory):
1613
            logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
1614
            return
1615
        os.makedirs(save_directory, exist_ok=True)
1616

1617
        special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
1618
        added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
1619
        tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
1620

1621
        tokenizer_config = copy.deepcopy(self.init_kwargs)
1622
        if len(self.init_inputs) > 0:
1623
            tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
1624
        for file_id in self.vocab_files_names.keys():
1625
            tokenizer_config.pop(file_id, None)
1626

1627
        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
1628
            f.write(json.dumps(tokenizer_config, ensure_ascii=False))
1629

1630
        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
1631
            write_dict = {}
1632
            for key, value in self.special_tokens_map_extended.items():
1633
                if isinstance(value, AddedToken):
1634
                    write_dict[key] = value.__getstate__()
1635
                else:
1636
                    write_dict[key] = value
1637
            f.write(json.dumps(write_dict, ensure_ascii=False))
1638

1639
        added_vocab = self.get_added_vocab()
1640
        if added_vocab:
1641
            with open(added_tokens_file, "w", encoding="utf-8") as f:
1642
                out_str = json.dumps(added_vocab, ensure_ascii=False)
1643
                f.write(out_str)
1644

1645
        vocab_files = self.save_vocabulary(save_directory)
1646

1647
        return vocab_files + (special_tokens_map_file, added_tokens_file)
1648

1649
    @add_end_docstrings(
1650
        ENCODE_KWARGS_DOCSTRING,
1651
        """
1652
            **kwargs: Passed along to the `.tokenize()` method.
1653
        """,
1654
        """
1655
        Returns:
1656
            :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`:
1657
            The tokenized ids of the text.
1658
        """,
1659
    )
1660
    def encode(
1661
        self,
1662
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
1663
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
1664
        add_special_tokens: bool = True,
1665
        padding: Union[bool, str, PaddingStrategy] = False,
1666
        truncation: Union[bool, str, TruncationStrategy] = False,
1667
        max_length: Optional[int] = None,
1668
        stride: int = 0,
1669
        return_tensors: Optional[Union[str, TensorType]] = None,
1670
        **kwargs
1671
    ) -> List[int]:
1672
        """
1673
        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
1674

1675
        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
1676

1677
        Args:
1678
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
1679
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
1680
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
1681
                ``convert_tokens_to_ids`` method).
1682
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
1683
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
1684
                string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
1685
                ``convert_tokens_to_ids`` method).
1686
        """
1687
        encoded_inputs = self.encode_plus(
1688
            text,
1689
            text_pair=text_pair,
1690
            add_special_tokens=add_special_tokens,
1691
            padding=padding,
1692
            truncation=truncation,
1693
            max_length=max_length,
1694
            stride=stride,
1695
            return_tensors=return_tensors,
1696
            **kwargs,
1697
        )
1698

1699
        return encoded_inputs["input_ids"]
1700

1701
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
1702
        raise NotImplementedError
1703

1704
    def _get_padding_truncation_strategies(
1705
        self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
1706
    ):
1707
        """
1708
        Find the correct padding/truncation strategy with backward compatibility
1709
        for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
1710
        """
1711
        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
1712
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)
1713

1714
        # Backward compatibility for previous behavior, maybe we should deprecate it:
1715
        # If you only set max_length, it activates truncation for max_length
1716
        if max_length is not None and padding is False and truncation is False:
1717
            if verbose:
1718
                logger.warning(
1719
                    "Truncation was not explicitely activated but `max_length` is provided a specific value, "
1720
                    "please use `truncation=True` to explicitely truncate examples to max length. "
1721
                    "Defaulting to 'longest_first' truncation strategy. "
1722
                    "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
1723
                    "more precisely by providing a specific strategy to `truncation`."
1724
                )
1725
            truncation = "longest_first"
1726

1727
        # Get padding strategy
1728
        if padding is False and old_pad_to_max_length:
1729
            if verbose:
1730
                warnings.warn(
1731
                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
1732
                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
1733
                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
1734
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
1735
                    "maximal input size of the model (e.g. 512 for Bert).",
1736
                    FutureWarning,
1737
                )
1738
            if max_length is None:
1739
                padding_strategy = PaddingStrategy.LONGEST
1740
            else:
1741
                padding_strategy = PaddingStrategy.MAX_LENGTH
1742
        elif padding is not False:
1743
            if padding is True:
1744
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
1745
            elif not isinstance(padding, PaddingStrategy):
1746
                padding_strategy = PaddingStrategy(padding)
1747
        else:
1748
            padding_strategy = PaddingStrategy.DO_NOT_PAD
1749

1750
        # Get truncation strategy
1751
        if truncation is False and old_truncation_strategy != "do_not_truncate":
1752
            if verbose:
1753
                warnings.warn(
1754
                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
1755
                    "use `truncation=True` to truncate examples to a max length. You can give a specific "
1756
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
1757
                    "maximal input size of the model (e.g. 512 for Bert). "
1758
                    " If you have pairs of inputs, you can give a specific truncation strategy selected among "
1759
                    "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
1760
                    "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
1761
                    "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
1762
                    FutureWarning,
1763
                )
1764
            truncation_strategy = TruncationStrategy(old_truncation_strategy)
1765
        elif truncation is not False:
1766
            if truncation is True:
1767
                truncation_strategy = (
1768
                    TruncationStrategy.LONGEST_FIRST
1769
                )  # Default to truncate the longest sequences in pairs of inputs
1770
            elif not isinstance(truncation, TruncationStrategy):
1771
                truncation_strategy = TruncationStrategy(truncation)
1772
        else:
1773
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1774

1775
        # Set max length if needed
1776
        if max_length is None:
1777
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
1778
                if self.model_max_length > LARGE_INTEGER:
1779
                    if verbose:
1780
                        logger.warning(
1781
                            "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
1782
                            "Default to no padding."
1783
                        )
1784
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
1785
                else:
1786
                    max_length = self.model_max_length
1787

1788
            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
1789
                if self.model_max_length > LARGE_INTEGER:
1790
                    if verbose:
1791
                        logger.warning(
1792
                            "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
1793
                            "Default to no truncation."
1794
                        )
1795
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1796
                else:
1797
                    max_length = self.model_max_length
1798

1799
        # Test if we have a padding token
1800
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
1801
            raise ValueError(
1802
                "Asking to pad but the tokenizer does not have a padding token. "
1803
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
1804
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
1805
            )
1806

1807
        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
1808
        if (
1809
            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
1810
            and padding_strategy != PaddingStrategy.DO_NOT_PAD
1811
            and pad_to_multiple_of is not None
1812
            and max_length is not None
1813
            and (max_length % pad_to_multiple_of != 0)
1814
        ):
1815
            raise ValueError(
1816
                f"Truncation and padding are both activated but "
1817
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
1818
            )
1819

1820
        return padding_strategy, truncation_strategy, max_length, kwargs
1821

1822
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
1823
    def __call__(
1824
        self,
1825
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
1826
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
1827
        add_special_tokens: bool = True,
1828
        padding: Union[bool, str, PaddingStrategy] = False,
1829
        truncation: Union[bool, str, TruncationStrategy] = False,
1830
        max_length: Optional[int] = None,
1831
        stride: int = 0,
1832
        is_pretokenized: bool = False,
1833
        pad_to_multiple_of: Optional[int] = None,
1834
        return_tensors: Optional[Union[str, TensorType]] = None,
1835
        return_token_type_ids: Optional[bool] = None,
1836
        return_attention_mask: Optional[bool] = None,
1837
        return_overflowing_tokens: bool = False,
1838
        return_special_tokens_mask: bool = False,
1839
        return_offsets_mapping: bool = False,
1840
        return_length: bool = False,
1841
        verbose: bool = True,
1842
        **kwargs
1843
    ) -> BatchEncoding:
1844
        """
1845
        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
1846
        sequences.
1847

1848
        Args:
1849
            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
1850
                The sequence or batch of sequences to be encoded.
1851
                Each sequence can be a string or a list of strings (pretokenized string).
1852
                If the sequences are provided as list of strings (pretokenized), you must set
1853
                :obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
1854
            text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
1855
                The sequence or batch of sequences to be encoded.
1856
                Each sequence can be a string or a list of strings (pretokenized string).
1857
                If the sequences are provided as list of strings (pretokenized), you must set
1858
                :obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
1859
        """
1860
        # Input type checking for clearer error
1861
        assert isinstance(text, str) or (
1862
            isinstance(text, (list, tuple))
1863
            and (
1864
                len(text) == 0
1865
                or (
1866
                    isinstance(text[0], str)
1867
                    or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
1868
                )
1869
            )
1870
        ), (
1871
            "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
1872
            "or `List[List[str]]` (batch of pretokenized examples)."
1873
        )
1874

1875
        assert (
1876
            text_pair is None
1877
            or isinstance(text_pair, str)
1878
            or (
1879
                isinstance(text_pair, (list, tuple))
1880
                and (
1881
                    len(text_pair) == 0
1882
                    or (
1883
                        isinstance(text_pair[0], str)
1884
                        or (
1885
                            isinstance(text_pair[0], (list, tuple))
1886
                            and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
1887
                        )
1888
                    )
1889
                )
1890
            )
1891
        ), (
1892
            "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
1893
            "or `List[List[str]]` (batch of pretokenized examples)."
1894
        )
1895

1896
        is_batched = bool(
1897
            (not is_pretokenized and isinstance(text, (list, tuple)))
1898
            or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))
1899
        )
1900

1901
        if is_batched:
1902
            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
1903
            return self.batch_encode_plus(
1904
                batch_text_or_text_pairs=batch_text_or_text_pairs,
1905
                add_special_tokens=add_special_tokens,
1906
                padding=padding,
1907
                truncation=truncation,
1908
                max_length=max_length,
1909
                stride=stride,
1910
                is_pretokenized=is_pretokenized,
1911
                pad_to_multiple_of=pad_to_multiple_of,
1912
                return_tensors=return_tensors,
1913
                return_token_type_ids=return_token_type_ids,
1914
                return_attention_mask=return_attention_mask,
1915
                return_overflowing_tokens=return_overflowing_tokens,
1916
                return_special_tokens_mask=return_special_tokens_mask,
1917
                return_offsets_mapping=return_offsets_mapping,
1918
                return_length=return_length,
1919
                verbose=verbose,
1920
                **kwargs,
1921
            )
1922
        else:
1923
            return self.encode_plus(
1924
                text=text,
1925
                text_pair=text_pair,
1926
                add_special_tokens=add_special_tokens,
1927
                padding=padding,
1928
                truncation=truncation,
1929
                max_length=max_length,
1930
                stride=stride,
1931
                is_pretokenized=is_pretokenized,
1932
                pad_to_multiple_of=pad_to_multiple_of,
1933
                return_tensors=return_tensors,
1934
                return_token_type_ids=return_token_type_ids,
1935
                return_attention_mask=return_attention_mask,
1936
                return_overflowing_tokens=return_overflowing_tokens,
1937
                return_special_tokens_mask=return_special_tokens_mask,
1938
                return_offsets_mapping=return_offsets_mapping,
1939
                return_length=return_length,
1940
                verbose=verbose,
1941
                **kwargs,
1942
            )
1943

1944
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
1945
    def encode_plus(
1946
        self,
1947
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
1948
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
1949
        add_special_tokens: bool = True,
1950
        padding: Union[bool, str, PaddingStrategy] = False,
1951
        truncation: Union[bool, str, TruncationStrategy] = False,
1952
        max_length: Optional[int] = None,
1953
        stride: int = 0,
1954
        is_pretokenized: bool = False,
1955
        pad_to_multiple_of: Optional[int] = None,
1956
        return_tensors: Optional[Union[str, TensorType]] = None,
1957
        return_token_type_ids: Optional[bool] = None,
1958
        return_attention_mask: Optional[bool] = None,
1959
        return_overflowing_tokens: bool = False,
1960
        return_special_tokens_mask: bool = False,
1961
        return_offsets_mapping: bool = False,
1962
        return_length: bool = False,
1963
        verbose: bool = True,
1964
        **kwargs
1965
    ) -> BatchEncoding:
1966
        """
1967
        Tokenize and prepare for the model a sequence or a pair of sequences.
1968

1969
        .. warning::
1970
            This method is deprecated, ``__call__`` should be used instead.
1971

1972
        Args:
1973
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
1974
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
1975
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
1976
                ``convert_tokens_to_ids`` method).
1977
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
1978
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
1979
                string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
1980
                ``convert_tokens_to_ids`` method).
1981
        """
1982

1983
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
1984
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
1985
            padding=padding,
1986
            truncation=truncation,
1987
            max_length=max_length,
1988
            pad_to_multiple_of=pad_to_multiple_of,
1989
            verbose=verbose,
1990
            **kwargs,
1991
        )
1992

1993
        return self._encode_plus(
1994
            text=text,
1995
            text_pair=text_pair,
1996
            add_special_tokens=add_special_tokens,
1997
            padding_strategy=padding_strategy,
1998
            truncation_strategy=truncation_strategy,
1999
            max_length=max_length,
2000
            stride=stride,
2001
            is_pretokenized=is_pretokenized,
2002
            pad_to_multiple_of=pad_to_multiple_of,
2003
            return_tensors=return_tensors,
2004
            return_token_type_ids=return_token_type_ids,
2005
            return_attention_mask=return_attention_mask,
2006
            return_overflowing_tokens=return_overflowing_tokens,
2007
            return_special_tokens_mask=return_special_tokens_mask,
2008
            return_offsets_mapping=return_offsets_mapping,
2009
            return_length=return_length,
2010
            verbose=verbose,
2011
            **kwargs,
2012
        )
2013

2014
    def _encode_plus(
2015
        self,
2016
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
2017
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
2018
        add_special_tokens: bool = True,
2019
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2020
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
2021
        max_length: Optional[int] = None,
2022
        stride: int = 0,
2023
        is_pretokenized: bool = False,
2024
        pad_to_multiple_of: Optional[int] = None,
2025
        return_tensors: Optional[Union[str, TensorType]] = None,
2026
        return_token_type_ids: Optional[bool] = None,
2027
        return_attention_mask: Optional[bool] = None,
2028
        return_overflowing_tokens: bool = False,
2029
        return_special_tokens_mask: bool = False,
2030
        return_offsets_mapping: bool = False,
2031
        return_length: bool = False,
2032
        verbose: bool = True,
2033
        **kwargs
2034
    ) -> BatchEncoding:
2035
        raise NotImplementedError
2036

2037
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
2038
    def batch_encode_plus(
2039
        self,
2040
        batch_text_or_text_pairs: Union[
2041
            List[TextInput],
2042
            List[TextInputPair],
2043
            List[PreTokenizedInput],
2044
            List[PreTokenizedInputPair],
2045
            List[EncodedInput],
2046
            List[EncodedInputPair],
2047
        ],
2048
        add_special_tokens: bool = True,
2049
        padding: Union[bool, str, PaddingStrategy] = False,
2050
        truncation: Union[bool, str, TruncationStrategy] = False,
2051
        max_length: Optional[int] = None,
2052
        stride: int = 0,
2053
        is_pretokenized: bool = False,
2054
        pad_to_multiple_of: Optional[int] = None,
2055
        return_tensors: Optional[Union[str, TensorType]] = None,
2056
        return_token_type_ids: Optional[bool] = None,
2057
        return_attention_mask: Optional[bool] = None,
2058
        return_overflowing_tokens: bool = False,
2059
        return_special_tokens_mask: bool = False,
2060
        return_offsets_mapping: bool = False,
2061
        return_length: bool = False,
2062
        verbose: bool = True,
2063
        **kwargs
2064
    ) -> BatchEncoding:
2065
        """
2066
        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
2067

2068
        .. warning::
2069
            This method is deprecated, ``__call__`` should be used instead.
2070

2071
        Args:
2072
            batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
2073
                Batch of sequences or pair of sequences to be encoded.
2074
                This can be a list of string/string-sequences/int-sequences or a list of pair of
2075
                string/string-sequences/int-sequence (see details in ``encode_plus``).
2076
        """
2077

2078
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2079
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
2080
            padding=padding,
2081
            truncation=truncation,
2082
            max_length=max_length,
2083
            pad_to_multiple_of=pad_to_multiple_of,
2084
            verbose=verbose,
2085
            **kwargs,
2086
        )
2087

2088
        return self._batch_encode_plus(
2089
            batch_text_or_text_pairs=batch_text_or_text_pairs,
2090
            add_special_tokens=add_special_tokens,
2091
            padding_strategy=padding_strategy,
2092
            truncation_strategy=truncation_strategy,
2093
            max_length=max_length,
2094
            stride=stride,
2095
            is_pretokenized=is_pretokenized,
2096
            pad_to_multiple_of=pad_to_multiple_of,
2097
            return_tensors=return_tensors,
2098
            return_token_type_ids=return_token_type_ids,
2099
            return_attention_mask=return_attention_mask,
2100
            return_overflowing_tokens=return_overflowing_tokens,
2101
            return_special_tokens_mask=return_special_tokens_mask,
2102
            return_offsets_mapping=return_offsets_mapping,
2103
            return_length=return_length,
2104
            verbose=verbose,
2105
            **kwargs,
2106
        )
2107

2108
    def _batch_encode_plus(
2109
        self,
2110
        batch_text_or_text_pairs: Union[
2111
            List[TextInput],
2112
            List[TextInputPair],
2113
            List[PreTokenizedInput],
2114
            List[PreTokenizedInputPair],
2115
            List[EncodedInput],
2116
            List[EncodedInputPair],
2117
        ],
2118
        add_special_tokens: bool = True,
2119
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2120
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
2121
        max_length: Optional[int] = None,
2122
        stride: int = 0,
2123
        is_pretokenized: bool = False,
2124
        pad_to_multiple_of: Optional[int] = None,
2125
        return_tensors: Optional[Union[str, TensorType]] = None,
2126
        return_token_type_ids: Optional[bool] = None,
2127
        return_attention_mask: Optional[bool] = None,
2128
        return_overflowing_tokens: bool = False,
2129
        return_special_tokens_mask: bool = False,
2130
        return_offsets_mapping: bool = False,
2131
        return_length: bool = False,
2132
        verbose: bool = True,
2133
        **kwargs
2134
    ) -> BatchEncoding:
2135
        raise NotImplementedError
2136

2137
    def pad(
2138
        self,
2139
        encoded_inputs: Union[
2140
            BatchEncoding,
2141
            List[BatchEncoding],
2142
            Dict[str, EncodedInput],
2143
            Dict[str, List[EncodedInput]],
2144
            List[Dict[str, EncodedInput]],
2145
        ],
2146
        padding: Union[bool, str, PaddingStrategy] = True,
2147
        max_length: Optional[int] = None,
2148
        pad_to_multiple_of: Optional[int] = None,
2149
        return_attention_mask: Optional[bool] = None,
2150
        return_tensors: Optional[Union[str, TensorType]] = None,
2151
        verbose: bool = True,
2152
    ) -> BatchEncoding:
2153
        """
2154
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
2155
        in the batch.
2156

2157
        Padding side (left/right) padding token ids are defined at the tokenizer level
2158
        (with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)
2159

2160
        Args:
2161
            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
2162
                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
2163
                :obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
2164
                :class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so
2165
                you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.
2166
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
2167
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
2168
                 index) among:
2169

2170
                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
2171
                  single sequence if provided).
2172
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
2173
                  maximum acceptable input length for the model if that argument is not provided.
2174
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
2175
                  different lengths).
2176
            max_length (:obj:`int`, `optional`):
2177
                Maximum length of the returned list and optionally padding length (see above).
2178
            pad_to_multiple_of (:obj:`int`, `optional`):
2179
                If set will pad the sequence to a multiple of the provided value.
2180

2181
                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
2182
                >= 7.5 (Volta).
2183
            return_attention_mask (:obj:`bool`, `optional`):
2184
                Whether to return the attention mask. If left to the default, will return the attention mask according
2185
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
2186

2187
                `What are attention masks? <../glossary.html#attention-mask>`__
2188
            return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
2189
                If set, will return tensors instead of list of python integers. Acceptable values are:
2190

2191
                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
2192
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
2193
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
2194
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
2195
                Whether or not to print informations and warnings.
2196
        """
2197
        # If we have a list of dicts, let's convert it in a dict of lists
2198
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
2199
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
2200

2201
        assert "input_ids" in encoded_inputs, (
2202
            "You should supply an encoding or a list of encodings to this method. "
2203
            "An encoding is the output of one the encoding methods of the tokenizer, i.e. "
2204
            "__call__/encode_plus/batch_encode_plus. "
2205
        )
2206

2207
        if not encoded_inputs["input_ids"]:
2208
            if return_attention_mask:
2209
                encoded_inputs["attention_mask"] = []
2210
            return encoded_inputs
2211

2212
        # Convert padding_strategy in PaddingStrategy
2213
        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
2214
            padding=padding, max_length=max_length, verbose=verbose
2215
        )
2216

2217
        if encoded_inputs["input_ids"] and not isinstance(encoded_inputs["input_ids"][0], (list, tuple)):
2218
            encoded_inputs = self._pad(
2219
                encoded_inputs,
2220
                max_length=max_length,
2221
                padding_strategy=padding_strategy,
2222
                pad_to_multiple_of=pad_to_multiple_of,
2223
                return_attention_mask=return_attention_mask,
2224
            )
2225
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
2226

2227
        batch_size = len(encoded_inputs["input_ids"])
2228
        assert all(
2229
            len(v) == batch_size for v in encoded_inputs.values()
2230
        ), "Some items in the output dictionnary have a different batch size than others."
2231

2232
        if padding_strategy == PaddingStrategy.LONGEST:
2233
            max_length = max(len(inputs) for inputs in encoded_inputs["input_ids"])
2234
            padding_strategy = PaddingStrategy.MAX_LENGTH
2235

2236
        batch_outputs = {}
2237
        for i in range(batch_size):
2238
            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
2239
            outputs = self._pad(
2240
                inputs,
2241
                max_length=max_length,
2242
                padding_strategy=padding_strategy,
2243
                pad_to_multiple_of=pad_to_multiple_of,
2244
                return_attention_mask=return_attention_mask,
2245
            )
2246

2247
            for key, value in outputs.items():
2248
                if key not in batch_outputs:
2249
                    batch_outputs[key] = []
2250
                batch_outputs[key].append(value)
2251

2252
        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
2253

2254
    def create_token_type_ids_from_sequences(
2255
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
2256
    ) -> List[int]:
2257
        """
2258
        Create the token type IDs corresponding to the sequences passed.
2259
        `What are token type IDs? <../glossary.html#token-type-ids>`__
2260

2261
        Should be overriden in a subclass if the model has a special way of building those.
2262

2263
        Args:
2264
            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
2265
            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
2266

2267
        Returns:
2268
            :obj:`List[int]`: The token type ids.
2269
        """
2270
        if token_ids_1 is None:
2271
            return len(token_ids_0) * [0]
2272
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
2273

2274
    def build_inputs_with_special_tokens(
2275
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
2276
    ) -> List[int]:
2277
        """
2278
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
2279
        by concatenating and adding special tokens.
2280

2281
        This implementation does not add special tokens and this method should be overriden in a subclass.
2282

2283
        Args:
2284
            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
2285
            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
2286

2287
        Returns:
2288
            :obj:`List[int]`: The model input with special tokens.
2289
        """
2290
        if token_ids_1 is None:
2291
            return token_ids_0
2292
        return token_ids_0 + token_ids_1
2293

2294
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
2295
    def prepare_for_model(
2296
        self,
2297
        ids: List[int],
2298
        pair_ids: Optional[List[int]] = None,
2299
        add_special_tokens: bool = True,
2300
        padding: Union[bool, str, PaddingStrategy] = False,
2301
        truncation: Union[bool, str, TruncationStrategy] = False,
2302
        max_length: Optional[int] = None,
2303
        stride: int = 0,
2304
        pad_to_multiple_of: Optional[int] = None,
2305
        return_tensors: Optional[Union[str, TensorType]] = None,
2306
        return_token_type_ids: Optional[bool] = None,
2307
        return_attention_mask: Optional[bool] = None,
2308
        return_overflowing_tokens: bool = False,
2309
        return_special_tokens_mask: bool = False,
2310
        return_offsets_mapping: bool = False,
2311
        return_length: bool = False,
2312
        verbose: bool = True,
2313
        prepend_batch_axis: bool = False,
2314
        **kwargs
2315
    ) -> BatchEncoding:
2316
        """
2317
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
2318
        It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
2319
        manages a moving window (with user defined stride) for overflowing tokens
2320

2321
        Args:
2322
            ids (:obj:`List[int]`):
2323
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
2324
                ``tokenize`` and ``convert_tokens_to_ids`` methods.
2325
            pair_ids (:obj:`List[int]`, `optional`):
2326
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
2327
                ``tokenize`` and ``convert_tokens_to_ids`` methods.
2328
        """
2329

2330
        if "return_lengths" in kwargs:
2331
            if verbose:
2332
                warnings.warn(
2333
                    "The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. "
2334
                    "Please use `return_length` instead.",
2335
                    FutureWarning,
2336
                )
2337
            return_length = kwargs["return_lengths"]
2338

2339
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2340
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
2341
            padding=padding,
2342
            truncation=truncation,
2343
            max_length=max_length,
2344
            pad_to_multiple_of=pad_to_multiple_of,
2345
            verbose=verbose,
2346
            **kwargs,
2347
        )
2348

2349
        pair = bool(pair_ids is not None)
2350
        len_ids = len(ids)
2351
        len_pair_ids = len(pair_ids) if pair else 0
2352

2353
        # Load from model defaults
2354
        if return_token_type_ids is None:
2355
            return_token_type_ids = "token_type_ids" in self.model_input_names
2356
        if return_attention_mask is None:
2357
            return_attention_mask = "attention_mask" in self.model_input_names
2358

2359
        encoded_inputs = {}
2360

2361
        # Compute the total size of the returned encodings
2362
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
2363

2364
        # Truncation: Handle max sequence length
2365
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
2366
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
2367
                ids,
2368
                pair_ids=pair_ids,
2369
                num_tokens_to_remove=total_len - max_length,
2370
                truncation_strategy=truncation_strategy,
2371
                stride=stride,
2372
            )
2373
            if return_overflowing_tokens:
2374
                encoded_inputs["overflowing_tokens"] = overflowing_tokens
2375
                encoded_inputs["num_truncated_tokens"] = total_len - max_length
2376

2377
        # Add special tokens
2378
        if add_special_tokens:
2379
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
2380
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
2381
        else:
2382
            sequence = ids + pair_ids if pair else ids
2383
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
2384

2385
        # Build output dictionnary
2386
        encoded_inputs["input_ids"] = sequence
2387
        if return_token_type_ids:
2388
            encoded_inputs["token_type_ids"] = token_type_ids
2389
        if return_special_tokens_mask:
2390
            if add_special_tokens:
2391
                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
2392
            else:
2393
                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
2394

2395
        # Check lengths
2396
        if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
2397
            logger.warning(
2398
                "Token indices sequence length is longer than the specified maximum sequence length "
2399
                "for this model ({} > {}). Running this sequence through the model will result in "
2400
                "indexing errors".format(len(ids), self.model_max_length)
2401
            )
2402

2403
        # Padding
2404
        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
2405
            encoded_inputs = self.pad(
2406
                encoded_inputs,
2407
                max_length=max_length,
2408
                padding=padding_strategy.value,
2409
                pad_to_multiple_of=pad_to_multiple_of,
2410
                return_attention_mask=return_attention_mask,
2411
            )
2412

2413
        if return_length:
2414
            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
2415

2416
        batch_outputs = BatchEncoding(
2417
            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
2418
        )
2419

2420
        return batch_outputs
2421

2422
    def truncate_sequences(
2423
        self,
2424
        ids: List[int],
2425
        pair_ids: Optional[List[int]] = None,
2426
        num_tokens_to_remove: int = 0,
2427
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
2428
        stride: int = 0,
2429
    ) -> Tuple[List[int], List[int], List[int]]:
2430
        """
2431
        Truncates a sequence pair in-place following the strategy.
2432

2433
        Args:
2434
            ids (:obj:`List[int]`):
2435
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
2436
                ``tokenize`` and ``convert_tokens_to_ids`` methods.
2437
            pair_ids (:obj:`List[int]`, `optional`):
2438
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
2439
                ``tokenize`` and ``convert_tokens_to_ids`` methods.
2440
            num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
2441
                Number of tokens to remove using the truncation strategy.
2442
            truncation (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
2443
                The strategy to follow for truncation. Can be:
2444

2445
                * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
2446
                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
2447
                  provided. This will truncate token by token, removing a token from the longest sequence in the pair
2448
                  if a pair of sequences (or a batch of pairs) is provided.
2449
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
2450
                  the maximum acceptable input length for the model if that argument is not provided. This will only
2451
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
2452
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
2453
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
2454
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
2455
                * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
2456
                  sequence lengths greater than the model maximum admissible input size).
2457
            max_length (:obj:`int`, `optional`):
2458
                Controls the maximum length to use by one of the truncation/padding parameters.
2459

2460
                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
2461
                length is required by one of the truncation/padding parameters. If the model has no specific maximum
2462
                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
2463
            stride (:obj:`int`, `optional`, defaults to 0):
2464
                If set to a positive number, the overflowing tokens returned will contain some tokens
2465
                from the main sequence returned. The value of this argument defines the number of additional tokens.
2466

2467
        Returns:
2468
            :obj:`Tuple[List[int], List[int], List[int]]`:
2469
            The truncated ``ids``, the truncated ``pair_ids`` and the list of overflowing tokens.
2470
        """
2471
        if num_tokens_to_remove <= 0:
2472
            return ids, pair_ids, []
2473

2474
        if not isinstance(truncation_strategy, TruncationStrategy):
2475
            truncation_strategy = TruncationStrategy(truncation_strategy)
2476

2477
        overflowing_tokens = []
2478
        if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
2479
            for _ in range(num_tokens_to_remove):
2480
                if pair_ids is None or len(ids) > len(pair_ids):
2481
                    if not overflowing_tokens:
2482
                        window_len = min(len(ids), stride + 1)
2483
                    else:
2484
                        window_len = 1
2485
                    overflowing_tokens.extend(ids[-window_len:])
2486
                    ids = ids[:-1]
2487
                else:
2488
                    if not overflowing_tokens:
2489
                        window_len = min(len(pair_ids), stride + 1)
2490
                    else:
2491
                        window_len = 1
2492
                    overflowing_tokens.extend(pair_ids[-window_len:])
2493
                    pair_ids = pair_ids[:-1]
2494
        elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
2495
            if len(ids) > num_tokens_to_remove:
2496
                window_len = min(len(ids), stride + num_tokens_to_remove)
2497
                overflowing_tokens = ids[-window_len:]
2498
                ids = ids[:-num_tokens_to_remove]
2499
            else:
2500
                logger.error(
2501
                    f"We need to remove {num_tokens_to_remove} to truncate the input"
2502
                    f"but the first sequence has a length {len(ids)}. "
2503
                    f"Please select another truncation strategy than {truncation_strategy}, "
2504
                    f"for instance 'longest_first' or 'only_second'."
2505
                )
2506
        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
2507
            if len(pair_ids) > num_tokens_to_remove:
2508
                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
2509
                overflowing_tokens = pair_ids[-window_len:]
2510
                pair_ids = pair_ids[:-num_tokens_to_remove]
2511
            else:
2512
                logger.error(
2513
                    f"We need to remove {num_tokens_to_remove} to truncate the input"
2514
                    f"but the second sequence has a length {len(pair_ids)}. "
2515
                    f"Please select another truncation strategy than {truncation_strategy}, "
2516
                    f"for instance 'longest_first' or 'only_first'."
2517
                )
2518

2519
        return (ids, pair_ids, overflowing_tokens)
2520

2521
    def _pad(
2522
        self,
2523
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
2524
        max_length: Optional[int] = None,
2525
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2526
        pad_to_multiple_of: Optional[int] = None,
2527
        return_attention_mask: Optional[bool] = None,
2528
    ) -> dict:
2529
        """
2530
        Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch)
2531

2532
        Args:
2533
            encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
2534
            max_length: maximum length of the returned list and optionally padding length (see below).
2535
                Will truncate by taking into account the special tokens.
2536
            padding_strategy: PaddingStrategy to use for padding.
2537
                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
2538
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
2539
                - PaddingStrategy.DO_NOT_PAD: Do not pad
2540
                The tokenizer padding sides are defined in self.padding_side:
2541
                    - 'left': pads on the left of the sequences
2542
                    - 'right': pads on the right of the sequences
2543
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
2544
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
2545
                >= 7.5 (Volta).
2546
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
2547
        """
2548
        # Load from model defaults
2549
        if return_attention_mask is None:
2550
            return_attention_mask = "attention_mask" in self.model_input_names
2551

2552
        if padding_strategy == PaddingStrategy.LONGEST:
2553
            max_length = len(encoded_inputs["input_ids"])
2554

2555
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
2556
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
2557

2558
        needs_to_be_padded = (
2559
            padding_strategy != PaddingStrategy.DO_NOT_PAD and len(encoded_inputs["input_ids"]) != max_length
2560
        )
2561

2562
        if needs_to_be_padded:
2563
            difference = max_length - len(encoded_inputs["input_ids"])
2564
            if self.padding_side == "right":
2565
                if return_attention_mask:
2566
                    encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
2567
                if "token_type_ids" in encoded_inputs:
2568
                    encoded_inputs["token_type_ids"] = (
2569
                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
2570
                    )
2571
                if "special_tokens_mask" in encoded_inputs:
2572
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
2573
                encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
2574
            elif self.padding_side == "left":
2575
                if return_attention_mask:
2576
                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
2577
                if "token_type_ids" in encoded_inputs:
2578
                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
2579
                        "token_type_ids"
2580
                    ]
2581
                if "special_tokens_mask" in encoded_inputs:
2582
                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
2583
                encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
2584
            else:
2585
                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
2586
        else:
2587
            if return_attention_mask:
2588
                encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
2589

2590
        return encoded_inputs
2591

2592
    def batch_decode(
2593
        self, sequences: List[List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
2594
    ) -> List[str]:
2595
        """
2596
        Convert a list of lists of token ids into a list of strings by calling decode.
2597

2598
        Args:
2599
            sequences (:obj:`List[List[int]]`):
2600
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
2601
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
2602
                Whether or not to remove special tokens in the decoding.
2603
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
2604
                Whether or not to clean up the tokenization spaces.
2605

2606
        Returns:
2607
            :obj:`List[str]`: The list of decoded sentences.
2608
        """
2609
        return [
2610
            self.decode(
2611
                seq, skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=clean_up_tokenization_spaces
2612
            )
2613
            for seq in sequences
2614
        ]
2615

2616
    def decode(
2617
        self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
2618
    ) -> str:
2619
        """
2620
        Converts a sequence of ids in a string, using the tokenizer and vocabulary
2621
        with options to remove special tokens and clean up tokenization spaces.
2622

2623
        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
2624

2625
        Args:
2626
            token_ids (:obj:`List[int]`):
2627
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
2628
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
2629
                Whether or not to remove special tokens in the decoding.
2630
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
2631
                Whether or not to clean up the tokenization spaces.
2632

2633
        Returns:
2634
            :obj:`str`: The decoded sentence.
2635
        """
2636
        raise NotImplementedError
2637

2638
    def get_special_tokens_mask(
2639
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
2640
    ) -> List[int]:
2641
        """
2642
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
2643
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
2644

2645
        Args:
2646
            token_ids_0 (:obj:`List[int]`):
2647
                List of ids of the first sequence.
2648
            token_ids_1 (:obj:`List[int]`, `optional`):
2649
                List of ids of the second sequence.
2650
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
2651
                Wheter or not the token list is already formated with special tokens for the model.
2652

2653
        Returns:
2654
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
2655
        """
2656
        assert already_has_special_tokens and token_ids_1 is None, (
2657
            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
2658
            "Please use a slow (full python) tokenizer to activate this argument."
2659
            "Or set `return_special_token_mask=True` when calling the encoding method "
2660
            "to get the special tokens mask in any tokenizer. "
2661
        )
2662

2663
        all_special_ids = self.all_special_ids  # cache the property
2664

2665
        special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]
2666

2667
        return special_tokens_mask
2668

2669
    @staticmethod
2670
    def clean_up_tokenization(out_string: str) -> str:
2671
        """
2672
        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
2673

2674
        Args:
2675
            out_string (:obj:`str`): The text to clean up.
2676

2677
        Returns:
2678
            :obj:`str`: The cleaned-up string.
2679
        """
2680
        out_string = (
2681
            out_string.replace(" .", ".")
2682
            .replace(" ?", "?")
2683
            .replace(" !", "!")
2684
            .replace(" ,", ",")
2685
            .replace(" ' ", "'")
2686
            .replace(" n't", "n't")
2687
            .replace(" 'm", "'m")
2688
            .replace(" 's", "'s")
2689
            .replace(" 've", "'ve")
2690
            .replace(" 're", "'re")
2691
        )
2692
        return out_string
2693
CSS-LM

Использование cookies