# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Base classes common to both the slow and the fast tokenization classes:
    PreTrainedTokenizerBase (hosts all the user-facing encoding methods),
    SpecialTokensMixin (hosts the special tokens logic) and
    BatchEncoding (wraps the dictionary of outputs with special methods for the fast tokenizers)
"""

import copy
import json
import logging
import os
import warnings
from collections import UserDict
from enum import Enum
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

import numpy as np
from tokenizers import AddedToken
from tokenizers import Encoding as EncodingFast

from .file_utils import (
    add_end_docstrings,
    cached_path,
    hf_bucket_url,
    is_remote_url,
    is_tf_available,
    is_torch_available,
    torch_required,
)


if is_tf_available():
    import tensorflow as tf
if is_torch_available():
    import torch


logger = logging.getLogger(__name__)

VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# Define type aliases and NamedTuples
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
TextInputPair = Tuple[str, str]
PreTokenizedInputPair = Tuple[List[str], List[str]]
EncodedInputPair = Tuple[List[int], List[int]]


# Slow tokenizers used to be saved in three separate files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Fast tokenizers (provided by the HuggingFace tokenizers library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json"

class ExplicitEnum(Enum):
    """
    Enum with more explicit error message for missing values.
    """

    @classmethod
    def _missing_(cls, value):
        raise ValueError(
            "%r is not a valid %s, please select one of %s"
            % (value, cls.__name__, str(list(cls._value2member_map_.keys())))
        )


class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
    Useful for tab-completion in an IDE.
    """

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"


class PaddingStrategy(ExplicitEnum):
    """
    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
    Useful for tab-completion in an IDE.
    """

    LONGEST = "longest"
    MAX_LENGTH = "max_length"
    DO_NOT_PAD = "do_not_pad"


class TensorType(ExplicitEnum):
    """
    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
    Useful for tab-completion in an IDE.
    """

    PYTORCH = "pt"
    TENSORFLOW = "tf"
    NUMPY = "np"

class CharSpan(NamedTuple):
    """
    Character span in the original string.

    Args:
        start (:obj:`int`): Index of the first character in the original string.
        end (:obj:`int`): Index of the character following the last character in the original string.
    """

    start: int
    end: int


class TokenSpan(NamedTuple):
    """
    Token span in an encoded string (list of tokens).

    Args:
        start (:obj:`int`): Index of the first token in the span.
        end (:obj:`int`): Index of the token following the last token in the span.
    """

    start: int
    end: int

class BatchEncoding(UserDict):
    """
    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`
    and :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
    attention_masks, etc).

    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
    utility methods to map from word/character space to token space.

    Args:
        data (:obj:`dict`):
            Dictionary of lists/arrays/tensors returned by the encode/batch_encode methods ('input_ids',
            'attention_mask', etc.).
        encoding (:obj:`tokenizers.Encoding` or :obj:`Sequence[tokenizers.Encoding]`, `optional`):
            If the tokenizer is a fast tokenizer which outputs additional information like the mapping from
            word/character space to token space, the :obj:`tokenizers.Encoding` instance or list of instances (for
            batches) holds this information.
        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
            You can give a tensor_type here to convert the lists of integers to PyTorch/TensorFlow/Numpy tensors at
            initialization.
        prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
    """

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
    ):
        super().__init__(data)

        if isinstance(encoding, EncodingFast):
            encoding = [encoding]

        self._encodings = encoding

        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

    @property
    def is_fast(self) -> bool:
        """
        :obj:`bool`: Indicate whether this :class:`~transformers.BatchEncoding` was generated from the result of a
        :class:`~transformers.PreTrainedTokenizerFast` or not.
        """
        return self._encodings is not None

    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """
        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids',
        'attention_mask', etc.).

        If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
        """
        if isinstance(item, str):
            return self.data[item]
        elif self._encodings is not None:
            return self._encodings[item]
        else:
            raise KeyError(
                "Indexing with integers (to access backend Encoding for a given batch index) "
                "is not available when using Python based tokenizers"
            )

    def __getattr__(self, item: str):
        try:
            return self.data[item]
        except KeyError:
            raise AttributeError

    def __getstate__(self):
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        if "data" in state:
            self.data = state["data"]

        if "encodings" in state:
            self._encodings = state["encodings"]

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()

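    # Illustrative usage sketch (not executed): a BatchEncoding behaves like a dict for string
    # keys and, for fast tokenizers, exposes the backend `tokenizers.Encoding` via integer
    # indexing. The `tokenizer` instance below is an assumed example, not part of this module.
    #
    #     encoded = tokenizer(["Hello world"])
    #     input_ids = encoded["input_ids"]      # dict-style access to a model input
    #     first_encoding = encoded[0]           # backend Encoding (fast tokenizers only)
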
    # After this point:
    # Extended properties and methods only available for fast (Rust-based) tokenizers
    # provided by the HuggingFace tokenizers library.

    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        :obj:`Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process.
        Returns :obj:`None` if the input was tokenized through a Python (i.e., not a fast) tokenizer.
        """
        return self._encodings

    def tokens(self, batch_index: int = 0) -> List[str]:
        """
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
        to integer indices) at a given batch index (only works for the output of a fast tokenizer).

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[str]`: The list of tokens at that index.
        """
        if not self._encodings:
            raise ValueError("tokens() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].tokens

    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added
            by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their
            corresponding word (several tokens will be mapped to the same word index if they are parts of that
            word).
        """
        if not self._encodings:
            raise ValueError("words() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].words

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding to (i.e. comprising) an encoded token
        in a sequence of the batch.

        Can be called as:

        - ``self.token_to_word(token_index)`` if batch size is 1
        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e., words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index
                of the token in the sequence.

        Returns:
            :obj:`int`: Index of the word in the input sequence.
        """

        if not self._encodings:
            raise ValueError("token_to_word() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_word(token_index)

    def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan:
        """
        Get the encoded token span corresponding to a word in the sequence of the batch.

        Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:

        - **start** -- Index of the first token.
        - **end** -- Index of the token following the last token.

        Can be called as:

        - ``self.word_to_tokens(word_index)`` if batch size is 1
        - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater than or equal to 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the word in the sequence.
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_word_index`, this can be the index
                of the word in the sequence.

        Returns:
            :class:`~transformers.tokenization_utils_base.TokenSpan`
                Span of tokens in the encoded sequence.
        """

        if not self._encodings:
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if word_index < 0:
            word_index = self._seq_len + word_index
        return TokenSpan(*(self._encodings[batch_index].word_to_tokens(word_index)))

    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a :class:`~transformers.tokenization_utils_base.CharSpan` with:

        - **start** -- Index of the first character in the original string associated to the token.
        - **end** -- Index of the character following the last character in the original string associated to the
          token.

        Can be called as:

        - ``self.token_to_chars(token_index)`` if batch size is 1
        - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater than or equal to 1

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index
                of the token in the sequence.

        Returns:
            :class:`~transformers.tokenization_utils_base.CharSpan`:
                Span of characters in the original string.
        """

        if not self._encodings:
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))

    def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
        """
        Get the index of the token in the encoded output comprising a character
        in the original string for a sequence of the batch.

        Can be called as:

        - ``self.char_to_token(char_index)`` if batch size is 1
        - ``self.char_to_token(batch_index, char_index)`` if batch size is greater than or equal to 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_char_index`, this can be the index
                of the character in the original string.

        Returns:
            :obj:`int`: Index of the token.
        """

        if not self._encodings:
            raise ValueError("char_to_token() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_token(char_index)

    def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span in the original string corresponding to a given word in a sequence
        of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string
        - end: index of the character following the last character in the original string

        Can be called as:

        - ``self.word_to_chars(word_index)`` if batch size is 1
        - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater than or equal to 1

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the word in the sequence.
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_word_index`, this can be the index
                of the word in the sequence.

        Returns:
            :obj:`CharSpan` or :obj:`List[CharSpan]`:
                Span(s) of the associated character or characters in the string.
                CharSpan are NamedTuple with:

                - start: index of the first character associated to the word in the original string
                - end: index of the character following the last character associated to the word in the original
                  string
        """

        if not self._encodings:
            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index)))

    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of
        a sequence of the batch.

        Can be called as:

        - ``self.char_to_word(char_index)`` if batch size is 1
        - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_char_index`, this can be the index
                of the character in the original string.

        Returns:
            :obj:`int` or :obj:`List[int]`:
                Index or indices of the associated word(s) in the original string.
        """

        if not self._encodings:
            raise ValueError("char_to_word() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_word(char_index)

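    # Illustrative sketch (not executed) of the word/character/token mapping helpers above,
    # assuming `tokenizer` is a fast tokenizer and `encoded = tokenizer("hello world")`:
    #
    #     encoded.token_to_word(1)      # index of the word comprising token 1
    #     encoded.word_to_tokens(0)     # TokenSpan(start, end) covering word 0
    #     encoded.token_to_chars(1)     # CharSpan(start, end) in the original string
    #     encoded.char_to_token(2)      # index of the token covering character 2
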
    def convert_to_tensors(
        self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
    ):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
                The type of tensors to use. If :obj:`str`, should be one of the values of the enum
                :class:`~transformers.tokenization_utils_base.TensorType`. If :obj:`None`, no modification is done.
            prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to add the batch dimension during the conversion.
        """
        if tensor_type is None:
            return self

        # Convert to TensorType
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Get a function reference for the correct framework
        if tensor_type == TensorType.TENSORFLOW and is_tf_available():
            as_tensor = tf.constant
        elif tensor_type == TensorType.PYTORCH and is_torch_available():
            as_tensor = torch.tensor
        elif tensor_type == TensorType.NUMPY:
            as_tensor = np.asarray
        else:
            raise ImportError(
                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
                    tensor_type
                )
            )

        # Do the tensor conversion in batch
        for key, value in self.items():
            try:
                if prepend_batch_axis:
                    value = [value]

                tensor = as_tensor(value)

                # Normalize to 2D (squeeze a leading batch axis or add one)
                if tensor.ndim > 2:
                    tensor = tensor.squeeze(0)
                elif tensor.ndim < 2:
                    tensor = tensor[None, :]

                self[key] = tensor
            except:  # noqa E722
                if key == "overflowing_tokens":
                    raise ValueError(
                        "Unable to create tensor returning overflowing tokens of different lengths. "
                        "Please see if a fast version of this tokenizer is available to have this feature available."
                    )
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding "
                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
                )

        return self

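    # Illustrative sketch (not executed): convert_to_tensors is normally driven by the
    # `return_tensors` argument of the encoding methods; the tokenizer call and sentences below
    # are assumed examples, with "pt"/"tf"/"np" mapping to the TensorType values.
    #
    #     batch = tokenizer(["a sentence", "another one"], padding=True, return_tensors="pt")
    #     batch["input_ids"].shape  # torch.Size([2, padded_seq_len])
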
    @torch_required
    def to(self, device: str) -> "BatchEncoding":
        """
        Send all values to device by calling :obj:`v.to(device)` (PyTorch only).

        Args:
            device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.

        Returns:
            :class:`~transformers.BatchEncoding`:
                The same instance of :class:`~transformers.BatchEncoding` after modification.
        """
        self.data = {k: v.to(device) for k, v in self.data.items()}
        return self


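# Illustrative sketch (not executed): with PyTorch tensors, a BatchEncoding can be moved to a
# device in one call; `tokenizer` and "cuda:0" below are assumed examples.
#
#     batch = tokenizer("hello", return_tensors="pt").to("cuda:0")

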
# class AddedToken(UserString):
#     """ AddedToken represents a token to be added to a Tokenizer
#
#         An AddedToken can have special options defining the way it should behave.
#
#         Args:
#             content: str:
#                 The content of the token
#
#             single_word: bool
#                 Whether this token should only match against single word. If True,
#                 this token will never match inside of a word.
#
#             lstrip: bool
#                 Whether this token should strip all potential whitespaces on the left side.
#                 If True, this token will greedily match any whitespace on the left and then strip
#                 them out.
#
#             rstrip: bool
#                 Whether this token should strip all potential whitespaces on the right side.
#                 If True, this token will greedily match any whitespace on the right and then strip
#                 them out.
#     """
#
#     def __init__(
#         self, data: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
#     ):
#         super().__init__(data)
#
#         self._single_word = single_word
#         self._lstrip = lstrip
#         self._rstrip = rstrip
#
#     def lower(self):
#         return AddedToken(self.data.lower(), self._single_word, self._lstrip, self._rstrip)


class SpecialTokensMixin:
    """
    A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
    to handle specific behaviors related to special tokens. In particular, this class holds the attributes which can
    be used to directly access these special tokens in a model-independent manner and allows to set and update the
    special tokens.

    Args:
        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the beginning of a sentence.
        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the end of a sentence.
        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing an out-of-vocabulary token.
        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token separating two different sentences in the same input (used by BERT for instance).
        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored
            by attention mechanisms or loss computation.
        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the class of the input (used by BERT for instance).
        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing a masked token (used by masked-language modeling pretraining objectives,
            like BERT).
        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A tuple or a list of additional special tokens.
    """

    SPECIAL_TOKENS_ATTRIBUTES = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]

    def __init__(self, verbose=True, **kwargs):
        self._bos_token = None
        self._eos_token = None
        self._unk_token = None
        self._sep_token = None
        self._pad_token = None
        self._cls_token = None
        self._mask_token = None
        self._pad_token_type_id = 0
        self._additional_special_tokens = []
        self.verbose = verbose

        # We directly set the hidden value to allow initialization with special tokens
        # which are not yet in the vocabulary. Necessary for serialization/de-serialization
        # TODO clean this up at some point (probably by switching to fast tokenizers)
        for key, value in kwargs.items():
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                if key == "additional_special_tokens":
                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                    setattr(self, key, value)
                elif isinstance(value, (str, AddedToken)):
                    setattr(self, key, value)
                else:
                    raise TypeError(
                        "special token {} has to be either str or AddedToken but got: {}".format(key, type(value))
                    )

    def sanitize_special_tokens(self) -> int:
        """
        Make sure that all the special tokens attributes of the tokenizer (:obj:`tokenizer.mask_token`,
        :obj:`tokenizer.cls_token`, etc.) are in the vocabulary.

        Add the missing ones to the vocabulary if needed.

        Return:
            :obj:`int`: The number of tokens added in the vocabulary during the operation.
        """
        return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
        """
        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
        current vocabulary).

        Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways:

        - Special tokens are carefully handled by the tokenizer (they are never split).
        - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`.
          This makes it easy to develop model-agnostic training and fine-tuning scripts.

        When possible, special tokens are already registered for provided pretrained models (for instance
        :class:`~transformers.BertTokenizer` :obj:`cls_token` is already registered to be :obj:`'[CLS]'` and XLM's
        one is also registered to be :obj:`'</s>'`).

        Args:
            special_tokens_dict (dictionary `str` to `str` or :obj:`tokenizers.AddedToken`):
                Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``,
                ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
                ``additional_special_tokens``].

                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                assigns the index of the ``unk_token`` to them).

        Returns:
            :obj:`int`: Number of tokens added to the vocabulary.

        Examples::

            # Let's see how to add a new classification token to GPT-2
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            model = GPT2Model.from_pretrained('gpt2')

            special_tokens_dict = {'cls_token': '<CLS>'}

            num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))

            assert tokenizer.cls_token == '<CLS>'
        """
        if not special_tokens_dict:
            return 0

        added_tokens = 0
        for key, value in special_tokens_dict.items():
            assert key in self.SPECIAL_TOKENS_ATTRIBUTES

            if self.verbose:
                logger.info("Assigning %s to the %s key of the tokenizer", value, key)
            setattr(self, key, value)

            if key == "additional_special_tokens":
                assert isinstance(value, (list, tuple)) and all(
                    isinstance(t, (str, AddedToken)) for t in value
                ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
                added_tokens += self.add_tokens(value, special_tokens=True)
            else:
                assert isinstance(
                    value, (str, AddedToken)
                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
                added_tokens += self.add_tokens([value], special_tokens=True)

        return added_tokens

    def add_tokens(
        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
    ) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added
        to it with indices starting from the length of the current vocabulary.

        Args:
            new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`):
                Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a
                string token to let you personalize its behavior: whether this token should only match against a
                single word, whether this token should strip all potential whitespaces on the left side, whether
                this token should strip all potential whitespaces on the right side, etc.
            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Can be used to specify if the token is a special token. This mostly changes the normalization
                behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance).

                See details for :obj:`tokenizers.AddedToken` in the HuggingFace tokenizers library.

        Returns:
            :obj:`int`: Number of tokens added to the vocabulary.

        Examples::

            # Let's see how to increase the vocabulary of Bert model and tokenizer
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')

            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))
        """
        if not new_tokens:
            return 0

        if not isinstance(new_tokens, (list, tuple)):
            new_tokens = [new_tokens]

        return self._add_tokens(new_tokens, special_tokens=special_tokens)

    @property
    def bos_token(self) -> str:
        """
        :obj:`str`: Beginning of sentence token. Log an error if used while not having been set.
        """
        if self._bos_token is None and self.verbose:
            logger.error("Using bos_token, but it is not set yet.")
            return None
        return str(self._bos_token)

    @property
    def eos_token(self) -> str:
        """
        :obj:`str`: End of sentence token. Log an error if used while not having been set.
        """
        if self._eos_token is None and self.verbose:
            logger.error("Using eos_token, but it is not set yet.")
            return None
        return str(self._eos_token)

    @property
    def unk_token(self) -> str:
        """
        :obj:`str`: Unknown token. Log an error if used while not having been set.
        """
        if self._unk_token is None and self.verbose:
            logger.error("Using unk_token, but it is not set yet.")
            return None
        return str(self._unk_token)

    @property
    def sep_token(self) -> str:
        """
        :obj:`str`: Separation token, to separate context and query in an input sequence.
        Log an error if used while not having been set.
        """
        if self._sep_token is None and self.verbose:
            logger.error("Using sep_token, but it is not set yet.")
            return None
        return str(self._sep_token)

    @property
    def pad_token(self) -> str:
        """
        :obj:`str`: Padding token. Log an error if used while not having been set.
        """
        if self._pad_token is None and self.verbose:
            logger.error("Using pad_token, but it is not set yet.")
            return None
        return str(self._pad_token)

    @property
    def cls_token(self) -> str:
        """
        :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along
        the full depth of the model. Log an error if used while not having been set.
        """
        if self._cls_token is None and self.verbose:
            logger.error("Using cls_token, but it is not set yet.")
            return None
        return str(self._cls_token)

    @property
    def mask_token(self) -> str:
        """
        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used
        while not having been set.
        """
        if self._mask_token is None and self.verbose:
            logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @property
    def additional_special_tokens(self) -> List[str]:
        """
        :obj:`List[str]`: All the additional special tokens you may want to use. Log an error if used while not
        having been set.
        """
        if self._additional_special_tokens is None and self.verbose:
            logger.error("Using additional_special_tokens, but it is not set yet.")
            return None
        return [str(tok) for tok in self._additional_special_tokens]

    @bos_token.setter
    def bos_token(self, value):
        self._bos_token = value

    @eos_token.setter
    def eos_token(self, value):
        self._eos_token = value

    @unk_token.setter
    def unk_token(self, value):
        self._unk_token = value

    @sep_token.setter
    def sep_token(self, value):
        self._sep_token = value

    @pad_token.setter
    def pad_token(self, value):
        self._pad_token = value

    @cls_token.setter
    def cls_token(self, value):
        self._cls_token = value

    @mask_token.setter
    def mask_token(self, value):
        self._mask_token = value

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        self._additional_special_tokens = value

    @property
    def bos_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns :obj:`None` if the
        token has not been set.
        """
        if self._bos_token is None:
            return None
        return self.convert_tokens_to_ids(self.bos_token)

    @property
    def eos_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the end of sentence token in the vocabulary. Returns :obj:`None` if the token
        has not been set.
        """
        if self._eos_token is None:
            return None
        return self.convert_tokens_to_ids(self.eos_token)

    @property
    def unk_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the unknown token in the vocabulary. Returns :obj:`None` if the token has not
        been set.
        """
        if self._unk_token is None:
            return None
        return self.convert_tokens_to_ids(self.unk_token)

    @property
    def sep_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an
        input sequence. Returns :obj:`None` if the token has not been set.
        """
        if self._sep_token is None:
            return None
        return self.convert_tokens_to_ids(self.sep_token)

    @property
    def pad_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not
        been set.
        """
        if self._pad_token is None:
            return None
        return self.convert_tokens_to_ids(self.pad_token)

    @property
    def pad_token_type_id(self) -> int:
        """
        :obj:`int`: Id of the padding token type in the vocabulary.
        """
        return self._pad_token_type_id

    @property
    def cls_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input
        sequence leveraging self-attention along the full depth of the model.

        Returns :obj:`None` if the token has not been set.
        """
        if self._cls_token is None:
            return None
        return self.convert_tokens_to_ids(self.cls_token)

    @property
    def mask_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with
        masked-language modeling. Returns :obj:`None` if the token has not been set.
        """
        if self._mask_token is None:
            return None
        return self.convert_tokens_to_ids(self.mask_token)

    @property
    def additional_special_tokens_ids(self) -> List[int]:
        """
        :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary.
        Log an error if used while not having been set.
        """
        return self.convert_tokens_to_ids(self.additional_special_tokens)

    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
        :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes
        (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).

        Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = str(attr_value)
        return set_attr

    @property
    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
        """
        :obj:`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary
        mapping special token class attributes (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values
        (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).

        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more
        finely how special tokens are tokenized.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr

    @property
    def all_special_tokens(self) -> List[str]:
        """
        :obj:`List[str]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.

        Convert tokens of :obj:`tokenizers.AddedToken` type to string.
        """
        all_toks = [str(s) for s in self.all_special_tokens_extended]
        return all_toks

    @property
    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
        """
        :obj:`List[Union[str, tokenizers.AddedToken]]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`,
        etc.) mapped to class attributes.

        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more
        finely how special tokens are tokenized.
        """
        all_toks = []
        set_attr = self.special_tokens_map_extended
        for attr_value in set_attr.values():
            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
        all_toks = list(set(all_toks))
        return all_toks

    @property
    def all_special_ids(self) -> List[int]:
        """
        :obj:`List[int]`: List the ids of the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class
        attributes.
        """
        all_toks = self.all_special_tokens
        all_ids = self.convert_tokens_to_ids(all_toks)
        return all_ids


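# Illustrative sketch (not executed): the SpecialTokensMixin attributes give uniform,
# model-independent access to special tokens; `tokenizer` below is an assumed example instance.
#
#     tokenizer.cls_token, tokenizer.cls_token_id   # e.g. '[CLS]' and its vocabulary id
#     tokenizer.special_tokens_map                  # {'cls_token': '[CLS]', 'sep_token': '[SEP]', ...}
#     tokenizer.all_special_ids                     # ids of every registered special token

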
ENCODE_KWARGS_DOCSTRING = r"""
    add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
        Whether or not to encode the sequences with the special tokens relative to their model.
    padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
        Activates and controls padding. Accepts the following values:

        * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
          single sequence is provided).
        * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
          maximum acceptable input length for the model if that argument is not provided.
        * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
          different lengths).
    truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
        Activates and controls truncation. Accepts the following values:

        * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
          :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
          provided. This will truncate token by token, removing a token from the longest sequence in the pair
          if a pair of sequences (or a batch of pairs) is provided.
        * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
          the maximum acceptable input length for the model if that argument is not provided. This will only
          truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
        * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
          to the maximum acceptable input length for the model if that argument is not provided. This will only
          truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
        * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
          sequence lengths greater than the model maximum admissible input size).
    max_length (:obj:`int`, `optional`):
        Controls the maximum length to use by one of the truncation/padding parameters.

        If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
        length is required by one of the truncation/padding parameters. If the model has no specific maximum
        input length (like XLNet) truncation/padding to a maximum length will be deactivated.
    stride (:obj:`int`, `optional`, defaults to 0):
        If set to a number along with :obj:`max_length`, the overflowing tokens returned when
        :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
        returned to provide some overlap between truncated and overflowing sequences. The value of this
        argument defines the number of overlapping tokens.
    is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
        Whether or not the input is already tokenized.
    pad_to_multiple_of (:obj:`int`, `optional`):
        If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
        the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
    return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
        If set, will return tensors instead of lists of python integers. Acceptable values are:

        * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
        * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
        * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
"""

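# Illustrative sketch (not executed) of the padding/truncation arguments documented above; the
# tokenizer instance and sentences are assumed examples.
#
#     batch = tokenizer(
#         ["a short sentence", "a somewhat longer second sentence"],
#         padding="max_length", truncation=True, max_length=16, return_tensors="np",
#     )
#     # batch["input_ids"] is a (2, 16) array, padded/truncated to max_length.
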
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
    return_token_type_ids (:obj:`bool`, `optional`):
        Whether to return token type IDs. If left to the default, will return the token type IDs according
        to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

        `What are token type IDs? <../glossary.html#token-type-ids>`__
    return_attention_mask (:obj:`bool`, `optional`):
        Whether to return the attention mask. If left to the default, will return the attention mask according
        to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

        `What are attention masks? <../glossary.html#attention-mask>`__
    return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
        Whether or not to return overflowing token sequences.
    return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
        Whether or not to return special tokens mask information.
    return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
        Whether or not to return :obj:`(char_start, char_end)` for each token.

        This is only available on fast tokenizers inheriting from
        :class:`~transformers.PreTrainedTokenizerFast`; if using a Python tokenizer, this method will raise
        :obj:`NotImplementedError`.
    return_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
        Whether or not to return the lengths of the encoded inputs.
    verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
        Whether or not to print information and warnings.
    **kwargs: passed to the :obj:`self.tokenize()` method

    Return:
        :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:

        - **input_ids** -- List of token ids to be fed to a model.

          `What are input IDs? <../glossary.html#input-ids>`__
        - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
          or if `"token_type_ids"` is in :obj:`self.model_input_names`).

          `What are token type IDs? <../glossary.html#token-type-ids>`__
        - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
          :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).

          `What are attention masks? <../glossary.html#attention-mask>`__
        - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
          :obj:`return_overflowing_tokens=True`).
        - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
          :obj:`return_overflowing_tokens=True`).
        - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
          regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
        - **length** -- The length of the inputs (when :obj:`return_length=True`)
"""

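# Illustrative sketch (not executed) of the return fields documented above; the tokenizer and
# text are assumed examples.
#
#     out = tokenizer("hello world", return_special_tokens_mask=True, return_offsets_mapping=True)
#     out["special_tokens_mask"]   # marks which positions are added special tokens
#     out["offset_mapping"]        # (char_start, char_end) per token, fast tokenizers only
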
INIT_TOKENIZER_DOCSTRING = r"""
    Class attributes (overridden by derived classes)
        - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name
          of each vocabulary file required by the model, and as associated values, the filename for saving the
          associated file (string).
        - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
          high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the
          low-level being the :obj:`short-cut-names` of the pretrained models with, as associated values, the
          :obj:`url` to the associated pretrained vocabulary file.
        - **max_model_input_sizes** (:obj:`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the
          :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the
          sequence inputs of this model, or :obj:`None` if the model has no maximum input size.
        - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific
          arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading
          the tokenizer with the
          :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` method.
        - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
        - **padding_side** (:obj:`str`) -- The default value for the side on which the model should have padding
          applied. Should be :obj:`'right'` or :obj:`'left'`.

    Args:
        model_max_length (:obj:`int`, `optional`):
            The maximum length (in number of tokens) for the inputs to the transformer model.
            When the tokenizer is loaded with
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this will be set
            to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is
            provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
        padding_side (:obj:`str`, `optional`):
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
        model_input_names (:obj:`List[string]`, `optional`):
            The list of inputs accepted by the forward pass of the model (like :obj:`"token_type_ids"` or
            :obj:`"attention_mask"`). Default value is picked from the class attribute of the same name.
        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the beginning of a sentence. Will be associated to ``self.bos_token`` and
            ``self.bos_token_id``.
        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the end of a sentence. Will be associated to ``self.eos_token`` and
            ``self.eos_token_id``.
        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing an out-of-vocabulary token. Will be associated to ``self.unk_token`` and
            ``self.unk_token_id``.
        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token separating two different sentences in the same input (used by BERT for instance). Will
            be associated to ``self.sep_token`` and ``self.sep_token_id``.
        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored
            by attention mechanisms or loss computation. Will be associated to ``self.pad_token`` and
            ``self.pad_token_id``.
        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the class of the input (used by BERT for instance). Will be associated to
            ``self.cls_token`` and ``self.cls_token_id``.
        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing a masked token (used by masked-language modeling pretraining objectives,
            like BERT). Will be associated to ``self.mask_token`` and ``self.mask_token_id``.
        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
            tokenization process. Will be associated to ``self.additional_special_tokens`` and
            ``self.additional_special_tokens_ids``.
"""


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerBase(SpecialTokensMixin):
    """
    Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`.

    Handles shared (mostly boilerplate) methods for those two classes.
    """

    vocab_files_names: Dict[str, str] = {}
    pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
    pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
    max_model_input_sizes: Dict[str, Optional[int]] = {}
    model_input_names: List[str] = ["token_type_ids", "attention_mask"]
    padding_side: str = "right"

    def __init__(self, **kwargs):
        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
        self.init_inputs = ()
        self.init_kwargs = kwargs

        # For backward compatibility we fall back to setting model_max_length from max_len if provided
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER

        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
        self.padding_side = kwargs.pop("padding_side", self.padding_side)
        assert self.padding_side in [
            "right",
            "left",
        ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

        super().__init__(**kwargs)

    @property
    def max_len(self) -> int:
        """
        :obj:`int`: **Deprecated** Kept here for backward compatibility. Now renamed to :obj:`model_max_length` to
        avoid ambiguity.
        """
        warnings.warn(
            "The `max_len` attribute has been deprecated and will be removed in a future version, use `model_max_length` instead.",
            FutureWarning,
        )
        return self.model_max_length

    @property
    def max_len_single_sentence(self) -> int:
        """
        :obj:`int`: The maximum length of a sentence that can be fed to the model.
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=False)

    @property
    def max_len_sentences_pair(self) -> int:
        """
        :obj:`int`: The maximum combined length of a pair of sentences that can be fed to the model.
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=True)

    @max_len_single_sentence.setter
    def max_len_single_sentence(self, value) -> int:
        # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
            logger.warning(
                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
            )
        else:
            raise ValueError(
                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
            )

    @max_len_sentences_pair.setter
    def max_len_sentences_pair(self, value) -> int:
        # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
            logger.warning(
                "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
            )
        else:
            raise ValueError(
                "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
            )

1335@classmethod
1336def from_pretrained(cls, *inputs, **kwargs):
1337r"""
1338Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
1339a predefined tokenizer.
1340
1341Args:
1342pretrained_model_name_or_path (:obj:`str`):
1343Can be either:
1344
1345- A string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.,
1346``bert-base-uncased``.
1347- A string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.,
1348``dbmdz/bert-base-german-cased``.
1349- A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
1350using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`
1351method, e.g., ``./my_model_directory/``.
1352- (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
1353file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
1354``./my_model_directory/vocab.txt``.
1355cache_dir (:obj:`str`, `optional`):
1356Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
1357standard cache should not be used.
1358force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
1359Whether or not to force the (re-)download the vocabulary files and override the cached versions if they
1360exist.
1361resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
1362Whether or not to delete incompletely received files. Attempt to resume the download if such a file
1363exists.
1364proxies (:obj:`Dict[str, str], `optional`):
1365A dictionary of proxy servers to use by protocol or endpoint, e.g.,
1366:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
1367request.
1368inputs (additional positional arguments, `optional`):
1369Will be passed along to the Tokenizer ``__init__`` method.
1370kwargs (additional keyword arguments, `optional`):
1371Will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like
1372``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
1373``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__`` for more details.
1374
1375Examples::
1376
1377# We can't directly instantiate the base class `PreTrainedTokenizerBase`, so the examples below use a derived class: BertTokenizer
1378# Download vocabulary from S3 and cache.
1379tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1380
1381# Download vocabulary from S3 (user-uploaded) and cache.
1382tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
1383
1384# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
1385tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
1386
1387# If the tokenizer uses a single vocabulary file, you can point directly to this file
1388tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
1389
1390# You can link tokens to special vocabulary when instantiating
1391tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
1392# You should make sure '<unk>' is in the vocabulary when doing that.
1393# Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
1394assert tokenizer.unk_token == '<unk>'
1395
1396"""
1397return cls._from_pretrained(*inputs, **kwargs)
1398
1399@classmethod
1400def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
1401cache_dir = kwargs.pop("cache_dir", None)
1402force_download = kwargs.pop("force_download", False)
1403resume_download = kwargs.pop("resume_download", False)
1404proxies = kwargs.pop("proxies", None)
1405local_files_only = kwargs.pop("local_files_only", False)
1406
1407s3_models = list(cls.max_model_input_sizes.keys())
1408vocab_files = {}
1409init_configuration = {}
1410if pretrained_model_name_or_path in s3_models:
1411# Get the vocabulary from AWS S3 bucket
1412for file_id, map_list in cls.pretrained_vocab_files_map.items():
1413vocab_files[file_id] = map_list[pretrained_model_name_or_path]
1414if (
1415cls.pretrained_init_configuration
1416and pretrained_model_name_or_path in cls.pretrained_init_configuration
1417):
1418init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
1419else:
1420# Get the vocabulary from local files
1421logger.info(
1422"Model name '{}' not found in model shortcut name list ({}). "
1423"Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format(
1424pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
1425)
1426)
1427
1428if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
1429if len(cls.vocab_files_names) > 1:
1430raise ValueError(
1431"Calling {}.from_pretrained() with the path to a single file or url is not supported."
1432"Use a model identifier or the path to a directory instead.".format(cls.__name__)
1433)
1434logger.warning(
1435"Calling {}.from_pretrained() with the path to a single file or url is deprecated".format(
1436cls.__name__
1437)
1438)
1439file_id = list(cls.vocab_files_names.keys())[0]
1440vocab_files[file_id] = pretrained_model_name_or_path
1441else:
1442# At this point pretrained_model_name_or_path is either a directory or a model identifier name
1443additional_files_names = {
1444"added_tokens_file": ADDED_TOKENS_FILE,
1445"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
1446"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
1447"full_tokenizer_file": FULL_TOKENIZER_FILE,
1448}
1449# Look for the tokenizer files
1450for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
1451if os.path.isdir(pretrained_model_name_or_path):
1452full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
1453if not os.path.exists(full_file_name):
1454logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
1455full_file_name = None
1456else:
1457full_file_name = hf_bucket_url(
1458pretrained_model_name_or_path, filename=file_name, use_cdn=False
1459)
1460
1461vocab_files[file_id] = full_file_name
1462
1463# Get files from url, cache, or disk depending on the case
1464try:
1465resolved_vocab_files = {}
1466for file_id, file_path in vocab_files.items():
1467if file_path is None:
1468resolved_vocab_files[file_id] = None
1469else:
1470resolved_vocab_files[file_id] = cached_path(
1471file_path,
1472cache_dir=cache_dir,
1473force_download=force_download,
1474proxies=proxies,
1475resume_download=resume_download,
1476local_files_only=local_files_only,
1477)
1478except EnvironmentError:
1479if pretrained_model_name_or_path in s3_models:
1480msg = "Couldn't reach server at '{}' to download vocabulary files."
1481else:
1482msg = (
1483"Model name '{}' was not found in tokenizers model name list ({}). "
1484"We assumed '{}' was a path or url to a directory containing vocabulary files "
1485"named {}, but couldn't find such vocabulary files at this path or url.".format(
1486pretrained_model_name_or_path,
1487", ".join(s3_models),
1488pretrained_model_name_or_path,
1489list(cls.vocab_files_names.values()),
1490)
1491)
1492
1493raise EnvironmentError(msg)
1494
1495if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
1496raise EnvironmentError(
1497"Model name '{}' was not found in tokenizers model name list ({}). "
1498"We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files "
1499"named {} but couldn't find such vocabulary files at this path or url.".format(
1500pretrained_model_name_or_path,
1501", ".join(s3_models),
1502pretrained_model_name_or_path,
1503list(cls.vocab_files_names.values()),
1504)
1505)
1506
1507for file_id, file_path in vocab_files.items():
1508if file_path == resolved_vocab_files[file_id]:
1509logger.info("loading file {}".format(file_path))
1510else:
1511logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
1512
1513# Prepare tokenizer initialization kwargs
1514# Did we save some inputs and kwargs to reload?
1515tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
1516if tokenizer_config_file is not None:
1517with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
1518init_kwargs = json.load(tokenizer_config_handle)
1519saved_init_inputs = init_kwargs.pop("init_inputs", ())
1520if not init_inputs:
1521init_inputs = saved_init_inputs
1522else:
1523init_kwargs = init_configuration
1524
1525# Update with newly provided kwargs
1526init_kwargs.update(kwargs)
1527
1528# Set max length if needed
1529if pretrained_model_name_or_path in cls.max_model_input_sizes:
1530# if we're using a pretrained model, ensure the tokenizer
1531# won't index sequences longer than the number of positional embeddings
1532model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
1533if model_max_length is not None and isinstance(model_max_length, (int, float)):
1534init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
1535
1536# Merge resolved_vocab_files arguments in init_kwargs.
1537added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
1538for args_name, file_path in resolved_vocab_files.items():
1539if args_name not in init_kwargs:
1540init_kwargs[args_name] = file_path
1541
1542# Instantiate tokenizer.
1543try:
1544tokenizer = cls(*init_inputs, **init_kwargs)
1545except OSError:
1546raise OSError(
1547"Unable to load vocabulary from file. "
1548"Please check that the provided vocabulary is accessible and not corrupted."
1549)
1550
1551# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
1552tokenizer.init_inputs = init_inputs
1553tokenizer.init_kwargs = init_kwargs
1554
1555# If there is a complementary special token map, load it
1556special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
1557if special_tokens_map_file is not None:
1558with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
1559special_tokens_map = json.load(special_tokens_map_handle)
1560
1561for key, value in special_tokens_map.items():
1562if isinstance(value, dict):
1563value = AddedToken(**value)
1564setattr(tokenizer, key, value)
1565
1566# Add supplementary tokens.
1567special_tokens = tokenizer.all_special_tokens
1568if added_tokens_file is not None:
1569with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
1570added_tok_encoder = json.load(added_tokens_handle)
1571
1572# Sort added tokens by index
1573added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
1574
1575for token, index in added_tok_encoder_sorted:
1576assert index == len(tokenizer), (
1577f"Non-consecutive added token '{token}' found. "
1578f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
1579)
1580tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))
1581
1582# Check that all our special tokens are registered as "no split" tokens (we don't cut them) and are in the vocab
1583added_tokens = tokenizer.sanitize_special_tokens()
1584if added_tokens:
1585logger.warning(
1586"Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained."
1587)
1588
1589return tokenizer
1590
1591def save_pretrained(self, save_directory: str) -> Tuple[str]:
1592"""
1593Save the tokenizer vocabulary files together with:
1594
1595- added tokens,
1596- special tokens to class attributes mapping,
1597- tokenizer instantiation positional and keyword inputs (e.g. do_lower_case for Bert).
1598
1599This method makes sure the full tokenizer can then be re-loaded using the
1600:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` class method.
1601
1602.. Warning::
1603This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
1604modifying :obj:`tokenizer.do_lower_case` after creation).
1605
1606Args:
1607save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
1608
1609Returns:
1610A tuple of :obj:`str`: The files saved.
1611"""
1612if os.path.isfile(save_directory):
1613logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
1614return
1615os.makedirs(save_directory, exist_ok=True)
1616
1617special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
1618added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
1619tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
1620
1621tokenizer_config = copy.deepcopy(self.init_kwargs)
1622if len(self.init_inputs) > 0:
1623tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
1624for file_id in self.vocab_files_names.keys():
1625tokenizer_config.pop(file_id, None)
1626
1627with open(tokenizer_config_file, "w", encoding="utf-8") as f:
1628f.write(json.dumps(tokenizer_config, ensure_ascii=False))
1629
1630with open(special_tokens_map_file, "w", encoding="utf-8") as f:
1631write_dict = {}
1632for key, value in self.special_tokens_map_extended.items():
1633if isinstance(value, AddedToken):
1634write_dict[key] = value.__getstate__()
1635else:
1636write_dict[key] = value
1637f.write(json.dumps(write_dict, ensure_ascii=False))
1638
1639added_vocab = self.get_added_vocab()
1640if added_vocab:
1641with open(added_tokens_file, "w", encoding="utf-8") as f:
1642out_str = json.dumps(added_vocab, ensure_ascii=False)
1643f.write(out_str)
1644
1645vocab_files = self.save_vocabulary(save_directory)
1646
1647return vocab_files + (special_tokens_map_file, added_tokens_file)
1648
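# Usage sketch for `save_pretrained` / `from_pretrained` round-tripping, assuming BertTokenizer and a
# writable local directory './my_tokenizer' (both are illustrative choices, not fixed by this file).
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
saved_files = tokenizer.save_pretrained("./my_tokenizer")
# Typically writes the vocabulary file(s), special_tokens_map.json, tokenizer_config.json and,
# if extra tokens were added, added_tokens.json.
print(saved_files)
reloaded = BertTokenizer.from_pretrained("./my_tokenizer")
assert reloaded.vocab_size == tokenizer.vocab_size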
1649@add_end_docstrings(
1650ENCODE_KWARGS_DOCSTRING,
1651"""
1652**kwargs: Passed along to the `.tokenize()` method.
1653""",
1654"""
1655Returns:
1656:obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`:
1657The tokenized ids of the text.
1658""",
1659)
1660def encode(
1661self,
1662text: Union[TextInput, PreTokenizedInput, EncodedInput],
1663text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
1664add_special_tokens: bool = True,
1665padding: Union[bool, str, PaddingStrategy] = False,
1666truncation: Union[bool, str, TruncationStrategy] = False,
1667max_length: Optional[int] = None,
1668stride: int = 0,
1669return_tensors: Optional[Union[str, TensorType]] = None,
1670**kwargs
1671) -> List[int]:
1672"""
1673Converts a string to a sequence of ids (integers), using the tokenizer and vocabulary.
1674
1675Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
1676
1677Args:
1678text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
1679The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
1680the ``tokenize`` method) or a list of integers (tokenized string ids using the
1681``convert_tokens_to_ids`` method).
1682text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
1683Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
1684string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
1685``convert_tokens_to_ids`` method).
1686"""
1687encoded_inputs = self.encode_plus(
1688text,
1689text_pair=text_pair,
1690add_special_tokens=add_special_tokens,
1691padding=padding,
1692truncation=truncation,
1693max_length=max_length,
1694stride=stride,
1695return_tensors=return_tensors,
1696**kwargs,
1697)
1698
1699return encoded_inputs["input_ids"]
1700
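# Usage sketch for `encode`, assuming a BertTokenizer instance (any concrete subclass works the same way).
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("Hello world")                     # special tokens ([CLS]/[SEP] for BERT) are added by default
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=True))    # back to "hello world" (lower-cased by this checkpoint)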
1701def num_special_tokens_to_add(self, pair: bool = False) -> int:
1702raise NotImplementedError
1703
1704def _get_padding_truncation_strategies(
1705self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
1706):
1707"""
1708Find the correct padding/truncation strategy with backward compatibility
1709for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
1710"""
1711old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
1712old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)
1713
1714# Backward compatibility for previous behavior, maybe we should deprecate it:
1715# If you only set max_length, it activates truncation for max_length
1716if max_length is not None and padding is False and truncation is False:
1717if verbose:
1718logger.warning(
1719"Truncation was not explicitely activated but `max_length` is provided a specific value, "
1720"please use `truncation=True` to explicitely truncate examples to max length. "
1721"Defaulting to 'longest_first' truncation strategy. "
1722"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
1723"more precisely by providing a specific strategy to `truncation`."
1724)
1725truncation = "longest_first"
1726
1727# Get padding strategy
1728if padding is False and old_pad_to_max_length:
1729if verbose:
1730warnings.warn(
1731"The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
1732"use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
1733"use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
1734"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
1735"maximal input size of the model (e.g. 512 for Bert).",
1736FutureWarning,
1737)
1738if max_length is None:
1739padding_strategy = PaddingStrategy.LONGEST
1740else:
1741padding_strategy = PaddingStrategy.MAX_LENGTH
1742elif padding is not False:
1743if padding is True:
1744padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
1745elif not isinstance(padding, PaddingStrategy):
1746padding_strategy = PaddingStrategy(padding)
1747else:
1748padding_strategy = PaddingStrategy.DO_NOT_PAD
1749
1750# Get truncation strategy
1751if truncation is False and old_truncation_strategy != "do_not_truncate":
1752if verbose:
1753warnings.warn(
1754"The `truncation_strategy` argument is deprecated and will be removed in a future version, "
1755"use `truncation=True` to truncate examples to a max length. You can give a specific "
1756"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
1757"maximal input size of the model (e.g. 512 for Bert). "
1758" If you have pairs of inputs, you can give a specific truncation strategy selected among "
1759"`truncation='only_first'` (will only truncate the first sentence in the pairs) "
1760"`truncation='only_second'` (will only truncate the second sentence in the pairs) "
1761"or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
1762FutureWarning,
1763)
1764truncation_strategy = TruncationStrategy(old_truncation_strategy)
1765elif truncation is not False:
1766if truncation is True:
1767truncation_strategy = (
1768TruncationStrategy.LONGEST_FIRST
1769) # Default to truncate the longest sequences in pairs of inputs
1770elif not isinstance(truncation, TruncationStrategy):
1771truncation_strategy = TruncationStrategy(truncation)
1772else:
1773truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1774
1775# Set max length if needed
1776if max_length is None:
1777if padding_strategy == PaddingStrategy.MAX_LENGTH:
1778if self.model_max_length > LARGE_INTEGER:
1779if verbose:
1780logger.warning(
1781"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
1782"Default to no padding."
1783)
1784padding_strategy = PaddingStrategy.DO_NOT_PAD
1785else:
1786max_length = self.model_max_length
1787
1788if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
1789if self.model_max_length > LARGE_INTEGER:
1790if verbose:
1791logger.warning(
1792"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
1793"Default to no truncation."
1794)
1795truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1796else:
1797max_length = self.model_max_length
1798
1799# Test if we have a padding token
1800if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
1801raise ValueError(
1802"Asking to pad but the tokenizer does not have a padding token. "
1803"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
1804"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
1805)
1806
1807# Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
1808if (
1809truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
1810and padding_strategy != PaddingStrategy.DO_NOT_PAD
1811and pad_to_multiple_of is not None
1812and max_length is not None
1813and (max_length % pad_to_multiple_of != 0)
1814):
1815raise ValueError(
1816f"Truncation and padding are both activated but "
1817f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
1818)
1819
1820return padding_strategy, truncation_strategy, max_length, kwargs
1821
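# Sketch of how the public `padding`/`truncation`/`max_length` arguments are resolved into strategies.
# This calls the private helper above directly, so it is purely illustrative and may change between
# versions; it assumes a concrete tokenizer instance such as BertTokenizer.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
padding_strategy, truncation_strategy, max_length, _ = tokenizer._get_padding_truncation_strategies(padding="max_length", truncation=True, max_length=32)
# Expected resolution: PaddingStrategy.MAX_LENGTH, TruncationStrategy.LONGEST_FIRST, 32
print(padding_strategy, truncation_strategy, max_length)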
1822@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
1823def __call__(
1824self,
1825text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
1826text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
1827add_special_tokens: bool = True,
1828padding: Union[bool, str, PaddingStrategy] = False,
1829truncation: Union[bool, str, TruncationStrategy] = False,
1830max_length: Optional[int] = None,
1831stride: int = 0,
1832is_pretokenized: bool = False,
1833pad_to_multiple_of: Optional[int] = None,
1834return_tensors: Optional[Union[str, TensorType]] = None,
1835return_token_type_ids: Optional[bool] = None,
1836return_attention_mask: Optional[bool] = None,
1837return_overflowing_tokens: bool = False,
1838return_special_tokens_mask: bool = False,
1839return_offsets_mapping: bool = False,
1840return_length: bool = False,
1841verbose: bool = True,
1842**kwargs
1843) -> BatchEncoding:
1844"""
1845Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
1846sequences.
1847
1848Args:
1849text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
1850The sequence or batch of sequences to be encoded.
1851Each sequence can be a string or a list of strings (pretokenized string).
1852If the sequences are provided as list of strings (pretokenized), you must set
1853:obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
1854text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
1855The sequence or batch of sequences to be encoded.
1856Each sequence can be a string or a list of strings (pretokenized string).
1857If the sequences are provided as list of strings (pretokenized), you must set
1858:obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
1859"""
1860# Input type checking for clearer error
1861assert isinstance(text, str) or (
1862isinstance(text, (list, tuple))
1863and (
1864len(text) == 0
1865or (
1866isinstance(text[0], str)
1867or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
1868)
1869)
1870), (
1871"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
1872"or `List[List[str]]` (batch of pretokenized examples)."
1873)
1874
1875assert (
1876text_pair is None
1877or isinstance(text_pair, str)
1878or (
1879isinstance(text_pair, (list, tuple))
1880and (
1881len(text_pair) == 0
1882or (
1883isinstance(text_pair[0], str)
1884or (
1885isinstance(text_pair[0], (list, tuple))
1886and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
1887)
1888)
1889)
1890)
1891), (
1892"text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
1893"or `List[List[str]]` (batch of pretokenized examples)."
1894)
1895
1896is_batched = bool(
1897(not is_pretokenized and isinstance(text, (list, tuple)))
1898or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))
1899)
1900
1901if is_batched:
1902batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
1903return self.batch_encode_plus(
1904batch_text_or_text_pairs=batch_text_or_text_pairs,
1905add_special_tokens=add_special_tokens,
1906padding=padding,
1907truncation=truncation,
1908max_length=max_length,
1909stride=stride,
1910is_pretokenized=is_pretokenized,
1911pad_to_multiple_of=pad_to_multiple_of,
1912return_tensors=return_tensors,
1913return_token_type_ids=return_token_type_ids,
1914return_attention_mask=return_attention_mask,
1915return_overflowing_tokens=return_overflowing_tokens,
1916return_special_tokens_mask=return_special_tokens_mask,
1917return_offsets_mapping=return_offsets_mapping,
1918return_length=return_length,
1919verbose=verbose,
1920**kwargs,
1921)
1922else:
1923return self.encode_plus(
1924text=text,
1925text_pair=text_pair,
1926add_special_tokens=add_special_tokens,
1927padding=padding,
1928truncation=truncation,
1929max_length=max_length,
1930stride=stride,
1931is_pretokenized=is_pretokenized,
1932pad_to_multiple_of=pad_to_multiple_of,
1933return_tensors=return_tensors,
1934return_token_type_ids=return_token_type_ids,
1935return_attention_mask=return_attention_mask,
1936return_overflowing_tokens=return_overflowing_tokens,
1937return_special_tokens_mask=return_special_tokens_mask,
1938return_offsets_mapping=return_offsets_mapping,
1939return_length=return_length,
1940verbose=verbose,
1941**kwargs,
1942)
1943
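# Usage sketch for the main `__call__` entry point on a small batch, assuming BertTokenizer.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer(["a short sentence", "a slightly longer second sentence"], padding=True, truncation=True, max_length=16)
# All sequences are padded to the longest one in the batch; attention_mask marks real tokens with 1.
assert len(batch["input_ids"][0]) == len(batch["input_ids"][1])
print(batch["attention_mask"])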
1944@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
1945def encode_plus(
1946self,
1947text: Union[TextInput, PreTokenizedInput, EncodedInput],
1948text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
1949add_special_tokens: bool = True,
1950padding: Union[bool, str, PaddingStrategy] = False,
1951truncation: Union[bool, str, TruncationStrategy] = False,
1952max_length: Optional[int] = None,
1953stride: int = 0,
1954is_pretokenized: bool = False,
1955pad_to_multiple_of: Optional[int] = None,
1956return_tensors: Optional[Union[str, TensorType]] = None,
1957return_token_type_ids: Optional[bool] = None,
1958return_attention_mask: Optional[bool] = None,
1959return_overflowing_tokens: bool = False,
1960return_special_tokens_mask: bool = False,
1961return_offsets_mapping: bool = False,
1962return_length: bool = False,
1963verbose: bool = True,
1964**kwargs
1965) -> BatchEncoding:
1966"""
1967Tokenize and prepare for the model a sequence or a pair of sequences.
1968
1969.. warning::
1970This method is deprecated, ``__call__`` should be used instead.
1971
1972Args:
1973text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
1974The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
1975the ``tokenize`` method) or a list of integers (tokenized string ids using the
1976``convert_tokens_to_ids`` method).
1977text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
1978Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
1979string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
1980``convert_tokens_to_ids`` method).
1981"""
1982
1983# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
1984padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
1985padding=padding,
1986truncation=truncation,
1987max_length=max_length,
1988pad_to_multiple_of=pad_to_multiple_of,
1989verbose=verbose,
1990**kwargs,
1991)
1992
1993return self._encode_plus(
1994text=text,
1995text_pair=text_pair,
1996add_special_tokens=add_special_tokens,
1997padding_strategy=padding_strategy,
1998truncation_strategy=truncation_strategy,
1999max_length=max_length,
2000stride=stride,
2001is_pretokenized=is_pretokenized,
2002pad_to_multiple_of=pad_to_multiple_of,
2003return_tensors=return_tensors,
2004return_token_type_ids=return_token_type_ids,
2005return_attention_mask=return_attention_mask,
2006return_overflowing_tokens=return_overflowing_tokens,
2007return_special_tokens_mask=return_special_tokens_mask,
2008return_offsets_mapping=return_offsets_mapping,
2009return_length=return_length,
2010verbose=verbose,
2011**kwargs,
2012)
2013
2014def _encode_plus(
2015self,
2016text: Union[TextInput, PreTokenizedInput, EncodedInput],
2017text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
2018add_special_tokens: bool = True,
2019padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2020truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
2021max_length: Optional[int] = None,
2022stride: int = 0,
2023is_pretokenized: bool = False,
2024pad_to_multiple_of: Optional[int] = None,
2025return_tensors: Optional[Union[str, TensorType]] = None,
2026return_token_type_ids: Optional[bool] = None,
2027return_attention_mask: Optional[bool] = None,
2028return_overflowing_tokens: bool = False,
2029return_special_tokens_mask: bool = False,
2030return_offsets_mapping: bool = False,
2031return_length: bool = False,
2032verbose: bool = True,
2033**kwargs
2034) -> BatchEncoding:
2035raise NotImplementedError
2036
2037@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
2038def batch_encode_plus(
2039self,
2040batch_text_or_text_pairs: Union[
2041List[TextInput],
2042List[TextInputPair],
2043List[PreTokenizedInput],
2044List[PreTokenizedInputPair],
2045List[EncodedInput],
2046List[EncodedInputPair],
2047],
2048add_special_tokens: bool = True,
2049padding: Union[bool, str, PaddingStrategy] = False,
2050truncation: Union[bool, str, TruncationStrategy] = False,
2051max_length: Optional[int] = None,
2052stride: int = 0,
2053is_pretokenized: bool = False,
2054pad_to_multiple_of: Optional[int] = None,
2055return_tensors: Optional[Union[str, TensorType]] = None,
2056return_token_type_ids: Optional[bool] = None,
2057return_attention_mask: Optional[bool] = None,
2058return_overflowing_tokens: bool = False,
2059return_special_tokens_mask: bool = False,
2060return_offsets_mapping: bool = False,
2061return_length: bool = False,
2062verbose: bool = True,
2063**kwargs
2064) -> BatchEncoding:
2065"""
2066Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
2067
2068.. warning::
2069This method is deprecated, ``__call__`` should be used instead.
2070
2071Args:
2072batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
2073Batch of sequences or pair of sequences to be encoded.
2074This can be a list of string/string-sequences/int-sequences or a list of pair of
2075string/string-sequences/int-sequence (see details in ``encode_plus``).
2076"""
2077
2078# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2079padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
2080padding=padding,
2081truncation=truncation,
2082max_length=max_length,
2083pad_to_multiple_of=pad_to_multiple_of,
2084verbose=verbose,
2085**kwargs,
2086)
2087
2088return self._batch_encode_plus(
2089batch_text_or_text_pairs=batch_text_or_text_pairs,
2090add_special_tokens=add_special_tokens,
2091padding_strategy=padding_strategy,
2092truncation_strategy=truncation_strategy,
2093max_length=max_length,
2094stride=stride,
2095is_pretokenized=is_pretokenized,
2096pad_to_multiple_of=pad_to_multiple_of,
2097return_tensors=return_tensors,
2098return_token_type_ids=return_token_type_ids,
2099return_attention_mask=return_attention_mask,
2100return_overflowing_tokens=return_overflowing_tokens,
2101return_special_tokens_mask=return_special_tokens_mask,
2102return_offsets_mapping=return_offsets_mapping,
2103return_length=return_length,
2104verbose=verbose,
2105**kwargs,
2106)
2107
2108def _batch_encode_plus(
2109self,
2110batch_text_or_text_pairs: Union[
2111List[TextInput],
2112List[TextInputPair],
2113List[PreTokenizedInput],
2114List[PreTokenizedInputPair],
2115List[EncodedInput],
2116List[EncodedInputPair],
2117],
2118add_special_tokens: bool = True,
2119padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2120truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
2121max_length: Optional[int] = None,
2122stride: int = 0,
2123is_pretokenized: bool = False,
2124pad_to_multiple_of: Optional[int] = None,
2125return_tensors: Optional[Union[str, TensorType]] = None,
2126return_token_type_ids: Optional[bool] = None,
2127return_attention_mask: Optional[bool] = None,
2128return_overflowing_tokens: bool = False,
2129return_special_tokens_mask: bool = False,
2130return_offsets_mapping: bool = False,
2131return_length: bool = False,
2132verbose: bool = True,
2133**kwargs
2134) -> BatchEncoding:
2135raise NotImplementedError
2136
2137def pad(
2138self,
2139encoded_inputs: Union[
2140BatchEncoding,
2141List[BatchEncoding],
2142Dict[str, EncodedInput],
2143Dict[str, List[EncodedInput]],
2144List[Dict[str, EncodedInput]],
2145],
2146padding: Union[bool, str, PaddingStrategy] = True,
2147max_length: Optional[int] = None,
2148pad_to_multiple_of: Optional[int] = None,
2149return_attention_mask: Optional[bool] = None,
2150return_tensors: Optional[Union[str, TensorType]] = None,
2151verbose: bool = True,
2152) -> BatchEncoding:
2153"""
2154Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
2155in the batch.
2156
2157The padding side (left/right) and the padding token ids are defined at the tokenizer level
2158(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``).
2159
2160Args:
2161encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
2162Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
2163:obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
2164:class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so
2165you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.
2166padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
2167Select a strategy to pad the returned sequences (according to the model's padding side and padding
2168index) among:
2169
2170* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
2171single sequence is provided).
2172* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
2173maximum acceptable input length for the model if that argument is not provided.
2174* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
2175different lengths).
2176max_length (:obj:`int`, `optional`):
2177Maximum length of the returned list and optionally padding length (see above).
2178pad_to_multiple_of (:obj:`int`, `optional`):
2179If set will pad the sequence to a multiple of the provided value.
2180
2181This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
2182>= 7.0 (Volta).
2183return_attention_mask (:obj:`bool`, `optional`):
2184Whether to return the attention mask. If left to the default, will return the attention mask according
2185to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
2186
2187`What are attention masks? <../glossary.html#attention-mask>`__
2188return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
2189If set, will return tensors instead of list of python integers. Acceptable values are:
2190
2191* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
2192* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
2193* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
2194verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
2195Whether or not to print information and warnings.
2196"""
2197# If we have a list of dicts, let's convert it in a dict of lists
2198if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
2199encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
2200
2201assert "input_ids" in encoded_inputs, (
2202"You should supply an encoding or a list of encodings to this method. "
2203"An encoding is the output of one the encoding methods of the tokenizer, i.e. "
2204"__call__/encode_plus/batch_encode_plus. "
2205)
2206
2207if not encoded_inputs["input_ids"]:
2208if return_attention_mask:
2209encoded_inputs["attention_mask"] = []
2210return encoded_inputs
2211
2212# Convert padding_strategy in PaddingStrategy
2213padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
2214padding=padding, max_length=max_length, verbose=verbose
2215)
2216
2217if encoded_inputs["input_ids"] and not isinstance(encoded_inputs["input_ids"][0], (list, tuple)):
2218encoded_inputs = self._pad(
2219encoded_inputs,
2220max_length=max_length,
2221padding_strategy=padding_strategy,
2222pad_to_multiple_of=pad_to_multiple_of,
2223return_attention_mask=return_attention_mask,
2224)
2225return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
2226
2227batch_size = len(encoded_inputs["input_ids"])
2228assert all(
2229len(v) == batch_size for v in encoded_inputs.values()
2230), "Some items in the output dictionnary have a different batch size than others."
2231
2232if padding_strategy == PaddingStrategy.LONGEST:
2233max_length = max(len(inputs) for inputs in encoded_inputs["input_ids"])
2234padding_strategy = PaddingStrategy.MAX_LENGTH
2235
2236batch_outputs = {}
2237for i in range(batch_size):
2238inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
2239outputs = self._pad(
2240inputs,
2241max_length=max_length,
2242padding_strategy=padding_strategy,
2243pad_to_multiple_of=pad_to_multiple_of,
2244return_attention_mask=return_attention_mask,
2245)
2246
2247for key, value in outputs.items():
2248if key not in batch_outputs:
2249batch_outputs[key] = []
2250batch_outputs[key].append(value)
2251
2252return BatchEncoding(batch_outputs, tensor_type=return_tensors)
2253
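# Usage sketch for `pad` as a collate step: tokenize without padding first, then pad a list of encodings
# to a common length in a single call (e.g., inside a PyTorch DataLoader collate_fn). Assumes BertTokenizer.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
features = [tokenizer("a short example"), tokenizer("a noticeably longer example sentence here")]
batch = tokenizer.pad(features, padding="longest")
assert len(set(len(ids) for ids in batch["input_ids"])) == 1  # every sequence now has the same length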
2254def create_token_type_ids_from_sequences(
2255self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
2256) -> List[int]:
2257"""
2258Create the token type IDs corresponding to the sequences passed.
2259`What are token type IDs? <../glossary.html#token-type-ids>`__
2260
2261Should be overridden in a subclass if the model has a special way of building those.
2262
2263Args:
2264token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
2265token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
2266
2267Returns:
2268:obj:`List[int]`: The token type ids.
2269"""
2270if token_ids_1 is None:
2271return len(token_ids_0) * [0]
2272return [0] * len(token_ids_0) + [1] * len(token_ids_1)
2273
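# Sketch of the base-class behavior above: without special tokens, the first segment gets 0s and the
# second gets 1s. Concrete subclasses (e.g. BertTokenizer) extend this to cover their special tokens.
token_ids_0 = [5, 6, 7]
token_ids_1 = [8, 9]
token_type_ids = [0] * len(token_ids_0) + [1] * len(token_ids_1)
assert token_type_ids == [0, 0, 0, 1, 1]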
2274def build_inputs_with_special_tokens(
2275self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
2276) -> List[int]:
2277"""
2278Build model inputs from a sequence or a pair of sequence for sequence classification tasks
2279by concatenating and adding special tokens.
2280
2281This implementation does not add special tokens and this method should be overridden in a subclass.
2282
2283Args:
2284token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
2285token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
2286
2287Returns:
2288:obj:`List[int]`: The model input with special tokens.
2289"""
2290if token_ids_1 is None:
2291return token_ids_0
2292return token_ids_0 + token_ids_1
2293
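# Usage sketch: a concrete subclass overrides the two methods above to insert its special tokens.
# A BERT-style tokenizer (used here as an illustrative assumption, not mandated by this base class)
# produces [CLS] A [SEP] B [SEP] together with matching segment ids.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("first sentence"))
ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("second one"))
input_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
assert len(input_ids) == len(token_type_ids) == len(ids_a) + len(ids_b) + 3  # [CLS] plus two [SEP]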
2294@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
2295def prepare_for_model(
2296self,
2297ids: List[int],
2298pair_ids: Optional[List[int]] = None,
2299add_special_tokens: bool = True,
2300padding: Union[bool, str, PaddingStrategy] = False,
2301truncation: Union[bool, str, TruncationStrategy] = False,
2302max_length: Optional[int] = None,
2303stride: int = 0,
2304pad_to_multiple_of: Optional[int] = None,
2305return_tensors: Optional[Union[str, TensorType]] = None,
2306return_token_type_ids: Optional[bool] = None,
2307return_attention_mask: Optional[bool] = None,
2308return_overflowing_tokens: bool = False,
2309return_special_tokens_mask: bool = False,
2310return_offsets_mapping: bool = False,
2311return_length: bool = False,
2312verbose: bool = True,
2313prepend_batch_axis: bool = False,
2314**kwargs
2315) -> BatchEncoding:
2316"""
2317Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
2318It adds special tokens, truncates sequences if they overflow while taking the special tokens into account, and
2319manages a moving window (with a user-defined stride) for overflowing tokens.
2320
2321Args:
2322ids (:obj:`List[int]`):
2323Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
2324``tokenize`` and ``convert_tokens_to_ids`` methods.
2325pair_ids (:obj:`List[int]`, `optional`):
2326Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
2327``tokenize`` and ``convert_tokens_to_ids`` methods.
2328"""
2329
2330if "return_lengths" in kwargs:
2331if verbose:
2332warnings.warn(
2333"The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. "
2334"Please use `return_length` instead.",
2335FutureWarning,
2336)
2337return_length = kwargs["return_lengths"]
2338
2339# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2340padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
2341padding=padding,
2342truncation=truncation,
2343max_length=max_length,
2344pad_to_multiple_of=pad_to_multiple_of,
2345verbose=verbose,
2346**kwargs,
2347)
2348
2349pair = bool(pair_ids is not None)
2350len_ids = len(ids)
2351len_pair_ids = len(pair_ids) if pair else 0
2352
2353# Load from model defaults
2354if return_token_type_ids is None:
2355return_token_type_ids = "token_type_ids" in self.model_input_names
2356if return_attention_mask is None:
2357return_attention_mask = "attention_mask" in self.model_input_names
2358
2359encoded_inputs = {}
2360
2361# Compute the total size of the returned encodings
2362total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
2363
2364# Truncation: Handle max sequence length
2365if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
2366ids, pair_ids, overflowing_tokens = self.truncate_sequences(
2367ids,
2368pair_ids=pair_ids,
2369num_tokens_to_remove=total_len - max_length,
2370truncation_strategy=truncation_strategy,
2371stride=stride,
2372)
2373if return_overflowing_tokens:
2374encoded_inputs["overflowing_tokens"] = overflowing_tokens
2375encoded_inputs["num_truncated_tokens"] = total_len - max_length
2376
2377# Add special tokens
2378if add_special_tokens:
2379sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
2380token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
2381else:
2382sequence = ids + pair_ids if pair else ids
2383token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
2384
2385# Build output dictionary
2386encoded_inputs["input_ids"] = sequence
2387if return_token_type_ids:
2388encoded_inputs["token_type_ids"] = token_type_ids
2389if return_special_tokens_mask:
2390if add_special_tokens:
2391encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
2392else:
2393encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
2394
2395# Check lengths
2396if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
2397logger.warning(
2398"Token indices sequence length is longer than the specified maximum sequence length "
2399"for this model ({} > {}). Running this sequence through the model will result in "
2400"indexing errors".format(len(ids), self.model_max_length)
2401)
2402
2403# Padding
2404if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
2405encoded_inputs = self.pad(
2406encoded_inputs,
2407max_length=max_length,
2408padding=padding_strategy.value,
2409pad_to_multiple_of=pad_to_multiple_of,
2410return_attention_mask=return_attention_mask,
2411)
2412
2413if return_length:
2414encoded_inputs["length"] = len(encoded_inputs["input_ids"])
2415
2416batch_outputs = BatchEncoding(
2417encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
2418)
2419
2420return batch_outputs
2421
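# Usage sketch for `prepare_for_model` on already-converted ids, assuming BertTokenizer.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("hello world"))
encoded = tokenizer.prepare_for_model(ids, add_special_tokens=True, return_special_tokens_mask=True)
# input_ids now carries the special tokens; the mask flags them with 1.
assert encoded["special_tokens_mask"][0] == 1 and encoded["special_tokens_mask"][-1] == 1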
2422def truncate_sequences(
2423self,
2424ids: List[int],
2425pair_ids: Optional[List[int]] = None,
2426num_tokens_to_remove: int = 0,
2427truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
2428stride: int = 0,
2429) -> Tuple[List[int], List[int], List[int]]:
2430"""
2431Truncates a sequence pair following the given strategy.
2432
2433Args:
2434ids (:obj:`List[int]`):
2435Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
2436``tokenize`` and ``convert_tokens_to_ids`` methods.
2437pair_ids (:obj:`List[int]`, `optional`):
2438Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
2439``tokenize`` and ``convert_tokens_to_ids`` methods.
2440num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
2441Number of tokens to remove using the truncation strategy.
2442truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`'longest_first'`):
2443The strategy to follow for truncation. Can be:
2444
2445* :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
2446:obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
2447provided. This will truncate token by token, removing a token from the longest sequence in the pair
2448if a pair of sequences (or a batch of pairs) is provided.
2449* :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
2450the maximum acceptable input length for the model if that argument is not provided. This will only
2451truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
2452* :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
2453to the maximum acceptable input length for the model if that argument is not provided. This will only
2454truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
2455* :obj:`'do_not_truncate'`: No truncation (i.e., can output batch with
2456sequence lengths greater than the model maximum admissible input size).
2457max_length (:obj:`int`, `optional`):
2458Controls the maximum length to use by one of the truncation/padding parameters.
2459
2460If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
2461length is required by one of the truncation/padding parameters. If the model has no specific maximum
2462input length (like XLNet) truncation/padding to a maximum length will be deactivated.
2463stride (:obj:`int`, `optional`, defaults to 0):
2464If set to a positive number, the overflowing tokens returned will contain some tokens
2465from the main sequence returned. The value of this argument defines the number of additional tokens.
2466
2467Returns:
2468:obj:`Tuple[List[int], List[int], List[int]]`:
2469The truncated ``ids``, the truncated ``pair_ids`` and the list of overflowing tokens.
2470"""
2471if num_tokens_to_remove <= 0:
2472return ids, pair_ids, []
2473
2474if not isinstance(truncation_strategy, TruncationStrategy):
2475truncation_strategy = TruncationStrategy(truncation_strategy)
2476
2477overflowing_tokens = []
2478if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
2479for _ in range(num_tokens_to_remove):
2480if pair_ids is None or len(ids) > len(pair_ids):
2481if not overflowing_tokens:
2482window_len = min(len(ids), stride + 1)
2483else:
2484window_len = 1
2485overflowing_tokens.extend(ids[-window_len:])
2486ids = ids[:-1]
2487else:
2488if not overflowing_tokens:
2489window_len = min(len(pair_ids), stride + 1)
2490else:
2491window_len = 1
2492overflowing_tokens.extend(pair_ids[-window_len:])
2493pair_ids = pair_ids[:-1]
2494elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
2495if len(ids) > num_tokens_to_remove:
2496window_len = min(len(ids), stride + num_tokens_to_remove)
2497overflowing_tokens = ids[-window_len:]
2498ids = ids[:-num_tokens_to_remove]
2499else:
2500logger.error(
2501f"We need to remove {num_tokens_to_remove} to truncate the input"
2502f"but the first sequence has a length {len(ids)}. "
2503f"Please select another truncation strategy than {truncation_strategy}, "
2504f"for instance 'longest_first' or 'only_second'."
2505)
2506elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
2507if len(pair_ids) > num_tokens_to_remove:
2508window_len = min(len(pair_ids), stride + num_tokens_to_remove)
2509overflowing_tokens = pair_ids[-window_len:]
2510pair_ids = pair_ids[:-num_tokens_to_remove]
2511else:
2512logger.error(
2513f"We need to remove {num_tokens_to_remove} to truncate the input"
2514f"but the second sequence has a length {len(pair_ids)}. "
2515f"Please select another truncation strategy than {truncation_strategy}, "
2516f"for instance 'longest_first' or 'only_first'."
2517)
2518
2519return (ids, pair_ids, overflowing_tokens)
2520
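# Usage sketch for `truncate_sequences` with the 'only_first' strategy and a stride of 1, using plain
# integer ids; a BertTokenizer instance is assumed only as a convenient way to reach the method.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = list(range(10))
pair_ids = [100, 101, 102]
new_ids, new_pair_ids, overflow = tokenizer.truncate_sequences(ids, pair_ids=pair_ids, num_tokens_to_remove=3, truncation_strategy="only_first", stride=1)
assert new_ids == ids[:-3] and new_pair_ids == pair_ids
assert overflow == ids[-4:]  # the 3 removed tokens plus 1 stride token kept for context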
2521def _pad(
2522self,
2523encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
2524max_length: Optional[int] = None,
2525padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2526pad_to_multiple_of: Optional[int] = None,
2527return_attention_mask: Optional[bool] = None,
2528) -> dict:
2529"""
2530Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
2531
2532Args:
2533encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
2534max_length: maximum length of the returned list and optionally padding length (see below).
2535Will truncate by taking into account the special tokens.
2536padding_strategy: PaddingStrategy to use for padding.
2537- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
2538- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
2539- PaddingStrategy.DO_NOT_PAD: Do not pad
2540The tokenizer padding sides are defined in self.padding_side:
2541- 'left': pads on the left of the sequences
2542- 'right': pads on the right of the sequences
2543pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
2544This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
2545>= 7.0 (Volta).
2546return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
2547"""
2548# Load from model defaults
2549if return_attention_mask is None:
2550return_attention_mask = "attention_mask" in self.model_input_names
2551
2552if padding_strategy == PaddingStrategy.LONGEST:
2553max_length = len(encoded_inputs["input_ids"])
2554
2555if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
2556max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
2557
2558needs_to_be_padded = (
2559padding_strategy != PaddingStrategy.DO_NOT_PAD and len(encoded_inputs["input_ids"]) != max_length
2560)
2561
2562if needs_to_be_padded:
2563difference = max_length - len(encoded_inputs["input_ids"])
2564if self.padding_side == "right":
2565if return_attention_mask:
2566encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
2567if "token_type_ids" in encoded_inputs:
2568encoded_inputs["token_type_ids"] = (
2569encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
2570)
2571if "special_tokens_mask" in encoded_inputs:
2572encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
2573encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
2574elif self.padding_side == "left":
2575if return_attention_mask:
2576encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
2577if "token_type_ids" in encoded_inputs:
2578encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
2579"token_type_ids"
2580]
2581if "special_tokens_mask" in encoded_inputs:
2582encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
2583encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
2584else:
2585raise ValueError("Invalid padding strategy:" + str(self.padding_side))
2586else:
2587if return_attention_mask:
2588encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
2589
2590return encoded_inputs
2591
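# Worked example of the `pad_to_multiple_of` rounding used above: a target length that is not already a
# multiple gets bumped up to the next one (here 10 -> 16 for a multiple of 8), which keeps tensor shapes
# friendly to Tensor Cores. Pure arithmetic, no tokenizer needed.
max_length, pad_to_multiple_of = 10, 8
padded_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of if max_length % pad_to_multiple_of else max_length
assert padded_length == 16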
2592def batch_decode(
2593self, sequences: List[List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
2594) -> List[str]:
2595"""
2596Convert a list of lists of token ids into a list of strings by calling decode.
2597
2598Args:
2599sequences (:obj:`List[List[int]]`):
2600List of tokenized input ids. Can be obtained using the ``__call__`` method.
2601skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
2602Whether or not to remove special tokens in the decoding.
2603clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
2604Whether or not to clean up the tokenization spaces.
2605
2606Returns:
2607:obj:`List[str]`: The list of decoded sentences.
2608"""
2609return [
2610self.decode(
2611seq, skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=clean_up_tokenization_spaces
2612)
2613for seq in sequences
2614]
2615
2616def decode(
2617self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
2618) -> str:
2619"""
2620Converts a sequence of ids into a string, using the tokenizer and vocabulary
2621with options to remove special tokens and clean up tokenization spaces.
2622
2623Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
2624
2625Args:
2626token_ids (:obj:`List[int]`):
2627List of tokenized input ids. Can be obtained using the ``__call__`` method.
2628skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
2629Whether or not to remove special tokens in the decoding.
2630clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
2631Whether or not to clean up the tokenization spaces.
2632
2633Returns:
2634:obj:`str`: The decoded sentence.
2635"""
2636raise NotImplementedError
2637
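# Usage sketch for `decode` and `batch_decode`, assuming BertTokenizer (any concrete subclass implements `decode`).
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("Tokenizers are fun")
print(tokenizer.decode(ids))                               # includes the [CLS] / [SEP] markers
print(tokenizer.decode(ids, skip_special_tokens=True))     # just the text
print(tokenizer.batch_decode([ids, ids], skip_special_tokens=True))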
2638def get_special_tokens_mask(
2639self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
2640) -> List[int]:
2641"""
2642Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
2643special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
2644
2645Args:
2646token_ids_0 (:obj:`List[int]`):
2647List of ids of the first sequence.
2648token_ids_1 (:obj:`List[int]`, `optional`):
2649List of ids of the second sequence.
2650already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
2651Whether or not the token list is already formatted with special tokens for the model.
2652
2653Returns:
2654A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
2655"""
2656assert already_has_special_tokens and token_ids_1 is None, (
2657"You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
2658"Please use a slow (full python) tokenizer to activate this argument."
2659"Or set `return_special_token_mask=True` when calling the encoding method "
2660"to get the special tokens mask in any tokenizer. "
2661)
2662
2663all_special_ids = self.all_special_ids # cache the property
2664
2665special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]
2666
2667return special_tokens_mask
2668
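# Usage sketch for `get_special_tokens_mask` over an already-encoded sequence, assuming BertTokenizer.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("hello world")  # [CLS] ... [SEP] for BERT
mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
assert mask[0] == 1 and mask[-1] == 1 and sum(mask) == 2  # only the two special tokens are flagged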
2669@staticmethod
2670def clean_up_tokenization(out_string: str) -> str:
2671"""
2672Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.
2673
2674Args:
2675out_string (:obj:`str`): The text to clean up.
2676
2677Returns:
2678:obj:`str`: The cleaned-up string.
2679"""
2680out_string = (
2681out_string.replace(" .", ".")
2682.replace(" ?", "?")
2683.replace(" !", "!")
2684.replace(" ,", ",")
2685.replace(" ' ", "'")
2686.replace(" n't", "n't")
2687.replace(" 'm", "'m")
2688.replace(" 's", "'s")
2689.replace(" 've", "'ve")
2690.replace(" 're", "'re")
2691)
2692return out_string
2693
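# Usage sketch for `clean_up_tokenization`: a plain string-to-string static helper, so it can be called
# on the class itself (BertTokenizer is used here only as a convenient concrete subclass).
from transformers import BertTokenizer

raw = "I do n't like spaces before punctuation , do you ?"
print(BertTokenizer.clean_up_tokenization(raw))
# -> "I don't like spaces before punctuation, do you?"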