"""
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (hosts all the
user-facing encoding methods), SpecialTokensMixin (hosts the special tokens logic) and BatchEncoding (wraps the
dictionary of outputs with special methods for the fast tokenizers).
"""
import importlib.util
import logging
import re
import warnings
from collections import OrderedDict, UserDict
from collections.abc import Mapping
from contextlib import contextmanager
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, NamedTuple, Optional, Union

import numpy as np

# Module-level logger (the original library routes this through its own logging utilities).
logger = logging.getLogger(__name__)


def _is_numpy(x):
    return isinstance(x, np.ndarray)
def is_sentencepiece_available():
    return importlib.util.find_spec("sentencepiece") is not None


def is_tokenizers_available():
    return importlib.util.find_spec("tokenizers") is not None
if is_tokenizers_available():
    from tokenizers import AddedToken
else:

    @dataclass(frozen=True, eq=True)
    class AddedToken:
        """
        AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
        way it should behave.
        """

        content: str = field(default_factory=str)
        single_word: bool = False
        lstrip: bool = False
        rstrip: bool = False
        normalized: bool = True

        def __getstate__(self):
            return self.__dict__
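# A minimal sketch of the fallback dataclass in use (the `tokenizers` backend
# class accepts the same keyword arguments):
#
#     >>> mask = AddedToken("[MASK]", single_word=True, normalized=False)
#     >>> mask.content, mask.single_word
#     ('[MASK]', True)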
TOKENIZER_MAPPING_NAMES = OrderedDict(
    [
        (
            "albert",
            (
                "AlbertTokenizer" if is_sentencepiece_available() else None,
                "AlbertTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("bart", ("BartTokenizer", "BartTokenizerFast")),
        (
            "barthez",
            (
                "BarthezTokenizer" if is_sentencepiece_available() else None,
                "BarthezTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("bartpho", ("BartphoTokenizer", None)),
        ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
        ("bert-japanese", ("BertJapaneseTokenizer", None)),
        ("bertweet", ("BertweetTokenizer", None)),
        (
            "big_bird",
            (
                "BigBirdTokenizer" if is_sentencepiece_available() else None,
                "BigBirdTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
        ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
        ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
        ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
        ("byt5", ("ByT5Tokenizer", None)),
        (
            "camembert",
            (
                "CamembertTokenizer" if is_sentencepiece_available() else None,
                "CamembertTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("canine", ("CanineTokenizer", None)),
        (
            "clip",
            (
                "CLIPTokenizer",
                "CLIPTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
        (
            "cpm",
            (
                "CpmTokenizer" if is_sentencepiece_available() else None,
                "CpmTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("ctrl", ("CTRLTokenizer", None)),
        ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
        ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "deberta-v2",
            (
                "DebertaV2Tokenizer" if is_sentencepiece_available() else None,
                "DebertaV2TokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
        (
            "dpr",
            (
                "DPRQuestionEncoderTokenizer",
                "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
        ("flaubert", ("FlaubertTokenizer", None)),
        ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
        ("fsmt", ("FSMTTokenizer", None)),
        ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
        ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
        ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
        ("hubert", ("Wav2Vec2CTCTokenizer", None)),
        ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
        ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
        ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
        ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
        (
            "longt5",
            (
                "T5Tokenizer" if is_sentencepiece_available() else None,
                "T5TokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("luke", ("LukeTokenizer", None)),
        ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
        ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
        ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
        (
            "mbart",
            (
                "MBartTokenizer" if is_sentencepiece_available() else None,
                "MBartTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        (
            "mbart50",
            (
                "MBart50Tokenizer" if is_sentencepiece_available() else None,
                "MBart50TokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
        ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
        ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
        (
            "mt5",
            (
                "MT5Tokenizer" if is_sentencepiece_available() else None,
                "MT5TokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        (
            "nystromformer",
            (
                "AlbertTokenizer" if is_sentencepiece_available() else None,
                "AlbertTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
        ("opt", ("GPT2Tokenizer", None)),
        (
            "pegasus",
            (
                "PegasusTokenizer" if is_sentencepiece_available() else None,
                "PegasusTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        (
            "perceiver",
            (
                "PerceiverTokenizer",
                None,
            ),
        ),
        ("phobert", ("PhobertTokenizer", None)),
        ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
        ("prophetnet", ("ProphetNetTokenizer", None)),
        ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("rag", ("RagTokenizer", None)),
        ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
        (
            "reformer",
            (
                "ReformerTokenizer" if is_sentencepiece_available() else None,
                "ReformerTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        (
            "rembert",
            (
                "RemBertTokenizer" if is_sentencepiece_available() else None,
                "RemBertTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
        ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
        ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
        ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
        ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
        ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
        (
            "squeezebert",
            ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
        ),
        (
            "t5",
            (
                "T5Tokenizer" if is_sentencepiece_available() else None,
                "T5TokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("tapas", ("TapasTokenizer", None)),
        ("tapex", ("TapexTokenizer", None)),
        ("transfo-xl", ("TransfoXLTokenizer", None)),
        ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
        ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
        ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
        (
            "xglm",
            (
                "XGLMTokenizer" if is_sentencepiece_available() else None,
                "XGLMTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("xlm", ("XLMTokenizer", None)),
        ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
        (
            "xlm-roberta",
            (
                "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("xlm-roberta-xl", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "xlnet",
            (
                "XLNetTokenizer" if is_sentencepiece_available() else None,
                "XLNetTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        (
            "yoso",
            (
                "AlbertTokenizer" if is_sentencepiece_available() else None,
                "AlbertTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
    ]
)
SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
    [
        ("openai-gpt", "openai"),
        ("data2vec-audio", "data2vec"),
        ("data2vec-text", "data2vec"),
        ("data2vec-vision", "data2vec"),
    ]
)
def model_type_to_module_name(key):
    """Converts a config key to the corresponding module."""
    # Special treatment for types whose module name does not match the key.
    if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
        return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]

    return key.replace("-", "_")
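# Worked examples for the conversion above:
#
#     >>> model_type_to_module_name("data2vec-text")  # special-cased
#     'data2vec'
#     >>> model_type_to_module_name("xlm-roberta")  # default rule: dashes become underscores
#     'xlm_roberta'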
class _LazyConfigMapping(OrderedDict):
    """
    A dictionary that lazily loads its values when they are requested.
    """

    def __init__(self, mapping):
        self._mapping = mapping
        self._extra_content = {}
        self._modules = {}

    def __getitem__(self, key):
        if key in self._extra_content:
            return self._extra_content[key]
        if key not in self._mapping:
            raise KeyError(key)
        value = self._mapping[key]
        module_name = model_type_to_module_name(key)
        if module_name not in self._modules:
            self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models")
        if hasattr(self._modules[module_name], value):
            return getattr(self._modules[module_name], value)

        # Some mappings point to objects that only exist at the top level of transformers.
        transformers_module = importlib.import_module("transformers")
        return getattr(transformers_module, value)

    def keys(self):
        return list(self._mapping.keys()) + list(self._extra_content.keys())

    def values(self):
        return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values())

    def items(self):
        return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items())

    def __iter__(self):
        return iter(list(self._mapping.keys()) + list(self._extra_content.keys()))

    def __contains__(self, item):
        return item in self._mapping or item in self._extra_content

    def register(self, key, value):
        """
        Register a new configuration in this mapping.
        """
        if key in self._mapping.keys():
            raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.")
        self._extra_content[key] = value
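# A hedged sketch of the intended usage (the mapping content below is illustrative):
#
#     >>> CONFIG_MAPPING = _LazyConfigMapping(OrderedDict([("bert", "BertConfig")]))
#     >>> config_cls = CONFIG_MAPPING["bert"]  # imports transformers.models.bert only on first access
#     >>> CONFIG_MAPPING.register("my-model", MyConfig)  # "my-model" must not collide with a built-in key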
class Trie:
    """
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass.
    Loose reference https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self):
        self.data = {}

    def add(self, word: str):
        """
        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.
        The special key `""` is used to represent termination.

        This function is idempotent, adding twice the same word will leave the trie unchanged.

        Example:

        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        """
        if not word:
            # Prevent empty string
            return

        ref = self.data
        for char in word:
            # Equivalent to ref.setdefault(char, {}): reuse the child dict when present, create it otherwise.
            ref[char] = char in ref and ref[char] or {}
            ref = ref[char]
        ref[""] = 1
    def split(self, text: str) -> List[str]:
        """
        Will look for the words added to the trie within `text`. Output is the original string split along the
        boundaries of the words found.

        This trie will match the longest possible word first!

        Example:

        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]
        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        """
        # States keep track of every possible match start (key) with a pointer into
        # the trie (value) marking how far that partial match has progressed.
        states = OrderedDict()
        # Offsets where the text needs to be cut; a cut at 0 is forced, the final
        # cut at len(text) is added in `cut_text`.
        offsets = [0]
        # Used by the lookahead to skip text already consumed by a longer match.
        skip = 0

        for current, current_char in enumerate(text):
            if skip and current < skip:
                # This character was consumed by a previous longest match.
                continue

            to_remove = set()
            reset = False

            for start, trie_pointer in states.items():
                if "" in trie_pointer:
                    # A complete match; look ahead so the longest match wins
                    # (important for extra_id_1 vs extra_id_100).
                    for lookstart, looktrie_pointer in states.items():
                        if lookstart > start:
                            # This partial match starts later, stop looking.
                            break
                        elif lookstart < start:
                            # Earlier partial match, its pointer is one char ahead.
                            lookahead_index = current + 1
                            end = current + 1
                        else:
                            # lookstart == start: indices are the current ones.
                            lookahead_index = current
                            end = current
                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
                        if "" in looktrie_pointer:
                            start = lookstart
                            end = lookahead_index
                            skip = lookahead_index

                        while next_char in looktrie_pointer:
                            looktrie_pointer = looktrie_pointer[next_char]
                            lookahead_index += 1
                            if "" in looktrie_pointer:
                                start = lookstart
                                end = lookahead_index
                                skip = lookahead_index

                            if lookahead_index == len(text):
                                break
                            next_char = text[lookahead_index]

                    # Store the match and reset all partial matches.
                    offsets.append(start)
                    offsets.append(end)
                    reset = True
                    break
                elif current_char in trie_pointer:
                    # The partial match got one character longer.
                    trie_pointer = trie_pointer[current_char]
                    states[start] = trie_pointer
                else:
                    # This partial match stopped matching.
                    to_remove.add(start)

            if reset:
                states = {}
            else:
                for start in to_remove:
                    del states[start]

            # This character may start a new partial match.
            if current >= skip and current_char in self.data:
                states[current] = self.data[current_char]

        # Handle matches that run to the end of the text.
        for start, trie_pointer in states.items():
            if "" in trie_pointer:
                end = len(text)
                offsets.append(start)
                offsets.append(end)
                # The longest cut has the lowest start, so stop at the first one.
                break

        return self.cut_text(text, offsets)
    def cut_text(self, text, offsets):
        # We have all offsets now; add the final cut and do the actual splitting.
        offsets.append(len(text))
        tokens = []
        start = 0
        for end in offsets:
            if start > end:
                logger.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
                )
                continue
            elif start == end:
                # Happens on a match at index 0 or on two consecutive matches.
                continue
            tokens.append(text[start:end])
            start = end

        return tokens
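# End-to-end behavior of the Trie, mirroring the doctests above:
#
#     >>> trie = Trie()
#     >>> trie.add("[CLS]")
#     >>> trie.add("extra_id_100")
#     >>> trie.split("[CLS] This is a extra_id_100")
#     ['[CLS]', ' This is a ', 'extra_id_100']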
class ExplicitEnum(Enum):
    """
    Enum with more explicit error message for missing values.
    """

    @classmethod
    def _missing_(cls, value):
        raise ValueError(
            f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
        )
class TensorType(ExplicitEnum):
    """
    Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for
    tab-completion in an IDE.
    """

    PYTORCH = "pt"
    TENSORFLOW = "tf"
    NUMPY = "np"
    JAX = "jax"


class CharSpan(NamedTuple):
    """
    Character span in the original string.

    Args:
        start (`int`): Index of the first character in the original string.
        end (`int`): Index of the character following the last character in the original string.
    """

    start: int
    end: int


class TokenSpan(NamedTuple):
    """
    Token span in an encoded string (list of tokens).

    Args:
        start (`int`): Index of the first token in the span.
        end (`int`): Index of the token following the last token in the span.
    """

    start: int
    end: int
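# The ExplicitEnum error message in practice (illustrative):
#
#     >>> TensorType("pt") is TensorType.PYTORCH
#     True
#     >>> TensorType("torch")
#     ValueError: torch is not a valid TensorType, please select one of ['pt', 'tf', 'np', 'jax']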
class BatchEncoding(UserDict):
    """
    Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`],
    [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and
    [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc).

    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
    utility methods to map from word/character space to token space.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods
            ('input_ids', 'attention_mask', etc.).
        encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):
            If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
            space to token space, the `tokenizers.Encoding` instance or list of instances (for batches) holds this
            information.
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
        prepend_batch_axis (`bool`, *optional*, defaults to `False`):
            Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
        n_sequences (`Optional[int]`, *optional*):
            The number of sequences used to generate each sample from the batch encoded in this [`BatchEncoding`].
    """

    def __init__(
        self,
        data: Optional[Dict[str, List]] = None,
        encoding=None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
        n_sequences: Optional[int] = None,
    ):
        super().__init__(data)

        # Normalize a single `tokenizers.Encoding` to a list of encodings.
        if encoding is not None and not isinstance(encoding, (list, tuple)):
            encoding = [encoding]

        self._encodings = encoding

        if n_sequences is None and encoding is not None and len(encoding):
            n_sequences = encoding[0].n_sequences

        self._n_sequences = n_sequences

        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
    @property
    def n_sequences(self) -> Optional[int]:
        """
        `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
        [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of
        sentences).
        """
        return self._n_sequences

    @property
    def is_fast(self) -> bool:
        """
        `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`]
        or not.
        """
        return self._encodings is not None
    def __getitem__(self, item):
        """
        If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
        etc.).

        If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.
        """
        if isinstance(item, str):
            return self.data[item]
        elif self._encodings is not None:
            return self._encodings[item]
        else:
            raise KeyError(
                "Indexing with integers (to access backend Encoding for a given batch index) "
                "is not available when using Python based tokenizers"
            )

    def __getattr__(self, item: str):
        try:
            return self.data[item]
        except KeyError:
            raise AttributeError(item)
    def __getstate__(self):
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        if "data" in state:
            self.data = state["data"]

        if "encodings" in state:
            self._encodings = state["encodings"]

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()
    @property
    def encodings(self):
        """
        `Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process. Returns `None`
        if the input was tokenized through a Python (i.e., not a fast) tokenizer.
        """
        return self._encodings
    def tokens(self, batch_index: int = 0) -> List[str]:
        """
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
        to integer indices) at a given batch index (only works for the output of a fast tokenizer).

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[str]`: The list of tokens at that index.
        """
        if not self._encodings:
            raise ValueError("tokens() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].tokens
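    # Illustrative only; actual tokens depend on the tokenizer's vocabulary:
    #
    #     >>> encoded = fast_tokenizer("Hello world")
    #     >>> encoded.tokens(0)
    #     ['[CLS]', 'hello', 'world', '[SEP]']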
    def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to the id of their original sentences:

            - `None` for special tokens added around or between sequences,
            - `0` for tokens corresponding to words in the first sequence,
            - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
              encoded.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added
            by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
            sequence.
        """
        if not self._encodings:
            raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].sequence_ids
    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
        """
        if not self._encodings:
            raise ValueError("words() is not available when using Python-based tokenizers")
        warnings.warn(
            "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
            "but more self-explanatory `BatchEncoding.word_ids()` property.",
            FutureWarning,
        )
        return self.word_ids(batch_index)
    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
        """
        if not self._encodings:
            raise ValueError("word_ids() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].word_ids
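    # Illustrative mapping for a WordPiece-style fast tokenizer (assumed vocabulary):
    #
    #     >>> encoded = fast_tokenizer("playing outside")
    #     >>> encoded.tokens(0)
    #     ['[CLS]', 'play', '##ing', 'outside', '[SEP]']
    #     >>> encoded.word_ids(0)
    #     [None, 0, 0, 1, None]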
    def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the sequence represented by the given token. In the general use case, this method returns `0`
        for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair.

        Can be called as:

        - `self.token_to_sequence(token_index)` if batch size is 1
        - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows you to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                sequence.

        Returns:
            `int`: Index of the sequence to which the token belongs.
        """
        if not self._encodings:
            raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_sequence(token_index)
    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.

        Can be called as:

        - `self.token_to_word(token_index)` if batch size is 1
        - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows you to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                sequence.

        Returns:
            `int`: Index of the word in the input sequence.
        """
        if not self._encodings:
            raise ValueError("token_to_word() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_word(token_index)
    def word_to_tokens(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> Optional[TokenSpan]:
        """
        Get the encoded token span corresponding to a word in a sequence of the batch.

        Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:

        - **start** -- Index of the first token.
        - **end** -- Index of the token following the last token.

        Can be called as:

        - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
        - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to
          1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.
        words are defined by the user). In this case it allows you to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_word_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (`int`, *optional*):
                If a batch index is provided in *batch_or_word_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair
                (0 or 1) the provided word index belongs to.

        Returns:
            Optional [`~tokenization_utils_base.TokenSpan`]: Span of tokens in the encoded sequence. Returns `None` if
            no tokens correspond to the word.
        """
        if not self._encodings:
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if word_index < 0:
            word_index = self._seq_len + word_index
        span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
        return TokenSpan(*span) if span is not None else None
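    # Continuing the illustration above: word 0 ("playing") is covered by tokens 1-2,
    # returned with an exclusive end index:
    #
    #     >>> encoded.word_to_tokens(0)
    #     TokenSpan(start=1, end=3)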
    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> Optional[CharSpan]:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:

        - **start** -- Index of the first character in the original string associated to the token.
        - **end** -- Index of the character following the last character in the original string associated to the
          token.

        Can be called as:

        - `self.token_to_chars(token_index)` if batch size is 1
        - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens
                in the sequence.

        Returns:
            [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or `None` if the token
            (e.g. `<s>`, `</s>`) doesn't correspond to any chars in the origin string.
        """
        if not self._encodings:
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        span_indices = self._encodings[batch_index].token_to_chars(token_index)

        return CharSpan(*span_indices) if span_indices is not None else None
    def char_to_token(
        self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
    ) -> int:
        """
        Get the index of the token in the encoded output comprising a character in the original string for a sequence
        of the batch.

        Can be called as:

        - `self.char_to_token(char_index)` if batch size is 1
        - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.
        words are defined by the user). In this case it allows you to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_char_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the character in the sequence.
            char_index (`int`, *optional*):
                If a batch index is provided in *batch_or_char_index*, this can be the index of the character in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair
                (0 or 1) the provided character index belongs to.

        Returns:
            `int`: Index of the token.
        """
        if not self._encodings:
            raise ValueError("char_to_token() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_token(char_index, sequence_index)
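    # Illustrative: in "playing outside", character 0 ("p") falls inside the token at
    # position 1 of the encoding sketched earlier:
    #
    #     >>> encoded.char_to_token(0)
    #     1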
    def word_to_chars(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> CharSpan:
        """
        Get the character span in the original string corresponding to given word in a sequence of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string
        - end: index of the character following the last character in the original string

        Can be called as:

        - `self.word_to_chars(word_index)` if batch size is 1
        - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1

        Args:
            batch_or_word_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (`int`, *optional*):
                If a batch index is provided in *batch_or_word_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair
                (0 or 1) the provided word index belongs to.

        Returns:
            `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan
            are NamedTuple with:

                - start: index of the first character associated to the token in the original string
                - end: index of the character following the last character associated to the token in the original
                  string
        """
        if not self._encodings:
            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))
    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of a sequence of the
        batch.

        Can be called as:

        - `self.char_to_word(char_index)` if batch size is 1
        - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.
        words are defined by the user). In this case it allows you to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_char_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the character in the original string.
            char_index (`int`, *optional*):
                If a batch index is provided in *batch_or_char_index*, this can be the index of the character in the
                original string.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair
                (0 or 1) the provided character index belongs to.

        Returns:
            `int` or `List[int]`: Index or indices of the associated word(s) in the input sequence.
        """
        if not self._encodings:
            raise ValueError("char_to_word() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_word(char_index, sequence_index)
    def convert_to_tensors(self, tensor_type=None, prepend_batch_axis: bool = False):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`].
                If `None`, no modification is done.
            prepend_batch_axis (`bool`, *optional*, defaults to `False`):
                Whether or not to add the batch dimension during the conversion.
        """
        if tensor_type is None:
            return self

        # Get a conversion function and a type-check predicate for the backend.
        # NOTE: the "paddle" branch assumes PaddlePaddle is imported as `paddle` elsewhere in this module.
        if tensor_type == "paddle":
            as_tensor = paddle.to_tensor
            is_tensor = paddle.is_tensor
        else:
            as_tensor = np.asarray
            is_tensor = _is_numpy

        # Do the tensor conversion in batch.
        for key, value in self.items():
            try:
                if prepend_batch_axis:
                    value = [value]

                if not is_tensor(value):
                    tensor = as_tensor(value)
                    self[key] = tensor
            except Exception:
                if key == "overflowing_tokens":
                    raise ValueError(
                        "Unable to create tensor returning overflowing tokens of different lengths. "
                        "Please see if a fast version of this tokenizer is available to have this feature available."
                    )
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding "
                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
                )

        return self
class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion
    in an IDE.
    """

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"


class PaddingStrategy(ExplicitEnum):
    """
    Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in
    an IDE.
    """

    LONGEST = "longest"
    MAX_LENGTH = "max_length"
    DO_NOT_PAD = "do_not_pad"
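# The enum values are the same strings `__call__` accepts, so the two calls below are
# equivalent (hedged sketch; assumes `tokenizer` is any tokenizer instance):
#
#     >>> tokenizer(sentences, padding="max_length", truncation="longest_first", max_length=128)
#     >>> tokenizer(sentences, padding=PaddingStrategy.MAX_LENGTH, truncation=TruncationStrategy.LONGEST_FIRST, max_length=128)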
class SpecialTokensMixin:
    """
    A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to
    special tokens. In particular, this class holds the attributes which can be used to directly access these special
    tokens in a model-independent manner and allows setting and updating the special tokens.

    Args:
        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the beginning of a sentence.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the end of a sentence.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing an out-of-vocabulary token.
        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token separating two different sentences in the same input (used by BERT for instance).
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
            attention mechanisms or loss computation.
        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the class of the input (used by BERT for instance).
        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT).
        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
            A tuple or a list of additional special tokens.
    """
    SPECIAL_TOKENS_ATTRIBUTES = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]
    def __init__(self, verbose=True, **kwargs):
        self._bos_token = None
        self._eos_token = None
        self._unk_token = None
        self._sep_token = None
        self._pad_token = None
        self._cls_token = None
        self._mask_token = None
        self._pad_token_type_id = 0
        self._additional_special_tokens = []
        self.verbose = verbose
        self.added_tokens_encoder: Dict[str, int] = {}
        self.added_tokens_decoder: Dict[int, str] = {}
        self.unique_no_split_tokens: List[str] = []
        self.tokens_trie = Trie()

        self._decode_use_source_tokenizer = False

        # Directly set the hidden values to allow initialization with special tokens
        # which are not yet in the vocabulary.
        for key, value in kwargs.items():
            if value is None:
                continue
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                if key == "additional_special_tokens":
                    assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
                    assert all(
                        isinstance(t, (str, AddedToken)) for t in value
                    ), "One of the tokens is not a string or an AddedToken"
                    setattr(self, key, value)
                elif isinstance(value, (str, AddedToken)):
                    setattr(self, key, value)
                else:
                    raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}")
    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]
        return self._convert_token_to_id(token)
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token.startswith("<extra_id_"):
            match = re.match(r"<extra_id_(\d+)>", token)
            num = int(match.group(1))
            return self.vocab_size - num - 1
        return self.sp_model.piece_to_id(token)
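    # Sentinel tokens map to the top of the vocabulary in reverse order; e.g. with a
    # vocab_size of 32100, "<extra_id_0>" -> 32099 and "<extra_id_99>" -> 32000.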
    def sanitize_special_tokens(self) -> int:
        """
        Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`,
        `tokenizer.cls_token`, etc.) are in the vocabulary.

        Add the missing ones to the vocabulary if needed.

        Returns:
            `int`: The number of tokens added in the vocabulary during the operation.
        """
        return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
        """
        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
        current vocabulary).

        Note: when adding new tokens to the vocabulary, you should make sure to also resize the token embedding
        matrix of the model so that its embedding matrix matches the tokenizer.

        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

        Using `add_special_tokens` will ensure your special tokens can be used in several ways:

        - Special tokens are carefully handled by the tokenizer (they are never split).
        - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
          makes it easy to develop model-agnostic training and fine-tuning scripts.

        When possible, special tokens are already registered for provided pretrained models (for instance
        [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
        `'</s>'`).

        Args:
            special_tokens_dict (dictionary *str* to *str* or `tokenizers.AddedToken`):
                Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,
                `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].

                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                assigns the index of the `unk_token` to them).

        Returns:
            `int`: Number of tokens added to the vocabulary.

        Examples:

        # Let's see how to add a new classification token to GPT-2
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2Model.from_pretrained("gpt2")

        special_tokens_dict = {"cls_token": "<CLS>"}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expects the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))

        assert tokenizer.cls_token == "<CLS>"
        """
        if not special_tokens_dict:
            return 0

        added_tokens = 0
        for key, value in special_tokens_dict.items():
            assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"

            if self.verbose:
                logger.info(f"Assigning {value} to the {key} key of the tokenizer")
            setattr(self, key, value)

            if key == "additional_special_tokens":
                assert isinstance(value, (list, tuple)) and all(
                    isinstance(t, (str, AddedToken)) for t in value
                ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
                added_tokens += self.add_tokens(value, special_tokens=True)
            else:
                assert isinstance(
                    value, (str, AddedToken)
                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
                added_tokens += self.add_tokens([value], special_tokens=True)

        return added_tokens
    def add_tokens(
        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
    ) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added
        to it with indices starting from the length of the current vocabulary.

        Note: when adding new tokens to the vocabulary, you should make sure to also resize the token embedding
        matrix of the model so that its embedding matrix matches the tokenizer.

        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

        Args:
            new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`):
                Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a
                string token to let you personalize its behavior: whether this token should only match against a
                single word, whether this token should strip all potential whitespaces on the left side, whether this
                token should strip all potential whitespaces on the right side, etc.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Can be used to specify if the token is a special token. This mostly changes the normalization behavior
                (special tokens like CLS or [MASK] are usually not lower-cased for instance).

                See details for `tokenizers.AddedToken` in the HuggingFace tokenizers library.

        Returns:
            `int`: Number of tokens added to the vocabulary.

        Examples:

        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        model = BertModel.from_pretrained("bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expects the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        """
        if not isinstance(new_tokens, (list, tuple)):
            new_tokens = [new_tokens]

        return self._add_tokens(new_tokens, special_tokens=special_tokens)
    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        new_tokens = [str(tok) for tok in new_tokens]

        tokens_to_add = []
        for token in new_tokens:
            if not isinstance(token, str):
                raise TypeError(f"Token {token} is not a string but a {type(token)}.")
            if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
                token = token.lower()
            if (
                token != self.unk_token
                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
                and token not in tokens_to_add
            ):
                tokens_to_add.append(token)
                if self.verbose:
                    logger.info(f"Adding {token} to the vocabulary")

        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        # Make sure we don't split on any special tokens (even if they were already in the vocab before).
        if special_tokens:
            if len(new_tokens) == 1:
                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0])
            else:
                self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
        else:
            # Or on the newly added tokens.
            if len(tokens_to_add) == 1:
                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0])
            else:
                self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
        self._create_trie(self.unique_no_split_tokens)

        return len(tokens_to_add)
    def _create_trie(self, unique_no_split_tokens):
        trie = Trie()
        for token in unique_no_split_tokens:
            if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens:
                trie.add(token.lower())
            else:
                trie.add(token)
        self.tokens_trie = trie
    @property
    def bos_token(self) -> str:
        """
        `str`: Beginning of sentence token. Log an error if used while not having been set.
        """
        if self._bos_token is None and self.verbose:
            logger.error("Using bos_token, but it is not set yet.")
            return None
        return str(self._bos_token)

    @property
    def eos_token(self) -> str:
        """
        `str`: End of sentence token. Log an error if used while not having been set.
        """
        if self._eos_token is None and self.verbose:
            logger.error("Using eos_token, but it is not set yet.")
            return None
        return str(self._eos_token)

    @property
    def unk_token(self) -> str:
        """
        `str`: Unknown token. Log an error if used while not having been set.
        """
        if self._unk_token is None and self.verbose:
            logger.error("Using unk_token, but it is not set yet.")
            return None
        return str(self._unk_token)

    @property
    def sep_token(self) -> str:
        """
        `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
        having been set.
        """
        if self._sep_token is None and self.verbose:
            logger.error("Using sep_token, but it is not set yet.")
            return None
        return str(self._sep_token)

    @property
    def pad_token(self) -> str:
        """
        `str`: Padding token. Log an error if used while not having been set.
        """
        if self._pad_token is None and self.verbose:
            logger.error("Using pad_token, but it is not set yet.")
            return None
        return str(self._pad_token)

    @property
    def cls_token(self) -> str:
        """
        `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
        full depth of the model. Log an error if used while not having been set.
        """
        if self._cls_token is None and self.verbose:
            logger.error("Using cls_token, but it is not set yet.")
            return None
        return str(self._cls_token)

    @property
    def mask_token(self) -> str:
        """
        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
        having been set.
        """
        if self._mask_token is None and self.verbose:
            logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @property
    def additional_special_tokens(self) -> List[str]:
        """
        `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
        set.
        """
        if self._additional_special_tokens is None and self.verbose:
            logger.error("Using additional_special_tokens, but it is not set yet.")
            return None
        return [str(tok) for tok in self._additional_special_tokens]
    @bos_token.setter
    def bos_token(self, value):
        self._bos_token = value

    @eos_token.setter
    def eos_token(self, value):
        self._eos_token = value

    @unk_token.setter
    def unk_token(self, value):
        self._unk_token = value

    @sep_token.setter
    def sep_token(self, value):
        self._sep_token = value

    @pad_token.setter
    def pad_token(self, value):
        self._pad_token = value

    @cls_token.setter
    def cls_token(self, value):
        self._cls_token = value

    @mask_token.setter
    def mask_token(self, value):
        self._mask_token = value

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        self._additional_special_tokens = value
    @property
    def bos_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not
        been set.
        """
        if self._bos_token is None:
            return None
        return self.convert_tokens_to_ids(self.bos_token)

    @property
    def eos_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
        set.
        """
        if self._eos_token is None:
            return None
        return self.convert_tokens_to_ids(self.eos_token)

    @property
    def unk_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set.
        """
        if self._unk_token is None:
            return None
        return self.convert_tokens_to_ids(self.unk_token)

    @property
    def sep_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
        sequence. Returns `None` if the token has not been set.
        """
        if self._sep_token is None:
            return None
        return self.convert_tokens_to_ids(self.sep_token)

    @property
    def pad_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
        """
        if self._pad_token is None:
            return None
        return self.convert_tokens_to_ids(self.pad_token)

    @property
    def pad_token_type_id(self) -> int:
        """
        `int`: Id of the padding token type in the vocabulary.
        """
        return self._pad_token_type_id

    @property
    def cls_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence
        leveraging self-attention along the full depth of the model.

        Returns `None` if the token has not been set.
        """
        if self._cls_token is None:
            return None
        return self.convert_tokens_to_ids(self.cls_token)

    @property
    def mask_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
        modeling. Returns `None` if the token has not been set.
        """
        if self._mask_token is None:
            return None
        return self.convert_tokens_to_ids(self.mask_token)

    @property
    def additional_special_tokens_ids(self) -> List[int]:
        """
        `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having
        been set.
        """
        return self.convert_tokens_to_ids(self.additional_special_tokens)
    @bos_token_id.setter
    def bos_token_id(self, value):
        self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None

    @eos_token_id.setter
    def eos_token_id(self, value):
        self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None

    @unk_token_id.setter
    def unk_token_id(self, value):
        self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None

    @sep_token_id.setter
    def sep_token_id(self, value):
        self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None

    @pad_token_id.setter
    def pad_token_id(self, value):
        self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None

    @cls_token_id.setter
    def cls_token_id(self, value):
        self._cls_token = self.convert_ids_to_tokens(value) if value is not None else None

    @mask_token_id.setter
    def mask_token_id(self, value):
        self._mask_token = self.convert_ids_to_tokens(value) if value is not None else None

    @additional_special_tokens_ids.setter
    def additional_special_tokens_ids(self, values):
        self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values]
    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
        `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`,
        `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).

        Convert potential tokens of `tokenizers.AddedToken` type to string.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = (
                    type(attr_value)(str(attr_value_sub) for attr_value_sub in attr_value)
                    if isinstance(attr_value, (list, tuple))
                    else str(attr_value)
                )
        return set_attr
    @property
    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
        """
        `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary mapping
        special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).

        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
        special tokens are tokenized.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr
    @property
    def all_special_tokens(self) -> List[str]:
        """
        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.

        Convert tokens of `tokenizers.AddedToken` type to string.
        """
        all_toks = [str(s) for s in self.all_special_tokens_extended]
        return all_toks
    @property
    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
        """
        `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class
        attributes.

        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
        special tokens are tokenized.
        """
        all_toks = []
        set_attr = self.special_tokens_map_extended
        for attr_value in set_attr.values():
            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
        all_toks = list(OrderedDict.fromkeys(all_toks))
        return all_toks
    @property
    def all_special_ids(self) -> List[int]:
        """
        `List[int]`: List the ids of the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
        """
        all_toks = self.all_special_tokens
        all_ids = self.convert_tokens_to_ids(all_toks)
        return all_ids
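# A hedged sketch of the accessors above for a BERT-style tokenizer (values are
# illustrative, not guaranteed for any particular checkpoint):
#
#     >>> tokenizer.special_tokens_map
#     {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
#     >>> tokenizer.all_special_tokens
#     ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
#     >>> tokenizer.all_special_ids  # ids of the tokens above, in the same order
#     [100, 102, 0, 101, 103]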