# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for python tokenizers.
    For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py
"""

import itertools
import logging
import re
import unicodedata
from typing import Any, Dict, List, Optional, Tuple, Union

from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PaddingStrategy,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TensorType,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)


logger = logging.getLogger(__name__)


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _is_end_of_word(text):
    """Checks whether the last character in text is a punctuation, control or whitespace character."""
    last_char = text[-1]
    return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))


def _is_start_of_word(text):
    """Checks whether the first character in text is a punctuation, control or whitespace character."""
    first_char = text[0]
    return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING, """ .. automethod:: __call__""")
class PreTrainedTokenizer(PreTrainedTokenizerBase):
    """
    Base class for all slow tokenizers.

    Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers, so we don't
    have to handle the specific vocabulary augmentation methods of the various underlying
    dictionary structures (BPE, sentencepiece...).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Added tokens - We store this for both slow and fast tokenizers
        # until the serialization of Fast tokenizers is updated
        self.added_tokens_encoder: Dict[str, int] = {}
        self.added_tokens_decoder: Dict[int, str] = {}
        self.unique_no_split_tokens: List[str] = []

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def vocab_size(self) -> int:
        """
        :obj:`int`: Size of the base vocabulary (without the added tokens).
        """
        raise NotImplementedError

    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
        :obj:`token` is in the vocab.

        Returns:
            :obj:`Dict[str, int]`: The vocabulary.
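
        A minimal sketch of this equivalence, assuming a concrete subclass such as
        :class:`~transformers.BertTokenizer` and a token that is in its vocabulary::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            vocab = tokenizer.get_vocab()
            assert vocab["hello"] == tokenizer.convert_tokens_to_ids("hello")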
        """
        raise NotImplementedError()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            :obj:`Dict[str, int]`: The added tokens.
        """
        return self.added_tokens_encoder

    def __len__(self):
        """
        Size of the full vocabulary with the added tokens.
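
        The full size grows as tokens are added. A minimal sketch, assuming a concrete subclass such as
        :class:`~transformers.BertTokenizer`::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            old_size = len(tokenizer)
            tokenizer.add_tokens(['new_tok1'])  # a token that is not already in the vocabulary
            assert len(tokenizer) == old_size + 1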
        """
        return self.vocab_size + len(self.added_tokens_encoder)

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
        vocabulary, they are added to it with indices starting from the length of the current vocabulary.

        Args:
            new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
                Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
                checking if the tokenizer assigns the index of the ``unk_token`` to them).
            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            :obj:`int`: The number of tokens actually added to the vocabulary.

        Examples::

            # Let's see how to increase the vocabulary of Bert model and tokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')

            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))
        """
        new_tokens = [str(tok) for tok in new_tokens]

        tokens_to_add = []
        for token in new_tokens:
            assert isinstance(token, str)
            if not special_tokens and self.init_kwargs.get("do_lower_case", False):
                token = token.lower()
            if (
                token != self.unk_token
                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
                and token not in tokens_to_add
            ):
                tokens_to_add.append(token)
                if self.verbose:
                    logger.info("Adding %s to the vocabulary", token)

        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
        if special_tokens:
            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
        else:
            # Or on the newly added tokens
            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))

        return len(tokens_to_add)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        .. note::
            This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
            put this inside your training loop.

        Args:
            pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            :obj:`int`: Number of special tokens added to sequences.
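
        For example, a sketch assuming a concrete subclass such as :class:`~transformers.BertTokenizer`, which wraps
        sequences in ``[CLS]``/``[SEP]``::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.num_special_tokens_to_add(pair=False)  # 2 -> [CLS] ... [SEP]
            tokenizer.num_special_tokens_to_add(pair=True)   # 3 -> [CLS] ... [SEP] ... [SEP]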
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Splits into words for word-based vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
        Takes care of added tokens.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method.

        Returns:
            :obj:`List[str]`: The list of tokens.
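
        A minimal usage sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer` (the exact
        sub-tokens depend on that model's vocabulary)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.tokenize("Hello, tokenization!")
            # e.g. ['hello', ',', 'token', '##ization', '!'] for an uncased WordPiece vocabulary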
        """
        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
        all_special_tokens_extended = dict(
            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
        )

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if kwargs:
            logger.warning(f"Keyword arguments {kwargs} not recognized.")

        # TODO: should this be in the base class?
        if self.init_kwargs.get("do_lower_case", False):
            # convert non-special tokens to lowercase
            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        def split_on_token(tok, text):
            result = []
            tok_extended = all_special_tokens_extended.get(tok, None)
            split_text = text.split(tok)
            full_word = ""
            for i, sub_text in enumerate(split_text):
                # AddedToken can control whitespace stripping around them.
                # We use them for GPT2 and Roberta to have different behavior depending on the special token
                # Cf. https://github.com/huggingface/transformers/pull/2778
                # and https://github.com/huggingface/transformers/issues/3788
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.single_word:
                        # Try to avoid splitting on token
                        if (
                            i < len(split_text) - 1
                            and not _is_end_of_word(sub_text)
                            and not _is_start_of_word(split_text[i + 1])
                        ):
                            # Don't extract the special token
                            full_word += sub_text + tok
                        elif full_word:
                            full_word += sub_text
                            result += [full_word]
                            full_word = ""
                            continue
                    # Strip white spaces on the right
                    if tok_extended.rstrip and i > 0:
                        # A bit counter-intuitive but we strip the left of the string
                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
                        sub_text = sub_text.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and i < len(split_text) - 1:
                        sub_text = sub_text.rstrip()  # Opposite here
                else:
                    # We strip left and right by default
                    if i < len(split_text) - 1:
                        sub_text = sub_text.rstrip()
                    if i > 0:
                        sub_text = sub_text.lstrip()

                if i == 0 and not sub_text:
                    result += [tok]
                elif i == len(split_text) - 1:
                    if sub_text:
                        result += [sub_text]
                    else:
                        pass
                else:
                    if sub_text:
                        result += [sub_text]
                    result += [tok]
            return result

        def split_on_tokens(tok_list, text):
            if not text.strip():
                return []
            if not tok_list:
                return self._tokenize(text)

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.unique_no_split_tokens:
                        tokenized_text += split_on_token(tok, sub_text)
                    else:
                        tokenized_text += [sub_text]
                text_list = tokenized_text

            return list(
                itertools.chain.from_iterable(
                    (
                        self._tokenize(token) if token not in self.unique_no_split_tokens else [token]
                        for token in tokenized_text
                    )
                )
            )

        no_split_token = self.unique_no_split_tokens
        tokenized_text = split_on_tokens(no_split_token, text)
        return tokenized_text

    def _tokenize(self, text, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer.
        Splits into words for word-based vocabularies or sub-words for sub-word-based vocabularies
        (BPE/SentencePiece/WordPiece).

        Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
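
        A minimal round-trip sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer`::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokens = tokenizer.tokenize("Hello world")
            ids = tokenizer.convert_tokens_to_ids(tokens)
            assert tokenizer.convert_ids_to_tokens(ids) == tokens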
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        raise NotImplementedError

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_pretokenized:
                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                if is_pretokenized:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
                    )
                else:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                    )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        first_ids = get_input_ids(text)
        second_ids = get_input_ids(text_pair) if text_pair is not None else None

        return self.prepare_for_model(
            first_ids,
            pair_ids=second_ids,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_pretokenized:
                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            else:
                ids, pair_ids = ids_or_pair_ids

            first_ids = get_input_ids(ids)
            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
            input_ids.append((first_ids, second_ids))

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return BatchEncoding(batch_outputs)

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
        It adds special tokens, truncates sequences if overflowing while taking into account the special tokens, and
        manages a moving window (with a user-defined stride) for overflowing tokens.

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        batch_outputs = {}
        for first_ids, second_ids in batch_ids_pairs:
            outputs = self.prepare_for_model(
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs

    def prepare_for_tokenization(
        self, text: str, is_pretokenized: bool = False, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well.
        We test the :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (:obj:`str`):
                The text to prepare.
            is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the text has been pretokenized.
            kwargs:
                Keyword arguments to use for the tokenization.

        Returns:
            :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
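
        A sketch of a subclass override that consumes a keyword argument before tokenization (the
        ``add_prefix_space`` argument is only an illustration, in the spirit of the GPT-2 tokenizer's override)::

            def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
                add_prefix_space = kwargs.pop("add_prefix_space", False)
                if add_prefix_space:
                    text = " " + text
                return (text, kwargs)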
        """
        return (text, kwargs)

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (:obj:`List[int]`, `optional`):
                List of ids of the second sequence.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
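
        This base implementation returns a mask of zeros; subclasses that add special tokens override it. A sketch
        assuming :class:`~transformers.BertTokenizer`::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            ids = tokenizer.encode("Hello world", add_special_tokens=True)
            tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
            # e.g. [1, 0, 0, 1] for [CLS] hello world [SEP]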
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary
        and added tokens.

        Args:
            ids (:obj:`int` or :obj:`List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            :obj:`str` or :obj:`List[str]`: The decoded token(s).
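
        A minimal sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer`::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            ids = tokenizer.encode("Hello world", add_special_tokens=True)
            tokenizer.convert_ids_to_tokens(ids)
            # e.g. ['[CLS]', 'hello', 'world', '[SEP]']
            tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
            # e.g. ['hello', 'world']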
        """
        if isinstance(ids, int):
            if ids in self.added_tokens_decoder:
                return self.added_tokens_decoder[ids]
            else:
                return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[index])
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens (string) into a single string.

        The simplest way to do it is ``" ".join(tokens)``, but we often want to remove
        sub-word tokenization artifacts at the same time.

        Args:
            tokens (:obj:`List[str]`): The tokens to join into a string.

        Return: The joined tokens.
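
        The base implementation is a plain join; subclasses typically also undo their sub-word markers. A sketch
        assuming :class:`~transformers.BertTokenizer`, which strips the WordPiece ``##`` prefixes::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.convert_tokens_to_string(['token', '##ization'])
            # 'tokenization'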
        """
        return " ".join(tokens)

    def decode(
        self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
    ) -> str:
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary,
        with options to remove special tokens and clean up tokenization spaces.

        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
            token_ids (:obj:`List[int]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.

        Returns:
            :obj:`str`: The decoded sentence.
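
        A minimal round-trip sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer`
        (an uncased model lower-cases its input)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            ids = tokenizer.encode("Hello world!", add_special_tokens=True)
            tokenizer.decode(ids, skip_special_tokens=True)
            # 'hello world!'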
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # To avoid mixing byte-level and unicode for byte-level BPE
        # we need to build the string separately for added tokens and byte-level tokens
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_ids:
                continue
            if token in self.added_tokens_encoder:
                if current_sub_text:
                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
        text = " ".join(sub_texts)

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def save_vocabulary(self, save_directory) -> Tuple[str]:
        """
        Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
        and special token mappings.

        .. warning::
            Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
            you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.

        Args:
            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.

        Returns:
            A tuple of :obj:`str`: The files saved.
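
        A minimal sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer`, which writes a
        ``vocab.txt`` file; prefer :meth:`~transformers.PreTrainedTokenizer.save_pretrained` for the full state::

            import os

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            os.makedirs('./my_tokenizer', exist_ok=True)
            vocab_files = tokenizer.save_vocabulary('./my_tokenizer')
            tokenizer.save_pretrained('./my_tokenizer')  # also saves added tokens and special-token mappings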
        """
        raise NotImplementedError