# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library).
    For slow (python) tokenizers see tokenization_utils.py
"""

import logging
import os
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

from tokenizers import Encoding as EncodingFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast

from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PaddingStrategy,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)


logger = logging.getLogger(__name__)


@add_end_docstrings(
    INIT_TOKENIZER_DOCSTRING,
    """
    .. automethod:: __call__
    """,
)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't
    have to handle the specific vocabulary augmentation methods of the various underlying
    dictionary structures (BPE, sentencepiece...).
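
    Example (illustrative sketch; assumes the standalone ``tokenizers`` library is installed and that
    ``vocab.txt`` is a hypothetical WordPiece vocabulary file)::

        from tokenizers import BertWordPieceTokenizer

        # any tokenizers.implementations.BaseTokenizer subclass can serve as the backend
        backend = BertWordPieceTokenizer("vocab.txt")
        fast_tokenizer = PreTrainedTokenizerFast(backend, unk_token="[UNK]", pad_token="[PAD]")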
64"""
65
66def __init__(self, tokenizer: BaseTokenizerFast, **kwargs):
67if not isinstance(tokenizer, BaseTokenizerFast):
68raise ValueError(
69"Tokenizer should be an instance of a BaseTokenizer " "provided by HuggingFace tokenizers library."
70)
71self._tokenizer: BaseTokenizerFast = tokenizer
72
73# We call this after having initialized the backend tokenizer because we update it.
74super().__init__(**kwargs)
75
76@property
77def is_fast(self) -> bool:
78return True
79
80@property
81def vocab_size(self) -> int:
82"""
83:obj:`int`: Size of the base vocabulary (without the added tokens).
84"""
85return self._tokenizer.get_vocab_size(with_added_tokens=False)
86
87def get_vocab(self) -> Dict[str, int]:
88"""
89Returns the vocabulary as a dictionary of token to index.
90
91:obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
92:obj:`token` is in the vocab.
93
94Returns:
95:obj:`Dict[str, int]`: The vocabulary.
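
        Example (illustrative; assumes ``tokenizer`` is a loaded fast tokenizer whose base vocabulary
        contains ``"hello"``, so the exact id is model-dependent)::

            vocab = tokenizer.get_vocab()
            assert vocab["hello"] == tokenizer.convert_tokens_to_ids("hello")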
96"""
97return self._tokenizer.get_vocab(with_added_tokens=True)
98
99def get_added_vocab(self) -> Dict[str, int]:
100"""
101Returns the added tokens in the vocabulary as a dictionary of token to index.
102
103Returns:
104:obj:`Dict[str, int]`: The added tokens.
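
        Example (illustrative sketch; ``tokenizer`` is assumed to be a loaded fast tokenizer, and the
        assigned id depends on the model's base vocabulary size)::

            tokenizer.add_tokens(["<new_token>"])
            tokenizer.get_added_vocab()  # e.g. {"<new_token>": 30522}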
105"""
106base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
107full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
108added_vocab = dict((tok, index) for tok, index in full_vocab.items() if tok not in base_vocab)
109return added_vocab
110
111def __len__(self) -> int:
112"""
113Size of the full vocabulary with the added tokens.
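
        Example (illustrative; assumes ``tokenizer`` is a loaded fast tokenizer)::

            # len(tokenizer) counts added tokens; tokenizer.vocab_size does not.
            assert len(tokenizer) == tokenizer.vocab_size + len(tokenizer.get_added_vocab())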
114"""
115return self._tokenizer.get_vocab_size(with_added_tokens=True)
116
117@property
118def backend_tokenizer(self) -> BaseTokenizerFast:
119"""
120:obj:`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
121"""
122return self._tokenizer
123
124@property
125def decoder(self) -> DecoderFast:
126"""
127:obj:`tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
128"""
129return self._tokenizer._tokenizer.decoder
130
131def _convert_encoding(
132self,
133encoding: EncodingFast,
134return_token_type_ids: Optional[bool] = None,
135return_attention_mask: Optional[bool] = None,
136return_overflowing_tokens: bool = False,
137return_special_tokens_mask: bool = False,
138return_offsets_mapping: bool = False,
139return_length: bool = False,
140verbose: bool = True,
141) -> Dict[str, Any]:
142""" Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict.
143
144Overflowing tokens are converted to additional examples (like batches) so the output values of
145the dict are lists (overflows) of lists (tokens).
146
147Output shape: (overflows, sequence length)
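
        For instance, a single encoding that overflowed once might produce (illustrative ids)::

            {"input_ids": [[101, 7592, 102], [101, 2088, 102]],
             "attention_mask": [[1, 1, 1], [1, 1, 1]]}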
148"""
149if return_token_type_ids is None:
150return_token_type_ids = "token_type_ids" in self.model_input_names
151if return_attention_mask is None:
152return_attention_mask = "attention_mask" in self.model_input_names
153
154if return_overflowing_tokens and encoding.overflowing is not None:
155encodings = [encoding] + encoding.overflowing
156else:
157encodings = [encoding]
158
159encoding_dict = defaultdict(list)
160for e in encodings:
161encoding_dict["input_ids"].append(e.ids)
162
163if return_token_type_ids:
164encoding_dict["token_type_ids"].append(e.type_ids)
165if return_attention_mask:
166encoding_dict["attention_mask"].append(e.attention_mask)
167if return_special_tokens_mask:
168encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
169if return_offsets_mapping:
170encoding_dict["offset_mapping"].append(e.offsets)
171if return_length:
172encoding_dict["length"].append(len(e.ids))
173
174return encoding_dict
175
176def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
177"""
178Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
179vocabulary.
180
181Args:
182token (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
183
184Returns:
185:obj:`int` or :obj:`List[int]`: The token id or list of token ids.
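
        Example (illustrative; assumes a BERT-style ``tokenizer``, so the actual ids vary by model)::

            tokenizer.convert_tokens_to_ids(["hello", "world"])  # e.g. [7592, 2088]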
186"""
187if tokens is None:
188return None
189
190if isinstance(tokens, str):
191return self._convert_token_to_id_with_added_voc(tokens)
192
193ids = []
194for token in tokens:
195ids.append(self._convert_token_to_id_with_added_voc(token))
196return ids
197
198def _convert_token_to_id_with_added_voc(self, token: str) -> int:
199index = self._tokenizer.token_to_id(token)
200if index is None:
201return self.unk_token_id
202return index
203
204def _convert_id_to_token(self, index: int) -> Optional[str]:
205return self._tokenizer.id_to_token(int(index))
206
207def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
208if special_tokens:
209return self._tokenizer.add_special_tokens(new_tokens)
210
211return self._tokenizer.add_tokens(new_tokens)
212
213def num_special_tokens_to_add(self, pair: bool = False) -> int:
214"""
215Returns the number of added tokens when encoding a sequence with special tokens.
216
217.. note::
218This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
219put this inside your training loop.
220
221Args:
222pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
223Whether the number of added tokens should be computed in the case of a sequence pair or a single
224sequence.
225
226Returns:
227:obj:`int`: Number of special tokens added to sequences.
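
        Example (illustrative; for a BERT-style tokenizer the single-sequence template is
        ``[CLS] ... [SEP]`` and the pair template is ``[CLS] ... [SEP] ... [SEP]``)::

            tokenizer.num_special_tokens_to_add(pair=False)  # e.g. 2
            tokenizer.num_special_tokens_to_add(pair=True)   # e.g. 3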
228"""
229return self._tokenizer.num_special_tokens_to_add(pair)
230
231def convert_ids_to_tokens(
232self, ids: Union[int, List[int]], skip_special_tokens: bool = False
233) -> Union[str, List[str]]:
234"""
235Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary
236and added tokens.
237
238Args:
239ids (:obj:`int` or :obj:`List[int]`):
240The token id (or token ids) to convert to tokens.
241skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
242Whether or not to remove special tokens in the decoding.
243
244Returns:
245:obj:`str` or :obj:`List[str]`: The decoded token(s).
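
        Example (illustrative; the ids shown assume a BERT-style vocabulary)::

            tokenizer.convert_ids_to_tokens([101, 7592, 102], skip_special_tokens=True)  # e.g. ["hello"]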
246"""
247if isinstance(ids, int):
248return self._tokenizer.id_to_token(ids)
249tokens = []
250for index in ids:
251index = int(index)
252if skip_special_tokens and index in self.all_special_ids:
253continue
254tokens.append(self._tokenizer.id_to_token(index))
255return tokens
256
257def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False) -> List[str]:
258"""
259Converts a string in a sequence of tokens, using the backend Rust tokenizer.
260
261Args:
262text (:obj:`str`):
263The sequence to be encoded.
264pair (:obj:`str`, `optional`):
265A second sequence to be encoded with the first.
266add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
267Whether or not to add the special tokens associated with the corresponding model.
268
269Returns:
270:obj:`List[str]`: The list of tokens.
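
        Example (illustrative; a WordPiece-style tokenizer would split rare words into subwords)::

            tokenizer.tokenize("Hello tokenization!")  # e.g. ["hello", "token", "##ization", "!"]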
271"""
272return self._tokenizer.encode(text, pair, add_special_tokens=add_special_tokens).tokens
273
274def set_truncation_and_padding(
275self,
276padding_strategy: PaddingStrategy,
277truncation_strategy: TruncationStrategy,
278max_length: int,
279stride: int,
280pad_to_multiple_of: Optional[int],
281):
282"""
283Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
284library) and restore the tokenizer settings afterwards.
285
286The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
287padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
288section.
289
290Args:
291padding_strategy (:class:`~transformers.tokenization_utils_base.PaddingStrategy`):
292The kind of padding that will be applied to the input
293truncation_strategy (:class:`~transformers.tokenization_utils_base.TruncationStrategy`):
294The kind of truncation that will be applied to the input
295max_length (:obj:`int`):
296The maximum size of a sequence.
297stride (:obj:`int`):
298The stride to use when handling overflow.
299pad_to_multiple_of (:obj:`int`, `optional`):
300If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
301the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
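
        Example (illustrative; pads every input to a fixed length of 128 and truncates longer inputs)::

            tokenizer.set_truncation_and_padding(
                padding_strategy=PaddingStrategy.MAX_LENGTH,
                truncation_strategy=TruncationStrategy.LONGEST_FIRST,
                max_length=128,
                stride=0,
                pad_to_multiple_of=None,
            )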
302"""
303# Set truncation and padding on the backend tokenizer
304if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
305self._tokenizer.enable_truncation(max_length, stride=stride, strategy=truncation_strategy.value)
306else:
307self._tokenizer.no_truncation()
308
309if padding_strategy != PaddingStrategy.DO_NOT_PAD:
310self._tokenizer.enable_padding(
311length=max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None,
312direction=self.padding_side,
313pad_id=self.pad_token_id,
314pad_type_id=self.pad_token_type_id,
315pad_token=self.pad_token,
316pad_to_multiple_of=pad_to_multiple_of,
317)
318else:
319self._tokenizer.no_padding()
320
321def _batch_encode_plus(
322self,
323batch_text_or_text_pairs: Union[
324List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
325],
326add_special_tokens: bool = True,
327padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
328truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
329max_length: Optional[int] = None,
330stride: int = 0,
331is_pretokenized: bool = False,
332pad_to_multiple_of: Optional[int] = None,
333return_tensors: Optional[str] = None,
334return_token_type_ids: Optional[bool] = None,
335return_attention_mask: Optional[bool] = None,
336return_overflowing_tokens: bool = False,
337return_special_tokens_mask: bool = False,
338return_offsets_mapping: bool = False,
339return_length: bool = False,
340verbose: bool = True,
341**kwargs
342) -> BatchEncoding:
343
344if not isinstance(batch_text_or_text_pairs, list):
345raise ValueError(
346"batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs))
347)
348
349if kwargs:
350raise ValueError(f"Keyword arguments {kwargs} not recognized.")
351
352# Set the truncation and padding strategy and restore the initial configuration
353self.set_truncation_and_padding(
354padding_strategy=padding_strategy,
355truncation_strategy=truncation_strategy,
356max_length=max_length,
357stride=stride,
358pad_to_multiple_of=pad_to_multiple_of,
359)
360
361# Avoid thread overhead if only one example.
362if len(batch_text_or_text_pairs) == 1:
363if isinstance(batch_text_or_text_pairs[0], tuple):
364# We got a Tuple with a pair of sequences
365encodings = self._tokenizer.encode(
366*batch_text_or_text_pairs[0],
367add_special_tokens=add_special_tokens,
368is_pretokenized=is_pretokenized,
369)
370else:
371# We got a single sequence
372encodings = self._tokenizer.encode(
373batch_text_or_text_pairs[0],
374add_special_tokens=add_special_tokens,
375is_pretokenized=is_pretokenized,
376)
377encodings = [encodings]
378else:
379encodings = self._tokenizer.encode_batch(
380batch_text_or_text_pairs, add_special_tokens=add_special_tokens, is_pretokenized=is_pretokenized
381)
382
383# Convert encoding to dict
384# `Tokens` has type: List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]]
385# with nested dimensions corresponding to batch, overflows, sequence length
386tokens = [
387self._convert_encoding(
388encoding=encoding,
389return_token_type_ids=return_token_type_ids,
390return_attention_mask=return_attention_mask,
391return_overflowing_tokens=return_overflowing_tokens,
392return_special_tokens_mask=return_special_tokens_mask,
393return_offsets_mapping=return_offsets_mapping,
394return_length=return_length,
395verbose=verbose,
396)
397for encoding in encodings
398]
399
400# Convert the output to have dict[list] from list[dict]
401sanitized = {}
402for key in tokens[0].keys():
403# To List[List[List[int]]] of shape (batch, overflows, sequence length)
404stack = [e for item in tokens for e in item[key]]
405sanitized[key] = stack
406
407# If returning overflowing tokens, we need to return a mapping
408# from the batch idx to the original sample
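        # e.g. if sample 0 produced three overflow chunks and sample 1 produced one,
        # overflow_to_sample_mapping would be [0, 0, 0, 1] (illustrative values).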
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, enc in enumerate(tokens):
                overflow_to_sample_mapping += [i] * len(enc["input_ids"])
            sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        return BatchEncoding(sanitized, encodings, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:

        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_pretokenized=is_pretokenized,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # If return_tensors is None, we can remove the leading batch axis.
        # Overflowing tokens are returned as a batch of outputs, so we keep them batched in that case.
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        return batched_output

    def decode(
        self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
    ) -> str:
        """
        Converts a sequence of ids to a string, using the tokenizer and vocabulary
        with options to remove special tokens and clean up tokenization spaces.

        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
            token_ids (:obj:`List[int]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.

        Returns:
            :obj:`str`: The decoded sentence.
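
        Example (illustrative; the ids assume a BERT-style vocabulary)::

            tokenizer.decode([101, 7592, 2088, 102], skip_special_tokens=True)  # e.g. "hello world"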
492"""
493text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
494
495if clean_up_tokenization_spaces:
496clean_text = self.clean_up_tokenization(text)
497return clean_text
498else:
499return text
500
501def save_vocabulary(self, save_directory: str) -> Tuple[str]:
502"""
503Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
504and special token mappings.
505
506.. warning::
507Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
508you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
509
510Args:
511save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
512
513Returns:
514A tuple of :obj:`str`: The files saved.
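
        Example (illustrative; ``./my_tokenizer`` is a hypothetical existing directory, and the files
        written depend on the backend model)::

            files = tokenizer.save_vocabulary("./my_tokenizer")  # e.g. ("./my_tokenizer/vocab.txt",)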
515"""
516if os.path.isdir(save_directory):
517files = self._tokenizer.save_model(save_directory)
518else:
519folder, file = os.path.split(os.path.abspath(save_directory))
520files = self._tokenizer.save_model(folder, name=file)
521
522return tuple(files)
523