# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for python tokenizers.
    For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py
"""

import itertools
import logging
import re
import unicodedata
from typing import Any, Dict, List, Optional, Tuple, Union

from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PaddingStrategy,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TensorType,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)


logger = logging.getLogger(__name__)


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _is_end_of_word(text):
    """Checks whether the last character in text is a punctuation, control or whitespace character."""
    last_char = text[-1]
    return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))


def _is_start_of_word(text):
    """Checks whether the first character in text is a punctuation, control or whitespace character."""
    first_char = text[0]
    return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING, """ .. automethod:: __call__""")
class PreTrainedTokenizer(PreTrainedTokenizerBase):
    """
    Base class for all slow tokenizers.

    Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers, so we don't
    have to handle the specific vocabulary augmentation methods of the various underlying
    dictionary structures (BPE, sentencepiece...).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Added tokens - We store this for both slow and fast tokenizers
        # until the serialization of Fast tokenizers is updated
        self.added_tokens_encoder: Dict[str, int] = {}
        self.added_tokens_decoder: Dict[int, str] = {}
        self.unique_no_split_tokens: List[str] = []

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def vocab_size(self) -> int:
        """
        :obj:`int`: Size of the base vocabulary (without the added tokens).
        """
        raise NotImplementedError

    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
        :obj:`token` is in the vocab.

        Returns:
            :obj:`Dict[str, int]`: The vocabulary.
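
        A minimal sketch of this equivalence, assuming a concrete subclass such as
        :class:`~transformers.BertTokenizer` and a token that is in its vocabulary::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            vocab = tokenizer.get_vocab()
            assert vocab["hello"] == tokenizer.convert_tokens_to_ids("hello")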
        """
        raise NotImplementedError()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            :obj:`Dict[str, int]`: The added tokens.
        """
        return self.added_tokens_encoder

    def __len__(self):
        """
        Size of the full vocabulary with the added tokens.
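
        The full size grows as tokens are added. A minimal sketch, assuming a concrete subclass such as
        :class:`~transformers.BertTokenizer`::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            old_size = len(tokenizer)
            tokenizer.add_tokens(['new_tok1'])  # a token that is not already in the vocabulary
            assert len(tokenizer) == old_size + 1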
        """
        return self.vocab_size + len(self.added_tokens_encoder)

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
        vocabulary, they are added to it with indices starting from the length of the current vocabulary.

        Args:
            new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
                Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
                checking if the tokenizer assigns the index of the ``unk_token`` to them).
            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            :obj:`int`: The number of tokens actually added to the vocabulary.

        Examples::

            # Let's see how to increase the vocabulary of Bert model and tokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')

            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))
        """
        new_tokens = [str(tok) for tok in new_tokens]

        tokens_to_add = []
        for token in new_tokens:
            assert isinstance(token, str)
            if not special_tokens and self.init_kwargs.get("do_lower_case", False):
                token = token.lower()
            if (
                token != self.unk_token
                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
                and token not in tokens_to_add
            ):
                tokens_to_add.append(token)
                if self.verbose:
                    logger.info("Adding %s to the vocabulary", token)

        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
        if special_tokens:
            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
        else:
            # Or on the newly added tokens
            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))

        return len(tokens_to_add)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        .. note::
            This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
            put this inside your training loop.

        Args:
            pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            :obj:`int`: Number of special tokens added to sequences.
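
        For example, a sketch assuming a concrete subclass such as :class:`~transformers.BertTokenizer`, which wraps
        sequences in ``[CLS]``/``[SEP]``::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.num_special_tokens_to_add(pair=False)  # 2 -> [CLS] ... [SEP]
            tokenizer.num_special_tokens_to_add(pair=True)   # 3 -> [CLS] ... [SEP] ... [SEP]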
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Splits into words for word-based vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
        Takes care of added tokens.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method.

        Returns:
            :obj:`List[str]`: The list of tokens.
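
        A minimal usage sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer` (the exact
        sub-tokens depend on that model's vocabulary)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.tokenize("Hello, tokenization!")
            # e.g. ['hello', ',', 'token', '##ization', '!'] for an uncased WordPiece vocabulary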
        """
        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
        all_special_tokens_extended = dict(
            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
        )

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if kwargs:
            logger.warning(f"Keyword arguments {kwargs} not recognized.")

        # TODO: should this be in the base class?
        if self.init_kwargs.get("do_lower_case", False):
            # convert non-special tokens to lowercase
            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        def split_on_token(tok, text):
            result = []
            tok_extended = all_special_tokens_extended.get(tok, None)
            split_text = text.split(tok)
            full_word = ""
            for i, sub_text in enumerate(split_text):
                # AddedToken can control whitespace stripping around them.
                # We use them for GPT2 and Roberta to have different behavior depending on the special token
                # Cf. https://github.com/huggingface/transformers/pull/2778
                # and https://github.com/huggingface/transformers/issues/3788
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.single_word:
                        # Try to avoid splitting on token
                        if (
                            i < len(split_text) - 1
                            and not _is_end_of_word(sub_text)
                            and not _is_start_of_word(split_text[i + 1])
                        ):
                            # Don't extract the special token
                            full_word += sub_text + tok
                        elif full_word:
                            full_word += sub_text
                            result += [full_word]
                            full_word = ""
                            continue
                    # Strip white spaces on the right
                    if tok_extended.rstrip and i > 0:
                        # A bit counter-intuitive but we strip the left of the string
                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
                        sub_text = sub_text.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and i < len(split_text) - 1:
                        sub_text = sub_text.rstrip()  # Opposite here
                else:
                    # We strip left and right by default
                    if i < len(split_text) - 1:
                        sub_text = sub_text.rstrip()
                    if i > 0:
                        sub_text = sub_text.lstrip()

                if i == 0 and not sub_text:
                    result += [tok]
                elif i == len(split_text) - 1:
                    if sub_text:
                        result += [sub_text]
                    else:
                        pass
                else:
                    if sub_text:
                        result += [sub_text]
                    result += [tok]
            return result

        def split_on_tokens(tok_list, text):
            if not text.strip():
                return []
            if not tok_list:
                return self._tokenize(text)

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.unique_no_split_tokens:
                        tokenized_text += split_on_token(tok, sub_text)
                    else:
                        tokenized_text += [sub_text]
                text_list = tokenized_text

            return list(
                itertools.chain.from_iterable(
                    (
                        self._tokenize(token) if token not in self.unique_no_split_tokens else [token]
                        for token in tokenized_text
                    )
                )
            )

        no_split_token = self.unique_no_split_tokens
        tokenized_text = split_on_tokens(no_split_token, text)
        return tokenized_text

    def _tokenize(self, text, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer.
        Splits into words for word-based vocabularies or sub-words for sub-word-based vocabularies
        (BPE/SentencePiece/WordPiece).

        Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
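
        A minimal round-trip sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer`::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokens = tokenizer.tokenize("Hello world")
            ids = tokenizer.convert_tokens_to_ids(tokens)
            assert tokenizer.convert_ids_to_tokens(ids) == tokens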
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        raise NotImplementedError

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_pretokenized:
                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                if is_pretokenized:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
                    )
                else:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                    )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        first_ids = get_input_ids(text)
        second_ids = get_input_ids(text_pair) if text_pair is not None else None

        return self.prepare_for_model(
            first_ids,
            pair_ids=second_ids,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_pretokenized:
                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            else:
                ids, pair_ids = ids_or_pair_ids

            first_ids = get_input_ids(ids)
            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
            input_ids.append((first_ids, second_ids))

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return BatchEncoding(batch_outputs)

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
        It adds special tokens, truncates sequences if overflowing while taking into account the special tokens, and
        manages a moving window (with a user-defined stride) for overflowing tokens.

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        batch_outputs = {}
        for first_ids, second_ids in batch_ids_pairs:
            outputs = self.prepare_for_model(
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs

    def prepare_for_tokenization(
        self, text: str, is_pretokenized: bool = False, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well.
        We test the :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (:obj:`str`):
                The text to prepare.
            is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the text has been pretokenized.
            kwargs:
                Keyword arguments to use for the tokenization.

        Returns:
            :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
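
        A sketch of a subclass override that consumes a keyword argument before tokenization (the
        ``add_prefix_space`` argument is only an illustration, in the spirit of the GPT-2 tokenizer's override)::

            def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
                add_prefix_space = kwargs.pop("add_prefix_space", False)
                if add_prefix_space:
                    text = " " + text
                return (text, kwargs)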
        """
        return (text, kwargs)

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (:obj:`List[int]`, `optional`):
                List of ids of the second sequence.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
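
        This base implementation returns a mask of zeros; subclasses that add special tokens override it. A sketch
        assuming :class:`~transformers.BertTokenizer`::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            ids = tokenizer.encode("Hello world", add_special_tokens=True)
            tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
            # e.g. [1, 0, 0, 1] for [CLS] hello world [SEP]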
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary
        and added tokens.

        Args:
            ids (:obj:`int` or :obj:`List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            :obj:`str` or :obj:`List[str]`: The decoded token(s).
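
        A minimal sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer`::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            ids = tokenizer.encode("Hello world", add_special_tokens=True)
            tokenizer.convert_ids_to_tokens(ids)
            # e.g. ['[CLS]', 'hello', 'world', '[SEP]']
            tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
            # e.g. ['hello', 'world']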
        """
        if isinstance(ids, int):
            if ids in self.added_tokens_decoder:
                return self.added_tokens_decoder[ids]
            else:
                return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[index])
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens (string) into a single string.

        The simplest way to do it is ``" ".join(tokens)``, but we often want to remove
        sub-word tokenization artifacts at the same time.

        Args:
            tokens (:obj:`List[str]`): The tokens to join into a string.

        Return: The joined tokens.
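
        The base implementation is a plain join; subclasses typically also undo their sub-word markers. A sketch
        assuming :class:`~transformers.BertTokenizer`, which strips the WordPiece ``##`` prefixes::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.convert_tokens_to_string(['token', '##ization'])
            # 'tokenization'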
        """
        return " ".join(tokens)

    def decode(
        self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
    ) -> str:
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary,
        with options to remove special tokens and clean up tokenization spaces.

        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
            token_ids (:obj:`List[int]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.

        Returns:
            :obj:`str`: The decoded sentence.
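
        A minimal round-trip sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer`
        (an uncased model lower-cases its input)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            ids = tokenizer.encode("Hello world!", add_special_tokens=True)
            tokenizer.decode(ids, skip_special_tokens=True)
            # 'hello world!'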
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # To avoid mixing byte-level and unicode for byte-level BPE
        # we need to build the string separately for added tokens and byte-level tokens
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_ids:
                continue
            if token in self.added_tokens_encoder:
                if current_sub_text:
                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
        text = " ".join(sub_texts)

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def save_vocabulary(self, save_directory) -> Tuple[str]:
        """
        Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
        and special token mappings.

        .. warning::
            Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
            you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.

        Args:
            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.

        Returns:
            A tuple of :obj:`str`: The files saved.
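
        A minimal sketch, assuming a concrete subclass such as :class:`~transformers.BertTokenizer`, which writes a
        ``vocab.txt`` file; prefer :meth:`~transformers.PreTrainedTokenizer.save_pretrained` for the full state::

            import os

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            os.makedirs('./my_tokenizer', exist_ok=True)
            vocab_files = tokenizer.save_vocabulary('./my_tokenizer')
            tokenizer.save_pretrained('./my_tokenizer')  # also saves added tokens and special-token mappings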
        """
        raise NotImplementedError