# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import shutil
import tempfile
import unittest
from typing import List

from transformers import (
    AddedToken,
    LayoutXLMTokenizerFast,
    SpecialTokensMixin,
    is_tf_available,
    is_torch_available,
    logging,
)
from transformers.models.layoutxlm.tokenization_layoutxlm import LayoutXLMTokenizer
from transformers.testing_utils import (
    get_tests_dir,
    is_pt_tf_cross_test,
    require_pandas,
    require_sentencepiece,
    require_tokenizers,
    require_torch,
    slow,
)

from ...test_tokenization_common import (
    SMALL_TRAINING_CORPUS,
    TokenizerTesterMixin,
    filter_non_english,
    merge_model_tokenizer_mappings,
)


logger = logging.get_logger(__name__)
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
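# `test_sentencepiece.model` is the small SentencePiece fixture shipped with the test
# suite; it stands in for the full pretrained vocabulary so that a slow tokenizer can
# be built quickly in `setUp`.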


@require_sentencepiece
@require_tokenizers
@require_pandas
class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = LayoutXLMTokenizer
    rust_tokenizer_class = LayoutXLMTokenizerFast
    test_rust_tokenizer = True
    from_pretrained_filter = filter_non_english
    test_seq2seq = False
    test_sentencepiece = True
    maxDiff = None
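    # These class attributes configure `TokenizerTesterMixin`: `test_rust_tokenizer`
    # turns on the fast (Rust) tokenizer checks, `test_sentencepiece` enables the
    # sentencepiece-specific checks, and `test_seq2seq` is disabled since it does not
    # apply here. `maxDiff = None` lets unittest print full assertion diffs.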

    def get_words_and_boxes(self):
        words = ["a", "weirdly", "test"]
        boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]]

        return words, boxes

    def get_words_and_boxes_batch(self):
        words = [["a", "weirdly", "test"], ["hello", "my", "name", "is", "bob"]]
        boxes = [
            [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]],
            [[961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]],
        ]

        return words, boxes

    def get_question_words_and_boxes(self):
        question = "what's his name?"
        words = ["a", "weirdly", "test"]
        boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]]

        return question, words, boxes

    def get_question_words_and_boxes_batch(self):
        questions = ["what's his name?", "how is he called?"]
        words = [["a", "weirdly", "test"], ["what", "a", "laif", "gastn"]]
        boxes = [
            [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]],
            [[256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]],
        ]

        return questions, words, boxes

    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)

    def get_input_output_texts(self, tokenizer):
        input_text = "UNwant\u00E9d,running"
        output_text = "unwanted, running"
        return input_text, output_text
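
    # For reference, every call in these tests follows the word-level input format that
    # LayoutXLM expects: a list of words plus one `[x0, y0, x1, y1]` bounding box per
    # word (coordinates are assumed to be on the 0-1000 normalized scale used by the
    # LayoutLM family), e.g.:
    #
    #   words = ["hello", "world"]
    #   boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]
    #   encoding = tokenizer(words, boxes=boxes)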

    # override test in `test_tokenization_common.py` because of the required input format of the `__call__` method of
    # this tokenizer
    def test_save_sentencepiece_tokenizer(self) -> None:
        if not self.test_sentencepiece or not self.test_slow_tokenizer:
            return
        # We want to verify that we will be able to save the tokenizer even if the original files that were used to
        # build the tokenizer have been deleted in the meantime.
        words, boxes = self.get_words_and_boxes()

        tokenizer_slow_1 = self.get_tokenizer()
        encoding_tokenizer_slow_1 = tokenizer_slow_1(
            words,
            boxes=boxes,
        )

        tmpdirname_1 = tempfile.mkdtemp()
        tmpdirname_2 = tempfile.mkdtemp()

        tokenizer_slow_1.save_pretrained(tmpdirname_1)
        tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
        encoding_tokenizer_slow_2 = tokenizer_slow_2(
            words,
            boxes=boxes,
        )

        shutil.rmtree(tmpdirname_1)
        tokenizer_slow_2.save_pretrained(tmpdirname_2)

        tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
        encoding_tokenizer_slow_3 = tokenizer_slow_3(
            words,
            boxes=boxes,
        )
        shutil.rmtree(tmpdirname_2)

        self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
        self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)

    def test_split_special_tokens(self):
        tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutxlm-base")
        _, _, boxes = self.get_question_words_and_boxes()
        special_token = "[SPECIAL_TOKEN]"
        tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
        encoded_special_token = tokenizer.tokenize(special_token, boxes=boxes, add_special_tokens=False)
        self.assertEqual(len(encoded_special_token), 1)

        encoded_split_special_token = tokenizer.tokenize(
            special_token, add_special_tokens=False, split_special_tokens=True, boxes=boxes
        )
        self.assertTrue(len(encoded_split_special_token) > 1)

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutxlm-base")

        question, words, boxes = self.get_question_words_and_boxes()

        text = tokenizer.encode(
            question.split(),
            boxes=[tokenizer.pad_token_box for _ in range(len(question.split()))],
            add_special_tokens=False,
        )
        text_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)

        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_pair == [0] + text + [2] + [2] + text_2 + [2]
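        # In the XLM-RoBERTa sentencepiece vocabulary that LayoutXLM builds on, id 0 is
        # `<s>` and id 2 is `</s>`, so the expected pair layout is `<s> A </s></s> B </s>`.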

    def test_offsets_with_special_characters(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                words, boxes = self.get_words_and_boxes()
                words[1] = tokenizer_r.mask_token
                tokens = tokenizer_r.encode_plus(
                    words,
                    boxes=boxes,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )

                expected_results = [
                    ((0, 0), tokenizer_r.cls_token),
                    ((0, 1), "▁a"),
                    ((0, 6), tokenizer_r.mask_token),
                    ((0, 4), "▁test"),
                    ((0, 0), tokenizer_r.sep_token),
                ]

                self.assertEqual(
                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
                )
                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])

    def test_add_special_tokens(self):
        tokenizers: List[LayoutXLMTokenizer] = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                special_token = "[SPECIAL_TOKEN]"
                special_token_box = [1000, 1000, 1000, 1000]

                tokenizer.add_special_tokens({"cls_token": special_token})
                encoded_special_token = tokenizer.encode(
                    [special_token], boxes=[special_token_box], add_special_tokens=False
                )
                self.assertEqual(len(encoded_special_token), 1)

                decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
                self.assertTrue(special_token not in decoded)

    def test_add_tokens_tokenizer(self):
        tokenizers: List[LayoutXLMTokenizer] = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                # We usually have added tokens from the start in tests because our vocab fixtures are
                # smaller than the original vocabs - let's not assert this
                # self.assertEqual(vocab_size, all_size)

                new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                words = "aaaaa bbbbbb low cccccccccdddddddd l".split()
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]

                tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)

                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                vocab_size_3 = tokenizer.vocab_size
                all_size_3 = len(tokenizer)

                self.assertNotEqual(vocab_size_3, 0)
                self.assertEqual(vocab_size, vocab_size_3)
                self.assertEqual(added_toks_2, len(new_toks_2))
                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

                words = ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l".split()
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]

                tokens = tokenizer.encode(
                    words,
                    boxes=boxes,
                    add_special_tokens=False,
                )

                self.assertGreaterEqual(len(tokens), 6)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[0], tokens[1])
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokens[-3])
                self.assertEqual(tokens[0], tokenizer.eos_token_id)
                self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    @require_tokenizers
    def test_encode_decode_with_spaces(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
                tokenizer.add_tokens(new_toks)
                input = "[ABC][DEF][ABC][DEF]"
                if self.space_between_special_tokens:
                    output = "[ABC] [DEF] [ABC] [DEF]"
                else:
                    output = input
                encoded = tokenizer.encode(input.split(), boxes=boxes, add_special_tokens=False)
                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
                self.assertIn(decoded, [output, output.lower()])

    def test_encode_plus_with_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                padding_size = 10
                padding_idx = tokenizer.pad_token_id

                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_special_tokens_mask=True)
                input_ids = encoded_sequence["input_ids"]
                special_tokens_mask = encoded_sequence["special_tokens_mask"]
                sequence_length = len(input_ids)

                # Test 'longest' and 'no_padding' don't do anything
                tokenizer.padding_side = "right"

                not_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertTrue(sequence_length == not_padded_sequence_length)
                self.assertTrue(input_ids == not_padded_input_ids)
                self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)

                not_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertTrue(sequence_length == not_padded_sequence_length)
                self.assertTrue(input_ids == not_padded_input_ids)
                self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)

                # Test right padding
                tokenizer.padding_side = "right"

                right_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                right_padded_input_ids = right_padded_sequence["input_ids"]

                right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
                right_padded_sequence_length = len(right_padded_input_ids)

                self.assertTrue(sequence_length + padding_size == right_padded_sequence_length)
                self.assertTrue(input_ids + [padding_idx] * padding_size == right_padded_input_ids)
                self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)

                # Test left padding
                tokenizer.padding_side = "left"
                left_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                left_padded_input_ids = left_padded_sequence["input_ids"]
                left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                left_padded_sequence_length = len(left_padded_input_ids)

                self.assertTrue(sequence_length + padding_size == left_padded_sequence_length)
                self.assertTrue([padding_idx] * padding_size + input_ids == left_padded_input_ids)
                self.assertTrue([1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask)

                if "token_type_ids" in tokenizer.model_input_names:
                    token_type_ids = encoded_sequence["token_type_ids"]
                    left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                    right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

                    assert token_type_ids + [0] * padding_size == right_padded_token_type_ids
                    assert [0] * padding_size + token_type_ids == left_padded_token_type_ids

                if "attention_mask" in tokenizer.model_input_names:
                    attention_mask = encoded_sequence["attention_mask"]
                    right_padded_attention_mask = right_padded_sequence["attention_mask"]
                    left_padded_attention_mask = left_padded_sequence["attention_mask"]

                    self.assertTrue(attention_mask + [0] * padding_size == right_padded_attention_mask)
                    self.assertTrue([0] * padding_size + attention_mask == left_padded_attention_mask)

    def test_internal_consistency(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                tokens = []
                for word in words:
                    tokens.extend(tokenizer.tokenize(word))
                ids = tokenizer.convert_tokens_to_ids(tokens)
                ids_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                self.assertListEqual(ids, ids_2)

                tokens_2 = tokenizer.convert_ids_to_tokens(ids)
                self.assertNotEqual(len(tokens_2), 0)
                text_2 = tokenizer.decode(ids)
                self.assertIsInstance(text_2, str)

                output_text = "a weirdly test"
                self.assertEqual(text_2, output_text)

    def test_mask_output(self):
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                if (
                    tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
                    and "token_type_ids" in tokenizer.model_input_names
                ):
                    information = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
                    sequences, mask = information["input_ids"], information["token_type_ids"]
                    self.assertEqual(len(sequences), len(mask))

    def test_number_of_added_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # test 1: single sequence
                words, boxes = self.get_words_and_boxes()

                sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                attached_sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)

                # Method is implemented (e.g. not GPT-2)
                if len(attached_sequences) != 2:
                    self.assertEqual(
                        tokenizer.num_special_tokens_to_add(pair=False), len(attached_sequences) - len(sequences)
                    )

                # test 2: two sequences
                question, words, boxes = self.get_question_words_and_boxes()

                sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=False)
                attached_sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=True)

                # Method is implemented (e.g. not GPT-2)
                if len(attached_sequences) != 2:
                    self.assertEqual(
                        tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                    )

    def test_padding_to_max_length(self):
        """We keep this test for backward compatibility but it should be removed once `pad_to_max_length` is deprecated"""
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                padding_idx = tokenizer.pad_token_id

                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)
                # FIXME: the next line should be padding(max_length) to avoid warning
                padded_sequence = tokenizer.encode(
                    words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

                # Check that nothing is done when a maximum length is not specified
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

    def test_padding(self, max_length=50):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                pad_token_id = tokenizer_p.pad_token_id

                # Encode - Simple input
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.encode(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode(words, boxes=boxes, padding=True)
                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                # Encode - Pair input
                question, words, boxes = self.get_question_words_and_boxes()
                input_r = tokenizer_r.encode(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(question, words, boxes=boxes, padding=True)
                input_p = tokenizer_p.encode(question, words, boxes=boxes, padding="longest")
                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                # Encode_plus - Simple input
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, padding=True)
                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )

                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Encode_plus - Pair input
                question, words, boxes = self.get_question_words_and_boxes()
                input_r = tokenizer_r.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(question, words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode_plus(question, words, boxes=boxes, padding=True)
                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Batch_encode_plus - Simple input
                words, boxes = self.get_words_and_boxes_batch()

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="max_length",
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="longest",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding=True,
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes, padding=True)
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Batch_encode_plus - Pair input
                questions, words, boxes = self.get_question_words_and_boxes_batch()

                input_r = tokenizer_r.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    padding=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    padding="longest",
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Using pad on single examples after tokenization
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.encode_plus(words, boxes=boxes)
                input_p = tokenizer_r.pad(input_p)

                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )

                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.encode_plus(words, boxes=boxes)
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)

                # Using pad after tokenization
                words, boxes = self.get_words_and_boxes_batch()
                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_p = tokenizer_r.pad(input_p)

                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Using pad after tokenization
                words, boxes = self.get_words_and_boxes_batch()
                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

    def test_padding_warning_message_fast_tokenizer(self):
        if not self.test_rust_tokenizer:
            return

        words, boxes = self.get_words_and_boxes_batch()

        tokenizer_fast = self.get_rust_tokenizer()

        encoding_fast = tokenizer_fast(
            words,
            boxes=boxes,
        )

        with self.assertLogs("transformers", level="WARNING") as cm:
            tokenizer_fast.pad(encoding_fast)
        self.assertEqual(len(cm.records), 1)
        self.assertIn(
            "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
            " encode the text followed by a call to the `pad` method to get a padded encoding.",
            cm.records[0].message,
        )

        if not self.test_slow_tokenizer:
            return

        tokenizer_slow = self.get_tokenizer()

        encoding_slow = tokenizer_slow(
            words,
            boxes=boxes,
        )

        with self.assertLogs(level="WARNING") as cm:
            # We want to assert there are no warnings, but the 'assertLogs' method does not support that.
            # Therefore, we are adding a dummy warning, and then we will assert it is the only warning.
            logger.warning("Dummy warning")
            tokenizer_slow.pad(encoding_slow)
        self.assertEqual(len(cm.records), 1)
        self.assertIn(
            "Dummy warning",
            cm.records[0].message,
        )

    def test_call(self):
        # Tests that all calls wrap to encode_plus and batch_encode_plus
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Test not batched
                words, boxes = self.get_words_and_boxes()
                encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test not batched pairs
                question, words, boxes = self.get_question_words_and_boxes()
                encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test batched
                words, boxes = self.get_words_and_boxes_batch()
                encoded_sequences_1 = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

    def test_batch_encode_plus_batch_sequence_length(self):
        # Tests that all encoded values have the correct size
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes_batch()

                encoded_sequences = [
                    tokenizer.encode_plus(words_example, boxes=boxes_example)
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes, padding=False)
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

                maximum_length = len(
                    max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
                )

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences_padded = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]

                encoded_sequences_batch_padded = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=True
                )
                self.assertListEqual(
                    encoded_sequences_padded,
                    self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
                )

                # check 'longest' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=True
                )
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest"
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

                # check 'no_padding' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=False
                )
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

    @unittest.skip("batch_encode_plus does not handle overflowing tokens.")
    def test_batch_encode_plus_overflowing_tokens(self):
        pass

    def test_batch_encode_plus_padding(self):
        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus

        # Right padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes_batch()

                max_length = 100

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

        # Left padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                tokenizer.padding_side = "left"
                words, boxes = self.get_words_and_boxes_batch()

                max_length = 100

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.pad_token is None:
                    self.skipTest("No padding token.")
                else:
                    words, boxes = self.get_words_and_boxes()

                    # empty_tokens = tokenizer([""], [[]], padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer(words, boxes=boxes, padding=True, pad_to_multiple_of=8)
                    # for key, value in empty_tokens.items():
                    #     self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    normal_tokens = tokenizer(words, boxes=boxes, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # Should also work with truncation
                    normal_tokens = tokenizer(words, boxes=boxes, padding=True, truncation=True, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # truncation to something which is not a multiple of pad_to_multiple_of raises an error
                    self.assertRaises(
                        ValueError,
                        tokenizer.__call__,
                        words,
                        boxes=boxes,
                        padding=True,
                        truncation=True,
                        max_length=12,
                        pad_to_multiple_of=8,
                    )
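                    # max_length=12 is not a multiple of pad_to_multiple_of=8, so the
                    # tokenizer is expected to refuse the combination instead of padding
                    # past the truncation length.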
940

941
    def test_tokenizer_slow_store_full_signature(self):
942
        signature = inspect.signature(self.tokenizer_class.__init__)
943
        tokenizer = self.get_tokenizer()
944

945
        for parameter_name, parameter in signature.parameters.items():
946
            if parameter.default != inspect.Parameter.empty:
947
                self.assertIn(parameter_name, tokenizer.init_kwargs)
948

949
    def test_build_inputs_with_special_tokens(self):
950
        if not self.test_slow_tokenizer:
951
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
952
            return
953

954
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
955
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
956
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
957
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
958

959
                # Input tokens id
960
                words, boxes = self.get_words_and_boxes()
961
                input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
962
                input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
963

964
                # Generate output
965
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
966
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
967
                self.assertEqual(output_p, output_r)
968

969
                # Generate pair output
970
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
971
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
972
                self.assertEqual(output_p, output_r)
973

974
    def test_special_tokens_mask_input_pairs(self):
975
        tokenizers = self.get_tokenizers(do_lower_case=False)
976
        for tokenizer in tokenizers:
977
            with self.subTest(f"{tokenizer.__class__.__name__}"):
978
                words, boxes = self.get_words_and_boxes()
979
                encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
980
                encoded_sequence_dict = tokenizer.encode_plus(
981
                    words,
982
                    boxes=boxes,
983
                    add_special_tokens=True,
984
                    return_special_tokens_mask=True,
985
                    # add_prefix_space=False,
986
                )
987
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
988
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
989
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
990

991
                filtered_sequence = [
992
                    (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
993
                ]
994
                filtered_sequence = [x for x in filtered_sequence if x is not None]
995
                self.assertEqual(encoded_sequence, filtered_sequence)
996

997
    def test_special_tokens_mask(self):
998
        tokenizers = self.get_tokenizers(do_lower_case=False)
999
        for tokenizer in tokenizers:
1000
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1001
                words, boxes = self.get_words_and_boxes()
1002
                # Testing single inputs
1003
                encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1004
                encoded_sequence_dict = tokenizer.encode_plus(
1005
                    words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True
1006
                )
1007
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
1008
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
1009
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
1010

1011
                filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
1012
                self.assertEqual(encoded_sequence, filtered_sequence)
1013

1014
    def test_save_and_load_tokenizer(self):
1015
        # safety check on max_len default value so we are sure the test works
1016
        tokenizers = self.get_tokenizers()
1017
        for tokenizer in tokenizers:
1018
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1019
                self.assertNotEqual(tokenizer.model_max_length, 42)
1020

1021
        # Now let's start the test
1022
        tokenizers = self.get_tokenizers()
1023
        for tokenizer in tokenizers:
1024
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1025
                # Isolate this from the other tests because we save additional tokens/etc
1026
                words, boxes = self.get_words_and_boxes()
1027
                tmpdirname = tempfile.mkdtemp()
1028

1029
                before_tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1030
                before_vocab = tokenizer.get_vocab()
1031
                tokenizer.save_pretrained(tmpdirname)
1032

1033
                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
1034
                after_tokens = after_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1035
                after_vocab = after_tokenizer.get_vocab()
1036
                self.assertListEqual(before_tokens, after_tokens)
1037
                self.assertDictEqual(before_vocab, after_vocab)
1038

1039
                shutil.rmtree(tmpdirname)
1040

1041
    @unittest.skip("Not implemented")
1042
    def test_right_and_left_truncation(self):
1043
        pass
1044

1045
    def test_right_and_left_padding(self):
1046
        tokenizers = self.get_tokenizers(do_lower_case=False)
1047
        for tokenizer in tokenizers:
1048
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1049
                words, boxes = self.get_words_and_boxes()
1050
                sequence = "Sequence"
1051
                padding_size = 10
1052

1053
                # check correct behaviour if no pad_token_id exists and add it eventually
1054
                self._check_no_pad_token_padding(tokenizer, sequence)
1055

1056
                padding_idx = tokenizer.pad_token_id
1057

1058
                # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
1059
                tokenizer.padding_side = "right"
1060
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
1061
                sequence_length = len(encoded_sequence)
1062
                padded_sequence = tokenizer.encode(
1063
                    words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
1064
                )
1065
                padded_sequence_length = len(padded_sequence)
1066
                assert sequence_length + padding_size == padded_sequence_length
1067
                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
1068

1069
                # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
1070
                tokenizer.padding_side = "left"
1071
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
1072
                sequence_length = len(encoded_sequence)
1073
                padded_sequence = tokenizer.encode(
1074
                    words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
1075
                )
1076
                padded_sequence_length = len(padded_sequence)
1077
                assert sequence_length + padding_size == padded_sequence_length
1078
                assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
1079

1080
                # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
1081
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
1082
                sequence_length = len(encoded_sequence)
1083

1084
                tokenizer.padding_side = "right"
1085
                padded_sequence_right = tokenizer.encode(words, boxes=boxes, padding=True)
1086
                padded_sequence_right_length = len(padded_sequence_right)
1087
                assert sequence_length == padded_sequence_right_length
1088
                assert encoded_sequence == padded_sequence_right
1089

1090
                tokenizer.padding_side = "left"
1091
                padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding="longest")
1092
                padded_sequence_left_length = len(padded_sequence_left)
1093
                assert sequence_length == padded_sequence_left_length
1094
                assert encoded_sequence == padded_sequence_left
1095

1096
                tokenizer.padding_side = "right"
1097
                padded_sequence_right = tokenizer.encode(words, boxes=boxes)
1098
                padded_sequence_right_length = len(padded_sequence_right)
1099
                assert sequence_length == padded_sequence_right_length
1100
                assert encoded_sequence == padded_sequence_right
1101

1102
                tokenizer.padding_side = "left"
1103
                padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding=False)
1104
                padded_sequence_left_length = len(padded_sequence_left)
1105
                assert sequence_length == padded_sequence_left_length
1106
                assert encoded_sequence == padded_sequence_left
1107

1108
    def test_token_type_ids(self):
1109
        tokenizers = self.get_tokenizers()
1110
        for tokenizer in tokenizers:
1111
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1112
                # test 1: single sequence
1113
                words, boxes = self.get_words_and_boxes()
1114

1115
                output = tokenizer(words, boxes=boxes, return_token_type_ids=True)
1116

1117
                # Assert that the token type IDs have the same length as the input IDs
1118
                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
1119

1120
                # Assert that the token type IDs have the same length as the attention mask
1121
                self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"]))
1122

1123
                self.assertIn(0, output["token_type_ids"])
1124
                self.assertNotIn(1, output["token_type_ids"])
1125

1126
                # test 2: two sequences (question + words)
1127
                question, words, boxes = self.get_question_words_and_boxes()
1128

1129
                output = tokenizer(question, words, boxes, return_token_type_ids=True)
1130

1131
                # Assert that the token type IDs have the same length as the input IDs
1132
                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
1133

1134
                # Assert that the token type IDs have the same length as the attention mask
1135
                self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"]))
1136

1137
                self.assertIn(0, output["token_type_ids"])
1138
                self.assertNotIn(1, output["token_type_ids"])
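                # Note: the XLM-RoBERTa-based LayoutXLM tokenizer marks both segments with
                # token type id 0, which is why 1 is expected to be absent even for the
                # question + words pair above.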
1139

1140
    def test_offsets_mapping(self):
1141
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1142
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1143
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1144

1145
                text = ["a", "wonderful", "test"]
1146
                boxes = [[1, 8, 12, 20] for _ in range(len(text))]
1147

1148
                # No pair
1149
                tokens_with_offsets = tokenizer_r.encode_plus(
1150
                    text,
1151
                    boxes=boxes,
1152
                    return_special_tokens_mask=True,
1153
                    return_offsets_mapping=True,
1154
                    add_special_tokens=True,
1155
                )
1156
                added_tokens = tokenizer_r.num_special_tokens_to_add(False)
1157
                offsets = tokens_with_offsets["offset_mapping"]
1158

1159
                # Assert there is the same number of tokens and offsets
1160
                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
1161

1162
                # Assert that exactly `added_tokens` special tokens are present
                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
1164

1165
                # Pairs
1166
                text = "what's his name"
1167
                pair = ["a", "wonderful", "test"]
1168
                boxes = [[1, 8, 12, 20] for _ in range(len(pair))]
1169
                tokens_with_offsets = tokenizer_r.encode_plus(
1170
                    text,
1171
                    pair,
1172
                    boxes=boxes,
1173
                    return_special_tokens_mask=True,
1174
                    return_offsets_mapping=True,
1175
                    add_special_tokens=True,
1176
                )
1177
                added_tokens = tokenizer_r.num_special_tokens_to_add(True)
1178
                offsets = tokens_with_offsets["offset_mapping"]
1179

1180
                # Assert there is the same number of tokens and offsets
1181
                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
1182

1183
                # Assert that exactly `added_tokens` special tokens are present
                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
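                # For fast tokenizers, offset_mapping holds one (char_start, char_end) pair
                # per produced token, special tokens included, hence the 1:1 length check
                # against input_ids above.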
1185

1186
    @require_torch
1187
    @slow
1188
    def test_torch_encode_plus_sent_to_model(self):
1189
        import torch
1190

1191
        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
1192

1193
        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
1194

1195
        tokenizers = self.get_tokenizers(do_lower_case=False)
1196
        for tokenizer in tokenizers:
1197
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1198
                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
1199
                    return
1200

1201
                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
1202
                config = config_class()
1203

1204
                if config.is_encoder_decoder or config.pad_token_id is None:
1205
                    return
1206

1207
                model = model_class(config)
1208

1209
                # Make sure the model contains at least the full vocabulary size in its embedding matrix
1210
                is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
1211
                assert (
1212
                    (model.get_input_embeddings().weight.shape[0] >= len(tokenizer))
1213
                    if is_using_common_embeddings
1214
                    else True
1215
                )
1216

1217
                # Build sequence
1218
                words, boxes = self.get_words_and_boxes()
1219
                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt")
1220
                batch_encoded_sequence = tokenizer.batch_encode_plus(
1221
                    [words, words], [boxes, boxes], return_tensors="pt"
1222
                )
1223
                # This should not fail
1224

1225
                with torch.no_grad():  # saves some time
1226
                    model(**encoded_sequence)
1227
                    model(**batch_encoded_sequence)
1228

1229
    def test_rust_and_python_full_tokenizers(self):
1230
        if not self.test_rust_tokenizer:
1231
            return
1232

1233
        if not self.test_slow_tokenizer:
1234
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1235
            return
1236

1237
        tokenizer = self.get_tokenizer()
1238
        rust_tokenizer = self.get_rust_tokenizer()
1239

1240
        words, boxes = self.get_words_and_boxes()
1241

1242
        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1243
        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1244
        self.assertListEqual(ids, rust_ids)
1245

1246
        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
1247
        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
1248
        self.assertListEqual(ids, rust_ids)
1249

1250
    def test_tokenization_python_rust_equals(self):
1251
        if not self.test_slow_tokenizer:
1252
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1253
            return
1254

1255
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1256
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1257
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1258
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1259

1260
                words, boxes = self.get_words_and_boxes()
1261

1262
                # Ensure basic input match
1263
                input_p = tokenizer_p.encode_plus(words, boxes=boxes)
1264
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
1265

1266
                for key in filter(
1267
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
1268
                ):
1269
                    self.assertSequenceEqual(input_p[key], input_r[key])
1270

1271
                input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes)
1272
                input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes)
1273

1274
                for key in filter(
1275
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
1276
                ):
1277
                    self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
1278

1279
                words = ["hello" for _ in range(1000)]
1280
                boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)]
1281

1282
                # Ensure truncation match
1283
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
1284
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
1285

1286
                for key in filter(
1287
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
1288
                ):
1289
                    self.assertSequenceEqual(input_p[key], input_r[key])
1290

1291
                # Ensure truncation with stride match
1292
                input_p = tokenizer_p.encode_plus(
1293
                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
1294
                )
1295
                input_r = tokenizer_r.encode_plus(
1296
                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
1297
                )
1298

1299
                for key in filter(
1300
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
1301
                ):
1302
                    self.assertSequenceEqual(input_p[key], input_r[key][0])
1303

1304
    def test_embeded_special_tokens(self):
1305
        if not self.test_slow_tokenizer:
1306
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1307
            return
1308

1309
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1310
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1311
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1312
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1313
                words, boxes = self.get_words_and_boxes()
1314
                tokens_r = tokenizer_r.encode_plus(
1315
                    words,
1316
                    boxes=boxes,
1317
                    add_special_tokens=True,
1318
                )
1319
                tokens_p = tokenizer_p.encode_plus(
1320
                    words,
1321
                    boxes=boxes,
1322
                    add_special_tokens=True,
1323
                )
1324

1325
                for key in tokens_p.keys():
1326
                    self.assertEqual(tokens_r[key], tokens_p[key])
1327

1328
                if "token_type_ids" in tokens_r:
1329
                    self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
1330

1331
                tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
1332
                tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
1333
                self.assertSequenceEqual(tokens_r, tokens_p)
1334

1335
    def test_compare_add_special_tokens(self):
1336
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1337
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1338
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1339

1340
                simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
1341

1342
                words, boxes = self.get_words_and_boxes()
1343
                # tokenize()
1344
                no_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=False)
1345
                with_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=True)
1346
                self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)
1347

1348
                # encode()
1349
                no_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False)
1350
                with_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=True)
1351
                self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)
1352

1353
                # encode_plus()
1354
                no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False)
1355
                with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True)
1356
                for key in no_special_tokens.keys():
1357
                    self.assertEqual(
1358
                        len(no_special_tokens[key]),
1359
                        len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
1360
                    )
1361

1362
                # batch_encode_plus()
                words, boxes = self.get_words_and_boxes_batch()
1364

1365
                no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False)
1366
                with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True)
1367
                for key in no_special_tokens.keys():
1368
                    for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
1369
                        self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
1370

1371
    @slow
1372
    def test_layoutxlm_truncation_integration_test(self):
1373
        words, boxes = self.get_words_and_boxes()
1374

1375
        tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", model_max_length=512)
1376

1377
        for i in range(12, 512):
1378
            new_encoded_inputs = tokenizer.encode(words, boxes=boxes, max_length=i, truncation=True)
1379

1380
            # Ensure that the input IDs are no longer than the max length defined.
            self.assertLessEqual(len(new_encoded_inputs), i)
1382

1383
        tokenizer.model_max_length = 20
1384
        new_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True)
1385
        dropped_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True)
1386

1387
        # Ensure that the input IDs are still truncated when no max_length is specified
1388
        self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs)
1389
        self.assertLessEqual(len(new_encoded_inputs), 20)
1390

1391
    @is_pt_tf_cross_test
1392
    def test_batch_encode_plus_tensors(self):
1393
        tokenizers = self.get_tokenizers(do_lower_case=False)
1394
        for tokenizer in tokenizers:
1395
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1396
                words, boxes = self.get_words_and_boxes_batch()
1397

1398
                # A Tensor cannot be built from sequences which are not all the same size
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="pt")
1400
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="tf")
1401

1402
                if tokenizer.pad_token_id is None:
1403
                    self.assertRaises(
1404
                        ValueError,
1405
                        tokenizer.batch_encode_plus,
1406
                        words,
1407
                        boxes=boxes,
1408
                        padding=True,
1409
                        return_tensors="pt",
1410
                    )
1411
                    self.assertRaises(
1412
                        ValueError,
1413
                        tokenizer.batch_encode_plus,
1414
                        words,
1415
                        boxes=boxes,
1416
                        padding="longest",
1417
                        return_tensors="tf",
1418
                    )
1419
                else:
1420
                    pytorch_tensor = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True, return_tensors="pt")
1421
                    tensorflow_tensor = tokenizer.batch_encode_plus(
1422
                        words, boxes=boxes, padding="longest", return_tensors="tf"
1423
                    )
1424
                    encoded_sequences = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True)
1425

1426
                    for key in encoded_sequences.keys():
1427
                        pytorch_value = pytorch_tensor[key].tolist()
1428
                        tensorflow_value = tensorflow_tensor[key].numpy().tolist()
1429
                        encoded_value = encoded_sequences[key]
1430

1431
                        self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
1432

1433
    def test_sequence_ids(self):
1434
        tokenizers = self.get_tokenizers()
1435
        for tokenizer in tokenizers:
1436
            if not tokenizer.is_fast:
1437
                continue
1438
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1439
                seq_0 = "Test this method."
1440
                seq_1 = ["With", "these", "inputs."]
1441
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(seq_1))]
1442

1443
                # We want sequence 0 and sequence 1 to be tagged with token ids 0 and 1
                # respectively (regardless of whether the model uses token type ids).
                # We rely on this assumption in the QA pipeline, among other places.
                output = tokenizer(seq_0.split(), boxes=boxes)
1448
                self.assertIn(0, output.sequence_ids())
1449

1450
                output = tokenizer(seq_0, seq_1, boxes=boxes)
1451
                self.assertIn(0, output.sequence_ids())
1452
                self.assertIn(1, output.sequence_ids())
1453

1454
                if tokenizer.num_special_tokens_to_add(pair=True):
1455
                    self.assertIn(None, output.sequence_ids())
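                # sequence_ids() maps each token to the index of the sequence it came from
                # (0 for the question, 1 for the words here) and to None for special tokens.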
1456

1457
    def test_special_tokens_initialization(self):
1458
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1459
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1460
                added_tokens = [AddedToken("<special>", lstrip=True)]
1461

1462
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
1463
                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
1464
                )
1465
                words = "Hey this is a <special> token".split()
1466
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
1467
                r_output = tokenizer_r.encode(words, boxes=boxes)
1468

1469
                special_token_id = tokenizer_r.encode(
1470
                    ["<special>"], boxes=[1000, 1000, 1000, 1000], add_special_tokens=False
1471
                )[0]
1472

1473
                self.assertTrue(special_token_id in r_output)
1474

1475
                if self.test_slow_tokenizer:
1476
                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
1477
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
1478
                    )
1479
                    tokenizer_p = self.tokenizer_class.from_pretrained(
1480
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
1481
                    )
1482

1483
                    words = "Hey this is a <special> token".split()
1484
                    boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
1485

1486
                    p_output = tokenizer_p.encode(words, boxes=boxes)
1487
                    cr_output = tokenizer_cr.encode(words, boxes=boxes)
1488

1489
                    self.assertEqual(p_output, r_output)
1490
                    self.assertEqual(cr_output, r_output)
1491
                    self.assertTrue(special_token_id in p_output)
1492
                    self.assertTrue(special_token_id in cr_output)
1493

1494
    def test_training_new_tokenizer(self):
1495
        # This feature only exists for fast tokenizers
1496
        if not self.test_rust_tokenizer:
1497
            return
1498

1499
        tokenizer = self.get_rust_tokenizer()
1500
        new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)
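        # 100 is the target vocabulary size passed to train_new_from_iterator; the checks
        # below verify that special tokens and length limits carry over from the original
        # tokenizer.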
1501

1502
        # Test we can use the new tokenizer with something not seen during training
1503
        text = [["this", "is", "the"], ["how", "are", "you"]]
1504
        boxes = [[[1, 2, 3, 4], [5, 6, 7, 8], [1, 3, 4, 8]], [[5, 6, 7, 8], [4, 5, 6, 7], [3, 9, 2, 7]]]
1505
        inputs = new_tokenizer(text, boxes=boxes)
1506
        self.assertEqual(len(inputs["input_ids"]), 2)
1507
        decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
1508
        expected_result = "this is the"
1509

1510
        if tokenizer.backend_tokenizer.normalizer is not None:
1511
            expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
1512
        self.assertEqual(expected_result, decoded_input)
1513

1514
        # We check that the parameters of the tokenizer remained the same
1515
        # Check we have the same number of added_tokens for both pair and non-pair inputs.
1516
        self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False))
1517
        self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True))
1518

1519
        # Check we have the correct max_length for both pair and non-pair inputs.
1520
        self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence)
1521
        self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair)
1522

1523
        # Assert the set of special tokens match as we didn't ask to change them
1524
        self.assertSequenceEqual(
1525
            tokenizer.all_special_tokens_extended,
1526
            new_tokenizer.all_special_tokens_extended,
1527
        )
1528

1529
        self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
1530

1531
    def test_training_new_tokenizer_with_special_tokens_change(self):
1532
        # This feature only exists for fast tokenizers
1533
        if not self.test_rust_tokenizer:
1534
            return
1535

1536
        tokenizer = self.get_rust_tokenizer()
1537
        # Test with a special tokens map
1538
        class_signature = inspect.signature(tokenizer.__class__)
1539
        if "cls_token" in class_signature.parameters:
1540
            new_tokenizer = tokenizer.train_new_from_iterator(
1541
                SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: "<cls>"}
1542
            )
1543
            cls_id = new_tokenizer.get_vocab()["<cls>"]
1544
            self.assertEqual(new_tokenizer.cls_token, "<cls>")
1545
            self.assertEqual(new_tokenizer.cls_token_id, cls_id)
1546

1547
        # Create a new mapping from the special tokens defined in the original tokenizer
1548
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
1549
        special_tokens_list.remove("additional_special_tokens")
1550
        special_tokens_map = {}
1551
        for token in special_tokens_list:
1552
            # Get the private one to avoid unnecessary warnings.
1553
            if getattr(tokenizer, f"_{token}") is not None:
1554
                special_token = getattr(tokenizer, token)
1555
                special_tokens_map[special_token] = f"{special_token}a"
1556

1557
        # Train new tokenizer
1558
        new_tokenizer = tokenizer.train_new_from_iterator(
1559
            SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map
1560
        )
1561

1562
        # Check the changes
1563
        for token in special_tokens_list:
1564
            # Get the private one to avoid unnecessary warnings.
1565
            if getattr(tokenizer, f"_{token}") is None:
1566
                continue
1567
            special_token = getattr(tokenizer, token)
1568
            if special_token in special_tokens_map:
1569
                new_special_token = getattr(new_tokenizer, token)
1570
                self.assertEqual(special_tokens_map[special_token], new_special_token)
1571

1572
                new_id = new_tokenizer.get_vocab()[new_special_token]
1573
                self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id)
1574

1575
        # Check if the AddedToken / string format has been kept
1576
        for special_token in tokenizer.all_special_tokens_extended:
1577
            if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map:
1578
                # The special token must appear identically in the list of the new tokenizer.
1579
                self.assertTrue(
1580
                    special_token in new_tokenizer.all_special_tokens_extended,
1581
                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
1582
                )
1583
            elif isinstance(special_token, AddedToken):
1584
                # The special token must appear in the list of the new tokenizer as an object of type AddedToken with
1585
                # the same parameters as the old AddedToken except the content that the user has requested to change.
1586
                special_token_str = special_token.content
1587
                new_special_token_str = special_tokens_map[special_token_str]
1588

1589
                find = False
1590
                for candidate in new_tokenizer.all_special_tokens_extended:
1591
                    if (
1592
                        isinstance(candidate, AddedToken)
1593
                        and candidate.content == new_special_token_str
1594
                        and candidate.lstrip == special_token.lstrip
1595
                        and candidate.rstrip == special_token.rstrip
1596
                        and candidate.normalized == special_token.normalized
1597
                        and candidate.single_word == special_token.single_word
1598
                    ):
1599
                        find = True
1600
                        break
1601
                self.assertTrue(
1602
                    find,
1603
                    f"'{new_special_token_str}' doesn't appear in the list "
1604
                    f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
1605
                    f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
1606
                )
1607
            elif special_token not in special_tokens_map:
1608
                # The special token must appear identically in the list of the new tokenizer.
1609
                self.assertTrue(
1610
                    special_token in new_tokenizer.all_special_tokens_extended,
1611
                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
1612
                )
1613

1614
            else:
1615
                # The special token must appear in the list of the new tokenizer as an object of type string.
1616
                self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended)
1617

1618
        # Test we can use the new tokenizer with something not seen during training
1619
        words = [["this", "is"], ["hello", "🤗"]]
1620
        boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[1, 2, 3, 4], [5, 6, 7, 8]]]
1621
        inputs = new_tokenizer(words, boxes=boxes)
1622
        self.assertEqual(len(inputs["input_ids"]), 2)
1623
        decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
1624
        expected_result = "this is"
1625

1626
        if tokenizer.backend_tokenizer.normalizer is not None:
1627
            expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
1628
        self.assertEqual(expected_result, decoded_input)
1629

1630
    def test_prepare_for_model(self):
1631
        tokenizers = self.get_tokenizers(do_lower_case=False)
1632
        for tokenizer in tokenizers:
1633
            # only test prepare_for_model for the slow tokenizer
1634
            if tokenizer.__class__.__name__ == "LayoutXLMTokenizerFast":
1635
                continue
1636
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1637
                words, boxes = self.get_words_and_boxes()
1638
                prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True)
1639

1640
                input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
1641

1642
                self.assertEqual(input_dict, prepared_input_dict)
1643

1644
    def test_padding_different_model_input_name(self):
1645
        if not self.test_slow_tokenizer:
1646
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1647
            return
1648

1649
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1650
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1651
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1652
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1653
                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
1654
                pad_token_id = tokenizer_p.pad_token_id
1655

1656
                words, boxes = self.get_words_and_boxes_batch()
1657

1658
                input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes)
1659
                input_p = tokenizer_r.batch_encode_plus(words, boxes=boxes)
1660

1661
                # rename encoded batch to "inputs"
1662
                input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
1663
                del input_r[tokenizer_r.model_input_names[0]]
1664

1665
                input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
1666
                del input_p[tokenizer_p.model_input_names[0]]
1667

1668
                # Renaming `input_ids` to `inputs`
1669
                tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
1670
                tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]
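                # tokenizer.pad() looks up the main input under model_input_names[0], so
                # after the rename above it is expected to pad the "inputs" key exactly as
                # it would pad "input_ids".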
1671

1672
                input_r = tokenizer_r.pad(input_r, padding="longest")
1673
                input_p = tokenizer_r.pad(input_p, padding="longest")
1674

1675
                max_length = len(input_p["inputs"][0])
1676
                self.assert_batch_padded_input_match(
1677
                    input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
1678
                )
1679

1680
    def test_batch_encode_dynamic_overflowing(self):
1681
        """
1682
        When calling batch_encode with multiple sequences, it can return different number of
1683
        overflowing encoding for each sequence:
1684
        [
1685
          Sequence 1: [Encoding 1, Encoding 2],
1686
          Sequence 2: [Encoding 1],
1687
          Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
1688
        ]
1689
        This needs to be padded so that it can represented as a tensor
1690
        """
1691
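        # Rough sketch of the shapes checked below, assuming tensors are returned and
        # max_length=6: an input that overflows comes back as a batch of windows, e.g.
        #     tokens["input_ids"].shape == (num_windows, 6)
        #     tokens["bbox"].shape      == (num_windows, 6, 4)
        # which is why "bbox" is the only key expected to have three dimensions.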
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1692
            tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1693

1694
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"):
1695
                if is_torch_available():
1696
                    returned_tensor = "pt"
1697
                elif is_tf_available():
1698
                    returned_tensor = "tf"
1699
                else:
1700
                    returned_tensor = "jax"
1701

1702
                # Single example
1703
                words, boxes = self.get_words_and_boxes()
1704
                tokens = tokenizer.encode_plus(
1705
                    words,
1706
                    boxes=boxes,
1707
                    max_length=6,
1708
                    padding=True,
1709
                    truncation=True,
1710
                    return_tensors=returned_tensor,
1711
                    return_overflowing_tokens=True,
1712
                )
1713

1714
                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
1715
                    if key != "bbox":
1716
                        self.assertEqual(len(tokens[key].shape), 2)
1717
                    else:
1718
                        self.assertEqual(len(tokens[key].shape), 3)
1719

1720
                # Batch of examples
1721
                # For these 2 examples, 3 training examples will be created
1722
                words, boxes = self.get_words_and_boxes_batch()
1723
                tokens = tokenizer.batch_encode_plus(
1724
                    words,
1725
                    boxes=boxes,
1726
                    max_length=6,
1727
                    padding=True,
1728
                    truncation="only_first",
1729
                    return_tensors=returned_tensor,
1730
                    return_overflowing_tokens=True,
1731
                )
1732

1733
                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
1734
                    if key != "bbox":
1735
                        self.assertEqual(len(tokens[key].shape), 2)
1736
                        self.assertEqual(tokens[key].shape[-1], 6)
1737
                    else:
1738
                        self.assertEqual(len(tokens[key].shape), 3)
1739
                        self.assertEqual(tokens[key].shape[-1], 4)
1740

1741
    # overwrite from test_tokenization_common to speed up test
1742
    def test_save_pretrained(self):
1743
        if not self.test_slow_tokenizer:
1744
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1745
            return
1746

1747
        self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {})
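        # "hf-internal-testing/tiny-random-layoutxlm" is assumed here to be a small test
        # checkpoint, so the repeated save/load round-trips below stay fast.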
1748
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1749
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1750
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1751
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1752

1753
                tmpdirname2 = tempfile.mkdtemp()
1754

1755
                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
1756
                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
1757

1758
                # Checks it saves with the same files + the tokenizer.json file for the fast one
                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
1760
                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
1761
                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
1762

1763
                # Checks everything loads correctly in the same way
1764
                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
1765
                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
1766

1767
                # Check special tokens are set accordingly on Rust and Python
1768
                for key in tokenizer_pp.special_tokens_map:
1769
                    self.assertTrue(hasattr(tokenizer_rp, key))
1770
                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
1771
                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
1772

1773
                shutil.rmtree(tmpdirname2)
1774

1775
                # Save tokenizer rust, legacy_format=True
1776
                tmpdirname2 = tempfile.mkdtemp()
1777

1778
                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
1779
                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
1780

1781
                # Checks it saves with the same files
                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
1783

1784
                # Checks everything loads correctly in the same way
1785
                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
1786
                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
1787

1788
                # Check special tokens are set accordingly on Rust and Python
1789
                for key in tokenizer_pp.special_tokens_map:
1790
                    self.assertTrue(hasattr(tokenizer_rp, key))
1791

1792
                shutil.rmtree(tmpdirname2)
1793

1794
                # Save tokenizer rust, legacy_format=False
1795
                tmpdirname2 = tempfile.mkdtemp()
1796

1797
                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
1798
                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
1799

1800
                # Checks it saved the tokenizer.json file
1801
                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
1802

1803
                # Checks everything loads correctly in the same way
1804
                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
1805
                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
1806

1807
                # Check special tokens are set accordingly on Rust and Python
1808
                for key in tokenizer_pp.special_tokens_map:
1809
                    self.assertTrue(hasattr(tokenizer_rp, key))
1810

1811
                shutil.rmtree(tmpdirname2)
1812

1813
    @unittest.skip("TO DO: overwrite this very extensive test.")
1814
    def test_alignement_methods(self):
1815
        pass
1816

1817
    @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.")
1818
    def test_maximum_encoding_length_pair_input(self):
1819
        pass
1820

1821
    @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.")
1822
    def test_maximum_encoding_length_single_input(self):
1823
        pass
1824

1825
    @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.")
1826
    def test_pretokenized_inputs(self):
1827
        pass
1828

1829
    @unittest.skip("layoutxlm tokenizer always expects pretokenized inputs.")
1830
    def test_compare_pretokenized_inputs(self):
1831
        pass
1832

1833
    @unittest.skip("layoutxlm fast tokenizer does not support prepare_for_model")
1834
    def test_compare_prepare_for_model(self):
1835
        pass
1836

1837
    @slow
1838
    def test_only_label_first_subword(self):
1839
        words = ["hello", "niels"]
1840
        boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
1841
        word_labels = [0, 1]
1842

1843
        # test slow tokenizer
1844
        tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
1845
        encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
1846
        self.assertListEqual(encoding.labels, [-100, 0, -100, 1, -100, -100])
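        # In this example each word ends up split into two subword pieces, so the six-label
        # layout asserted above is (sketch):
        #     [<s> = -100, hello piece 1 = 0, hello piece 2 = -100,
        #      niels piece 1 = 1, niels piece 2 = -100, </s> = -100]
        # With only_label_first_subword=False (checked next), the word label is repeated on
        # every piece instead of only the first one.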
1847

1848
        tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", only_label_first_subword=False)
1849
        encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
1850
        self.assertListEqual(encoding.labels, [-100, 0, 0, 1, 1, -100])
1851

1852
        # test fast tokenizer
1853
        tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
1854
        encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
1855
        self.assertListEqual(encoding.labels, [-100, 0, -100, 1, -100, -100])
1856

1857
        tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base", only_label_first_subword=False)
        encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
1859
        self.assertListEqual(encoding.labels, [-100, 0, 0, 1, 1, -100])
1860

1861
    @slow
1862
    def test_layoutxlm_integration_test(self):
1863
        tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
1864
        tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
1865

1866
        # There are 3 cases:
1867
        # CASE 1: document image classification (training + inference), document image token classification (inference),
1868
        # in which case only words and normalized bounding boxes are provided to the tokenizer
1869
        # CASE 2: document image token classification (training),
1870
        # in which case one also provides word labels to the tokenizer
1871
        # CASE 3: document image visual question answering (inference),
1872
        # in which case one also provides a question to the tokenizer
1873

1874
        # We need to test all 3 cases both on batched and non-batched inputs.
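        # Sketch of the three call signatures exercised below (using the helper inputs
        # defined at the top of this test class):
        #     CASE 1: tokenizer(words, boxes=boxes)
        #     CASE 2: tokenizer(words, boxes=boxes, word_labels=word_labels)
        #     CASE 3: tokenizer(question, words, boxes)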
1875

1876
        # CASE 1: not batched
1877
        words, boxes = self.get_words_and_boxes()
1878

1879
        expected_results = {'input_ids': [0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip
1880

1881
        encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
1882
        encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
1883
        self.assertDictEqual(dict(encoding_p), expected_results)
1884
        self.assertDictEqual(dict(encoding_r), expected_results)
1885

1886
        # CASE 1: batched
1887
        words, boxes = self.get_words_and_boxes_batch()
1888

1889
        expected_results = {'input_ids': [[0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 33600, 31, 759, 9351, 83, 21895, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
1890

1891
        encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
1892
        encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
1893
        self.assertDictEqual(dict(encoding_p), expected_results)
1894
        self.assertDictEqual(dict(encoding_r), expected_results)
1895

1896
        # CASE 2: not batched
1897
        words, boxes = self.get_words_and_boxes()
1898
        word_labels = [1, 2, 3]
1899

1900
        expected_results = {'input_ids': [0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip
1901

1902
        encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
1903
        encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
1904
        self.assertDictEqual(dict(encoding_p), expected_results)
1905
        self.assertDictEqual(dict(encoding_r), expected_results)
1906

1907
        # CASE 2: batched
1908
        words, boxes = self.get_words_and_boxes_batch()
1909
        word_labels = [[1, 2, 3], [2, 46, 17, 22, 3]]
1910

1911
        expected_results = {'input_ids': [[0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 33600, 31, 759, 9351, 83, 21895, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, -100, 46, 17, 22, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
1912

1913
        encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
1914
        encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
1915
        self.assertDictEqual(dict(encoding_p), expected_results)
1916
        self.assertDictEqual(dict(encoding_r), expected_results)
1917

1918
        # CASE 3: not batched
1919
        question, words, boxes = self.get_question_words_and_boxes()
1920

1921
        expected_results = {'input_ids': [0, 2367, 25, 7, 1919, 9351, 32, 2, 2, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}  # fmt: skip
1922

1923
        encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20)
1924
        encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20)
1925
        self.assertDictEqual(dict(encoding_p), expected_results)
1926
        self.assertDictEqual(dict(encoding_r), expected_results)
1927

1928
        # CASE 3: batched
1929
        questions, words, boxes = self.get_question_words_and_boxes_batch()
1930

1931
        expected_results = {'input_ids': [[0, 2367, 25, 7, 1919, 9351, 32, 2, 2, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1], [0, 3642, 83, 764, 35839, 32, 2, 2, 2367, 10, 21, 3190, 53496, 19, 2, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [336, 42, 353, 57], [34, 42, 66, 69], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]}  # fmt: skip
1932
        encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20)
1933
        encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20)
1934
        self.assertDictEqual(dict(encoding_p), expected_results)
1935
        self.assertDictEqual(dict(encoding_r), expected_results)
1936

1937
    @unittest.skip("Doesn't support another framework than PyTorch")
1938
    def test_np_encode_plus_sent_to_model(self):
1939
        pass
1940

1941
    @unittest.skip("Doesn't use SentencePiece")
1942
    def test_sentencepiece_tokenize_and_convert_tokens_to_string(self):
1943
        pass
1944

1945
    @unittest.skip("Doesn't use SentencePiece")
1946
    def test_sentencepiece_tokenize_and_decode(self):
1947
        pass
1948

1949
    @unittest.skip("Chat is not supported")
1950
    def test_chat_template(self):
1951
        pass
1952
