# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import json
import os
import re
import shutil
import tempfile
import unittest
from typing import List

from transformers import (
    AddedToken,
    LayoutLMv3TokenizerFast,
    SpecialTokensMixin,
    is_tf_available,
    is_torch_available,
    logging,
)
from transformers.models.layoutlmv3.tokenization_layoutlmv3 import VOCAB_FILES_NAMES, LayoutLMv3Tokenizer
from transformers.testing_utils import (
    is_pt_tf_cross_test,
    require_pandas,
    require_tf,
    require_tokenizers,
    require_torch,
    slow,
)

from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin, merge_model_tokenizer_mappings


logger = logging.get_logger(__name__)


@require_tokenizers
@require_pandas
class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = LayoutLMv3Tokenizer
    rust_tokenizer_class = LayoutLMv3TokenizerFast
    test_rust_tokenizer = True
    # determined by the tokenization algorithm and the way it's decoded by the fast tokenizers
    space_between_special_tokens = False
    test_seq2seq = False
    from_pretrained_kwargs = {"cls_token": "<s>"}

    def get_words_and_boxes(self):
        words = ["lower", "newer"]
        boxes = [[423, 237, 440, 251], [427, 272, 441, 287]]

        return words, boxes

    def get_words_and_boxes_batch(self):
        words = [["lower", "newer"], ["new", "low"]]
        boxes = [
            [[423, 237, 440, 251], [427, 272, 441, 287]],
            [[961, 885, 992, 912], [256, 38, 330, 58]],
        ]

        return words, boxes

    def get_question_words_and_boxes(self):
        question = "what's his name?"
        words = ["lower", "newer"]
        boxes = [[423, 237, 440, 251], [427, 272, 441, 287]]

        return question, words, boxes

    def get_question_words_and_boxes_batch(self):
        questions = ["what's his name?", "how is he called?"]
        words = [["lower", "newer"], ["newer", "lower"]]
        boxes = [
            [[423, 237, 440, 251], [427, 272, 441, 287]],
            [[256, 38, 330, 58], [256, 38, 330, 58]],
        ]

        return questions, words, boxes
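
    # Note on the fixtures above: LayoutLMv3 works on pre-tokenized words with one word-level
    # bounding box per word in [x0, y0, x1, y1] format, normalized to a 0-1000 scale. Question
    # strings carry no layout information of their own, which is why tests that encode a question
    # pair it with `tokenizer.pad_token_box` below.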

    def setUp(self):
        super().setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "\u0120",
            "\u0120l",
            "\u0120n",
            "\u0120lo",
            "\u0120low",
            "er",
            "\u0120lowest",
            "\u0120newer",
            "\u0120wider",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))
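
    # The fixture above writes a tiny byte-level BPE vocab and merge table to disk. "\u0120" (shown
    # as "Ġ") marks a leading space, and the merges only cover the word "low"/"lower": with a prefix
    # space added (the default behaviour exercised here), " lower" segments into ["Ġlow", "er"] while
    # " newer" falls back to ["Ġ", "n", "e", "w", "er"], which is what test_full_tokenizer expects.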

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return LayoutLMv3TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["Ġlow", "er", "Ġ", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text)  # , add_prefix_space=True)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
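
    # For reference, the expected ids above are just the positions of each token in the toy vocab
    # written by setUp: "Ġlow" -> 14, "er" -> 15, "Ġ" -> 10, "n" -> 9, "e" -> 3, "w" -> 2, and the
    # appended unknown token maps to "<unk>" -> 19.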

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutlmv3-base")

        question, words, boxes = self.get_question_words_and_boxes()

        text = tokenizer.encode(
            question.split(),
            boxes=[tokenizer.pad_token_box for _ in range(len(question.split()))],
            add_special_tokens=False,
        )
        text_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)

        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_pair == [0] + text + [2] + [2] + text_2 + [2]
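
    # The expected ids follow the RoBERTa-style pair template LayoutLMv3 uses,
    # `<s> question </s></s> words </s>`, where 0 and 2 are the ids of `<s>` and `</s>`
    # in the pretrained "microsoft/layoutlmv3-base" vocabulary.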

    def test_add_special_tokens(self):
        tokenizers: List[LayoutLMv3Tokenizer] = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                special_token = "[SPECIAL_TOKEN]"
                special_token_box = [1000, 1000, 1000, 1000]

                tokenizer.add_special_tokens({"cls_token": special_token})
                encoded_special_token = tokenizer.encode(
                    [special_token], boxes=[special_token_box], add_special_tokens=False
                )
                self.assertEqual(len(encoded_special_token), 1)

                decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
                self.assertTrue(special_token not in decoded)

    def test_add_tokens_tokenizer(self):
        tokenizers: List[LayoutLMv3Tokenizer] = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                # We usually have added tokens from the start in tests because our vocab fixtures are
                # smaller than the original vocabs - let's not assert this
                # self.assertEqual(vocab_size, all_size)

                new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                words = "aaaaa bbbbbb low cccccccccdddddddd l".split()
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]

                tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)

                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                vocab_size_3 = tokenizer.vocab_size
                all_size_3 = len(tokenizer)

                self.assertNotEqual(vocab_size_3, 0)
                self.assertEqual(vocab_size, vocab_size_3)
                self.assertEqual(added_toks_2, len(new_toks_2))
                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

                words = ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l".split()
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]

                tokens = tokenizer.encode(
                    words,
                    boxes=boxes,
                    add_special_tokens=False,
                )

                self.assertGreaterEqual(len(tokens), 6)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[0], tokens[1])
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokens[-3])
                self.assertEqual(tokens[0], tokenizer.eos_token_id)
                self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    @require_tokenizers
    def test_encode_decode_with_spaces(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
                tokenizer.add_tokens(new_toks)
                input = "[ABC][DEF][ABC][DEF]"
                if self.space_between_special_tokens:
                    output = "[ABC] [DEF] [ABC] [DEF]"
                else:
                    output = input
                encoded = tokenizer.encode(input.split(), boxes=boxes, add_special_tokens=False)
                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
                self.assertIn(decoded, [output, output.lower()])

    @unittest.skip("Not implemented")
    def test_right_and_left_truncation(self):
        pass

    @unittest.skip("Not implemented")
    def test_split_special_tokens(self):
        pass

    def test_encode_plus_with_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, words)

                padding_size = 10
                padding_idx = tokenizer.pad_token_id

                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_special_tokens_mask=True)
                input_ids = encoded_sequence["input_ids"]
                special_tokens_mask = encoded_sequence["special_tokens_mask"]
                sequence_length = len(input_ids)

                # Test 'longest' and 'no_padding' don't do anything
                tokenizer.padding_side = "right"

                not_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertTrue(sequence_length == not_padded_sequence_length)
                self.assertTrue(input_ids == not_padded_input_ids)
                self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)

                not_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertTrue(sequence_length == not_padded_sequence_length)
                self.assertTrue(input_ids == not_padded_input_ids)
                self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)

                # Test right padding
                tokenizer.padding_side = "right"

                right_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                right_padded_input_ids = right_padded_sequence["input_ids"]

                right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
                right_padded_sequence_length = len(right_padded_input_ids)

                self.assertTrue(sequence_length + padding_size == right_padded_sequence_length)
                self.assertTrue(input_ids + [padding_idx] * padding_size == right_padded_input_ids)
                self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)

                # Test left padding
                tokenizer.padding_side = "left"
                left_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                left_padded_input_ids = left_padded_sequence["input_ids"]
                left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                left_padded_sequence_length = len(left_padded_input_ids)

                self.assertTrue(sequence_length + padding_size == left_padded_sequence_length)
                self.assertTrue([padding_idx] * padding_size + input_ids == left_padded_input_ids)
                self.assertTrue([1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask)

                if "token_type_ids" in tokenizer.model_input_names:
                    token_type_ids = encoded_sequence["token_type_ids"]
                    left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                    right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

                    assert token_type_ids + [0] * padding_size == right_padded_token_type_ids
                    assert [0] * padding_size + token_type_ids == left_padded_token_type_ids

                if "attention_mask" in tokenizer.model_input_names:
                    attention_mask = encoded_sequence["attention_mask"]
                    right_padded_attention_mask = right_padded_sequence["attention_mask"]
                    left_padded_attention_mask = left_padded_sequence["attention_mask"]

                    self.assertTrue(attention_mask + [0] * padding_size == right_padded_attention_mask)
                    self.assertTrue([0] * padding_size + attention_mask == left_padded_attention_mask)

    def test_internal_consistency(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                tokens = []
                for word in words:
                    tokens.extend(tokenizer.tokenize(word))
                ids = tokenizer.convert_tokens_to_ids(tokens)
                ids_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                self.assertListEqual(ids, ids_2)

                tokens_2 = tokenizer.convert_ids_to_tokens(ids)
                self.assertNotEqual(len(tokens_2), 0)
                text_2 = tokenizer.decode(ids)
                self.assertIsInstance(text_2, str)

                output_text = " lower newer"
                self.assertEqual(text_2, output_text)

    def test_mask_output(self):
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                if (
                    tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
                    and "token_type_ids" in tokenizer.model_input_names
                ):
                    information = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
                    sequences, mask = information["input_ids"], information["token_type_ids"]
                    self.assertEqual(len(sequences), len(mask))

    def test_number_of_added_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # test 1: single sequence
                words, boxes = self.get_words_and_boxes()

                sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                attached_sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)

                # Method is implemented (e.g. not GPT-2)
                if len(attached_sequences) != 2:
                    self.assertEqual(
                        tokenizer.num_special_tokens_to_add(pair=False), len(attached_sequences) - len(sequences)
                    )

                # test 2: two sequences
                question, words, boxes = self.get_question_words_and_boxes()

                sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=False)
                attached_sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=True)

                # Method is implemented (e.g. not GPT-2)
                if len(attached_sequences) != 2:
                    self.assertEqual(
                        tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                    )

    def test_padding_to_max_length(self):
        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` is deprecated"""
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, words)

                padding_idx = tokenizer.pad_token_id

                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)
                # FIXME: the next line should be padding(max_length) to avoid warning
                padded_sequence = tokenizer.encode(
                    words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

                # Check that nothing is done when a maximum length is not specified
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

    def test_padding(self, max_length=50):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                pad_token_id = tokenizer_p.pad_token_id

                # Encode - Simple input
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.encode(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode(words, boxes=boxes, padding=True)
                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                # Encode - Pair input
                question, words, boxes = self.get_question_words_and_boxes()
                input_r = tokenizer_r.encode(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(question, words, boxes=boxes, padding=True)
                input_p = tokenizer_p.encode(question, words, boxes=boxes, padding="longest")
                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                # Encode_plus - Simple input
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, padding=True)
                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )

                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Encode_plus - Pair input
                question, words, boxes = self.get_question_words_and_boxes()
                input_r = tokenizer_r.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(question, words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode_plus(question, words, boxes=boxes, padding=True)
                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Batch_encode_plus - Simple input
                words, boxes = self.get_words_and_boxes_batch()

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="max_length",
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="longest",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding=True,
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes, padding=True)
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Batch_encode_plus - Pair input
                questions, words, boxes = self.get_question_words_and_boxes_batch()

                input_r = tokenizer_r.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    padding=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    padding="longest",
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Using pad on single examples after tokenization
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.encode_plus(words, boxes=boxes)
                input_p = tokenizer_r.pad(input_p)

                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )

                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.encode_plus(words, boxes=boxes)
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)

                # Using pad after tokenization
                words, boxes = self.get_words_and_boxes_batch()
                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_p = tokenizer_r.pad(input_p)

                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Using pad after tokenization
                words, boxes = self.get_words_and_boxes_batch()
                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

    def test_padding_warning_message_fast_tokenizer(self):
        if not self.test_rust_tokenizer:
            return

        words, boxes = self.get_words_and_boxes_batch()

        tokenizer_fast = self.get_rust_tokenizer()

        encoding_fast = tokenizer_fast(
            words,
            boxes=boxes,
        )

        with self.assertLogs("transformers", level="WARNING") as cm:
            tokenizer_fast.pad(encoding_fast)
        self.assertEqual(len(cm.records), 1)
        self.assertIn(
            "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
            " encode the text followed by a call to the `pad` method to get a padded encoding.",
            cm.records[0].message,
        )

        if not self.test_slow_tokenizer:
            return

        tokenizer_slow = self.get_tokenizer()

        encoding_slow = tokenizer_slow(
            words,
            boxes=boxes,
        )

        with self.assertLogs(level="WARNING") as cm:
            # We want to assert there are no warnings, but the 'assertLogs' method does not support that.
            # Therefore, we are adding a dummy warning, and then we will assert it is the only warning.
            logger.warning("Dummy warning")
            tokenizer_slow.pad(encoding_slow)
        self.assertEqual(len(cm.records), 1)
        self.assertIn(
            "Dummy warning",
            cm.records[0].message,
        )
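
    # The warning above documents the recommended usage: with a fast tokenizer it is faster to pass
    # padding arguments directly to `__call__` than to encode first and call `pad` afterwards,
    # which is why only the fast path is expected to warn here.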

    def test_call(self):
        # Tests that all calls wrap to encode_plus and batch_encode_plus
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Test not batched
                words, boxes = self.get_words_and_boxes()
                encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test not batched pairs
                question, words, boxes = self.get_question_words_and_boxes()
                encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test batched
                words, boxes = self.get_words_and_boxes_batch()
                encoded_sequences_1 = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

    def test_batch_encode_plus_batch_sequence_length(self):
        # Tests that all encoded values have the correct size
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes_batch()

                encoded_sequences = [
                    tokenizer.encode_plus(words_example, boxes=boxes_example)
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes, padding=False)
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

                maximum_length = len(
                    max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
                )

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences_padded = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]

                encoded_sequences_batch_padded = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=True
                )
                self.assertListEqual(
                    encoded_sequences_padded,
                    self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
                )

                # check 'longest' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=True
                )
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest"
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

                # check 'no_padding' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=False
                )
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

    @unittest.skip("batch_encode_plus does not handle overflowing tokens.")
    def test_batch_encode_plus_overflowing_tokens(self):
        pass

    def test_batch_encode_plus_padding(self):
        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus

        # Right padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes_batch()

                max_length = 100

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

        # Left padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                tokenizer.padding_side = "left"
                words, boxes = self.get_words_and_boxes_batch()

                max_length = 100

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.pad_token is None:
                    self.skipTest("No padding token.")
                else:
                    words, boxes = self.get_words_and_boxes()

                    # empty_tokens = tokenizer([""], [[]], padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer(words, boxes=boxes, padding=True, pad_to_multiple_of=8)
                    # for key, value in empty_tokens.items():
                    #     self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    normal_tokens = tokenizer(words, boxes=boxes, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # Should also work with truncation
                    normal_tokens = tokenizer(words, boxes=boxes, padding=True, truncation=True, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # truncation to something which is not a multiple of pad_to_multiple_of raises an error
                    self.assertRaises(
                        ValueError,
                        tokenizer.__call__,
                        words,
                        boxes=boxes,
                        padding=True,
                        truncation=True,
                        max_length=12,
                        pad_to_multiple_of=8,
                    )

    def test_tokenizer_slow_store_full_signature(self):
        signature = inspect.signature(self.tokenizer_class.__init__)
        tokenizer = self.get_tokenizer()

        for parameter_name, parameter in signature.parameters.items():
            if parameter.default != inspect.Parameter.empty:
                self.assertIn(parameter_name, tokenizer.init_kwargs)

    def test_build_inputs_with_special_tokens(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # Input tokens id
                words, boxes = self.get_words_and_boxes()
                input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
                input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)

                # Generate output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
                self.assertEqual(output_p, output_r)

                # Generate pair output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                self.assertEqual(output_p, output_r)

    def test_special_tokens_mask_input_pairs(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    add_special_tokens=True,
                    return_special_tokens_mask=True,
                    # add_prefix_space=False,
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [
                    (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
                ]
                filtered_sequence = [x for x in filtered_sequence if x is not None]
                self.assertEqual(encoded_sequence, filtered_sequence)

    def test_special_tokens_mask(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                # Testing single inputs
                encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
                self.assertEqual(encoded_sequence, filtered_sequence)

    def test_save_and_load_tokenizer(self):
        # safety check on max_len default value so we are sure the test works
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                self.assertNotEqual(tokenizer.model_max_length, 42)

        # Now let's start the test
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Isolate this from the other tests because we save additional tokens/etc
                words, boxes = self.get_words_and_boxes()
                tmpdirname = tempfile.mkdtemp()

                before_tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                before_vocab = tokenizer.get_vocab()
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()
                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)

                shutil.rmtree(tmpdirname)

    def test_right_and_left_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                sequence = "Sequence"
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_idx = tokenizer.pad_token_id

                # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

                # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "left"
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

                # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(words, boxes=boxes, padding=True)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding="longest")
                padded_sequence_left_length = len(padded_sequence_left)
                assert sequence_length == padded_sequence_left_length
                assert encoded_sequence == padded_sequence_left

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(words, boxes=boxes)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding=False)
                padded_sequence_left_length = len(padded_sequence_left)
                assert sequence_length == padded_sequence_left_length
                assert encoded_sequence == padded_sequence_left

    def test_token_type_ids(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # test 1: single sequence
                words, boxes = self.get_words_and_boxes()

                output = tokenizer(words, boxes=boxes, return_token_type_ids=True)

                # Assert that the token type IDs have the same length as the input IDs
                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))

                # Assert that the token type IDs have the same length as the attention mask
                self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"]))

                self.assertIn(0, output["token_type_ids"])
                self.assertNotIn(1, output["token_type_ids"])

                # test 2: two sequences (question + words)
                question, words, boxes = self.get_question_words_and_boxes()

                output = tokenizer(question, words, boxes, return_token_type_ids=True)

                # Assert that the token type IDs have the same length as the input IDs
                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))

                # Assert that the token type IDs have the same length as the attention mask
                self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"]))

                self.assertIn(0, output["token_type_ids"])
1106

1107
    def test_offsets_mapping(self):
1108
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1109
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1110
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1111

1112
                text = ["a", "wonderful", "test"]
1113
                boxes = [[1, 8, 12, 20] for _ in range(len(text))]
1114

1115
                # No pair
1116
                tokens_with_offsets = tokenizer_r.encode_plus(
1117
                    text,
1118
                    boxes=boxes,
1119
                    return_special_tokens_mask=True,
1120
                    return_offsets_mapping=True,
1121
                    add_special_tokens=True,
1122
                )
1123
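                # num_special_tokens_to_add(False) gives the number of special tokens added around
                # a single sequence, which should equal the number of 1s in the special tokens mask.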
                added_tokens = tokenizer_r.num_special_tokens_to_add(False)
1124
                offsets = tokens_with_offsets["offset_mapping"]
1125

1126
                # Assert there is the same number of tokens and offsets
1127
                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
1128

1129
                # Assert that there are exactly `added_tokens` special tokens
1130
                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
1131

1132
                # Pairs
1133
                text = "what's his name"
1134
                pair = ["a", "wonderful", "test"]
1135
                boxes = [[1, 8, 12, 20] for _ in range(len(pair))]
1136
                tokens_with_offsets = tokenizer_r.encode_plus(
1137
                    text,
1138
                    pair,
1139
                    boxes=boxes,
1140
                    return_special_tokens_mask=True,
1141
                    return_offsets_mapping=True,
1142
                    add_special_tokens=True,
1143
                )
1144
                added_tokens = tokenizer_r.num_special_tokens_to_add(True)
1145
                offsets = tokens_with_offsets["offset_mapping"]
1146

1147
                # Assert there is the same number of tokens and offsets
1148
                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
1149

1150
                # Assert that there are exactly `added_tokens` special tokens
1151
                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
1152

1153
    @require_torch
1154
    @slow
1155
    def test_torch_encode_plus_sent_to_model(self):
1156
        import torch
1157

1158
        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
1159

1160
        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
1161

1162
        tokenizers = self.get_tokenizers(do_lower_case=False)
1163
        for tokenizer in tokenizers:
1164
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1165
                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
1166
                    return
1167

1168
                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
1169
                config = config_class()
1170

1171
                if config.is_encoder_decoder or config.pad_token_id is None:
1172
                    return
1173

1174
                model = model_class(config)
1175

1176
                # Make sure the model contains at least the full vocabulary size in its embedding matrix
1177
                is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
1178
                assert (
1179
                    (model.get_input_embeddings().weight.shape[0] >= len(tokenizer))
1180
                    if is_using_common_embeddings
1181
                    else True
1182
                )
1183

1184
                # Build sequence
1185
                words, boxes = self.get_words_and_boxes()
1186
                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt")
1187
                batch_encoded_sequence = tokenizer.batch_encode_plus(
1188
                    [words, words], boxes=[boxes, boxes], return_tensors="pt"
1189
                )
1190

1191
                # We add dummy pixel_values keys (as LayoutLMv3 actually also requires a feature extractor
1192
                # to prepare the image input)
1193
                encoded_sequence["pixel_values"] = torch.randn(1, 3, 224, 224)
1194
                batch_encoded_sequence["pixel_values"] = torch.randn(2, 3, 224, 224)
1195

1196
                # This should not fail
1197
                with torch.no_grad():  # saves some time
1198
                    model(**encoded_sequence)
1199
                    model(**batch_encoded_sequence)
1200

1201
    def test_rust_and_python_full_tokenizers(self):
1202
        if not self.test_rust_tokenizer:
1203
            return
1204

1205
        if not self.test_slow_tokenizer:
1206
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1207
            return
1208

1209
        tokenizer = self.get_tokenizer()
1210
        rust_tokenizer = self.get_rust_tokenizer()
1211

1212
        words, boxes = self.get_words_and_boxes()
1213

1214
        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1215
        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1216
        self.assertListEqual(ids, rust_ids)
1217

1218
        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
1219
        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
1220
        self.assertListEqual(ids, rust_ids)
1221

1222
    def test_tokenization_python_rust_equals(self):
1223
        if not self.test_slow_tokenizer:
1224
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1225
            return
1226

1227
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1228
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1229
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1230
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1231

1232
                words, boxes = self.get_words_and_boxes()
1233

1234
                # Ensure basic inputs match
1235
                input_p = tokenizer_p.encode_plus(words, boxes=boxes)
1236
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
1237

1238
                for key in filter(
1239
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
1240
                ):
1241
                    self.assertSequenceEqual(input_p[key], input_r[key])
1242

1243
                input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes)
1244
                input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes)
1245

1246
                for key in filter(
1247
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
1248
                ):
1249
                    self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
1250

1251
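                # Build an input that is longer than the model max length to exercise truncation below.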
                words = ["hello" for _ in range(1000)]
1252
                boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)]
1253

1254
                # Ensure truncated outputs match
1255
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
1256
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
1257

1258
                for key in filter(
1259
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
1260
                ):
1261
                    self.assertSequenceEqual(input_p[key], input_r[key])
1262

1263
                # Ensure truncation with stride matches
1264
                input_p = tokenizer_p.encode_plus(
1265
                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
1266
                )
1267
                input_r = tokenizer_r.encode_plus(
1268
                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
1269
                )
1270

1271
                for key in filter(
1272
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
1273
                ):
1274
                    self.assertSequenceEqual(input_p[key], input_r[key][0])
1275

1276
    def test_embeded_special_tokens(self):
1277
        if not self.test_slow_tokenizer:
1278
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1279
            return
1280

1281
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1282
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1283
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1284
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1285
                words, boxes = self.get_words_and_boxes()
1286
                tokens_r = tokenizer_r.encode_plus(
1287
                    words,
1288
                    boxes=boxes,
1289
                    add_special_tokens=True,
1290
                )
1291
                tokens_p = tokenizer_p.encode_plus(
1292
                    words,
1293
                    boxes=boxes,
1294
                    add_special_tokens=True,
1295
                )
1296

1297
                for key in tokens_p.keys():
1298
                    self.assertEqual(tokens_r[key], tokens_p[key])
1299

1300
                if "token_type_ids" in tokens_r:
1301
                    self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
1302

1303
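                # The token strings produced from the ids should also be identical for both backends.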
                tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
1304
                tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
1305
                self.assertSequenceEqual(tokens_r, tokens_p)
1306

1307
    def test_compare_add_special_tokens(self):
1308
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1309
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1310
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1311

1312
                simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
1313

1314
                words, boxes = self.get_words_and_boxes()
1315
                # tokenize()
1316
                no_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=False)
1317
                with_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=True)
1318
                self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)
1319

1320
                # encode()
1321
                no_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False)
1322
                with_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=True)
1323
                self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)
1324

1325
                # encode_plus()
1326
                no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False)
1327
                with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True)
1328
                for key in no_special_tokens.keys():
1329
                    self.assertEqual(
1330
                        len(no_special_tokens[key]),
1331
                        len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
1332
                    )
1333

1334
                # batch_encode_plus()
1335
                words, boxes = self.get_words_and_boxes_batch()
1336

1337
                no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False)
1338
                with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True)
1339
                for key in no_special_tokens.keys():
1340
                    for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
1341
                        self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
1342

1343
    @slow
1344
    def test_layoutlmv3_truncation_integration_test(self):
1345
        words, boxes = self.get_words_and_boxes()
1346

1347
        tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", model_max_length=512)
1348

1349
        for i in range(12, 512):
1350
            new_encoded_inputs = tokenizer.encode(words, boxes=boxes, max_length=i, truncation=True)
1351

1352
            # Ensure that the number of input IDs does not exceed the specified max length.
1353
            self.assertLessEqual(len(new_encoded_inputs), i)
1354

1355
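        # With truncation=True and no explicit max_length, the tokenizer should fall back to model_max_length.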
        tokenizer.model_max_length = 20
1356
        new_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True)
1357
        dropped_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True)
1358

1359
        # Ensure that the input IDs are still truncated when no max_length is specified
1360
        self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs)
1361
        self.assertLessEqual(len(new_encoded_inputs), 20)
1362

1363
    @is_pt_tf_cross_test
1364
    def test_batch_encode_plus_tensors(self):
1365
        tokenizers = self.get_tokenizers(do_lower_case=False)
1366
        for tokenizer in tokenizers:
1367
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1368
                words, boxes = self.get_words_and_boxes_batch()
1369

1370
                # A tensor cannot be built from sequences which are not all the same size
1371
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="pt")
1372
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="tf")
1373

1374
                if tokenizer.pad_token_id is None:
1375
                    self.assertRaises(
1376
                        ValueError,
1377
                        tokenizer.batch_encode_plus,
1378
                        words,
1379
                        boxes=boxes,
1380
                        padding=True,
1381
                        return_tensors="pt",
1382
                    )
1383
                    self.assertRaises(
1384
                        ValueError,
1385
                        tokenizer.batch_encode_plus,
1386
                        words,
1387
                        boxes=boxes,
1388
                        padding="longest",
1389
                        return_tensors="tf",
1390
                    )
1391
                else:
1392
                    pytorch_tensor = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True, return_tensors="pt")
1393
                    tensorflow_tensor = tokenizer.batch_encode_plus(
1394
                        words, boxes=boxes, padding="longest", return_tensors="tf"
1395
                    )
1396
                    encoded_sequences = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True)
1397

1398
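                    # Whatever the framework, the padded values should be identical to the plain Python output.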
                    for key in encoded_sequences.keys():
1399
                        pytorch_value = pytorch_tensor[key].tolist()
1400
                        tensorflow_value = tensorflow_tensor[key].numpy().tolist()
1401
                        encoded_value = encoded_sequences[key]
1402

1403
                        self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
1404

1405
    def test_sequence_ids(self):
1406
        tokenizers = self.get_tokenizers()
1407
        for tokenizer in tokenizers:
1408
            if not tokenizer.is_fast:
1409
                continue
1410
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1411
                seq_0 = "Test this method."
1412
                seq_1 = ["With", "these", "inputs."]
1413
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(seq_1))]
1414

1415
                # We want sequence 0 and sequence 1 to be tagged
1416
                # with sequence ids 0 and 1 respectively
1417
                # (regardless of whether the model uses token type ids)
1418
                # We use this assumption in the QA pipeline among other places
1419
                output = tokenizer(seq_0.split(), boxes=boxes)
1420
                self.assertIn(0, output.sequence_ids())
1421

1422
                output = tokenizer(seq_0, seq_1, boxes=boxes)
1423
                self.assertIn(0, output.sequence_ids())
1424
                self.assertIn(1, output.sequence_ids())
1425

1426
                if tokenizer.num_special_tokens_to_add(pair=True):
1427
                    self.assertIn(None, output.sequence_ids())
1428

1429
    def test_special_tokens_initialization(self):
1430
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1431
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1432
                added_tokens = [AddedToken("<special>", lstrip=True)]
1433

1434
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
1435
                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
1436
                )
1437
                words = "Hey this is a <special> token".split()
1438
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
1439
                r_output = tokenizer_r.encode(words, boxes=boxes)
1440

1441
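                # Encode the special token on its own to recover its id.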
                special_token_id = tokenizer_r.encode(
1442
                    ["<special>"], boxes=[1000, 1000, 1000, 1000], add_special_tokens=False
1443
                )[0]
1444

1445
                self.assertTrue(special_token_id in r_output)
1446

1447
                if self.test_slow_tokenizer:
1448
                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
1449
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
1450
                    )
1451
                    tokenizer_p = self.tokenizer_class.from_pretrained(
1452
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
1453
                    )
1454

1455
                    words = "Hey this is a <special> token".split()
1456
                    boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
1457

1458
                    p_output = tokenizer_p.encode(words, boxes=boxes)
1459
                    cr_output = tokenizer_cr.encode(words, boxes=boxes)
1460

1461
                    self.assertEqual(p_output, r_output)
1462
                    self.assertEqual(cr_output, r_output)
1463
                    self.assertTrue(special_token_id in p_output)
1464
                    self.assertTrue(special_token_id in cr_output)
1465

1466
    def test_training_new_tokenizer(self):
1467
        # This feature only exists for fast tokenizers
1468
        if not self.test_rust_tokenizer:
1469
            return
1470

1471
        tokenizer = self.get_rust_tokenizer()
1472
        new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)
1473

1474
        # Test we can use the new tokenizer with something not seen during training
1475
        text = [["this", "is", "the"], ["how", "are", "you"]]
1476
        boxes = [[[1, 2, 3, 4], [5, 6, 7, 8], [1, 3, 4, 8]], [[5, 6, 7, 8], [4, 5, 6, 7], [3, 9, 2, 7]]]
1477
        inputs = new_tokenizer(text, boxes=boxes)
1478
        self.assertEqual(len(inputs["input_ids"]), 2)
1479
        decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
1480
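        # The byte-level BPE used here typically marks word starts with "Ġ", which decodes to a leading space.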
        expected_result = " this is the"
1481

1482
        if tokenizer.backend_tokenizer.normalizer is not None:
1483
            expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
1484
        self.assertEqual(expected_result, decoded_input)
1485

1486
        # We check that the parameters of the tokenizer remained the same
1487
        # Check we have the same number of added_tokens for both pair and non-pair inputs.
1488
        self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False))
1489
        self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True))
1490

1491
        # Check we have the correct max_length for both pair and non-pair inputs.
1492
        self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence)
1493
        self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair)
1494

1495
        # Assert the set of special tokens match as we didn't ask to change them
1496
        self.assertSequenceEqual(
1497
            tokenizer.all_special_tokens_extended,
1498
            new_tokenizer.all_special_tokens_extended,
1499
        )
1500

1501
        self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
1502

1503
    def test_training_new_tokenizer_with_special_tokens_change(self):
1504
        # This feature only exists for fast tokenizers
1505
        if not self.test_rust_tokenizer:
1506
            return
1507

1508
        tokenizer = self.get_rust_tokenizer()
1509
        # Test with a special tokens map
1510
        class_signature = inspect.signature(tokenizer.__class__)
1511
        if "cls_token" in class_signature.parameters:
1512
            new_tokenizer = tokenizer.train_new_from_iterator(
1513
                SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: "<cls>"}
1514
            )
1515
            cls_id = new_tokenizer.get_vocab()["<cls>"]
1516
            self.assertEqual(new_tokenizer.cls_token, "<cls>")
1517
            self.assertEqual(new_tokenizer.cls_token_id, cls_id)
1518

1519
        # Create a new mapping from the special tokens defined in the original tokenizer
1520
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
1521
        special_tokens_list.remove("additional_special_tokens")
1522
        special_tokens_map = {}
1523
        for token in special_tokens_list:
1524
            # Get the private one to avoid unnecessary warnings.
1525
            if getattr(tokenizer, f"_{token}") is not None:
1526
                special_token = getattr(tokenizer, token)
1527
                special_tokens_map[special_token] = f"{special_token}a"
1528

1529
        # Train new tokenizer
1530
        new_tokenizer = tokenizer.train_new_from_iterator(
1531
            SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map
1532
        )
1533

1534
        # Check the changes
1535
        for token in special_tokens_list:
1536
            # Get the private one to avoid unnecessary warnings.
1537
            if getattr(tokenizer, f"_{token}") is None:
1538
                continue
1539
            special_token = getattr(tokenizer, token)
1540
            if special_token in special_tokens_map:
1541
                new_special_token = getattr(new_tokenizer, token)
1542
                self.assertEqual(special_tokens_map[special_token], new_special_token)
1543

1544
                new_id = new_tokenizer.get_vocab()[new_special_token]
1545
                self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id)
1546

1547
        # Check if the AddedToken / string format has been kept
1548
        for special_token in tokenizer.all_special_tokens_extended:
1549
            if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map:
1550
                # The special token must appear identically in the list of the new tokenizer.
1551
                self.assertTrue(
1552
                    special_token in new_tokenizer.all_special_tokens_extended,
1553
                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
1554
                )
1555
            elif isinstance(special_token, AddedToken):
1556
                # The special token must appear in the list of the new tokenizer as an object of type AddedToken with
1557
                # the same parameters as the old AddedToken except the content that the user has requested to change.
1558
                special_token_str = special_token.content
1559
                new_special_token_str = special_tokens_map[special_token_str]
1560

1561
                find = False
1562
                for candidate in new_tokenizer.all_special_tokens_extended:
1563
                    if (
1564
                        isinstance(candidate, AddedToken)
1565
                        and candidate.content == new_special_token_str
1566
                        and candidate.lstrip == special_token.lstrip
1567
                        and candidate.rstrip == special_token.rstrip
1568
                        and candidate.normalized == special_token.normalized
1569
                        and candidate.single_word == special_token.single_word
1570
                    ):
1571
                        find = True
1572
                        break
1573
                self.assertTrue(
1574
                    find,
1575
                    f"'{new_special_token_str}' doesn't appear in the list "
1576
                    f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
1577
                    f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
1578
                )
1579
            elif special_token not in special_tokens_map:
1580
                # The special token must appear identically in the list of the new tokenizer.
1581
                self.assertTrue(
1582
                    special_token in new_tokenizer.all_special_tokens_extended,
1583
                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
1584
                )
1585

1586
            else:
1587
                # The special token must appear in the list of the new tokenizer as an object of type string.
1588
                self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended)
1589

1590
        # Test we can use the new tokenizer with something not seen during training
1591
        words = [["this", "is"], ["hello", "🤗"]]
1592
        boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[1, 2, 3, 4], [5, 6, 7, 8]]]
1593
        inputs = new_tokenizer(words, boxes=boxes)
1594
        self.assertEqual(len(inputs["input_ids"]), 2)
1595
        decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
1596
        expected_result = " this is"
1597

1598
        if tokenizer.backend_tokenizer.normalizer is not None:
1599
            expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
1600
        self.assertEqual(expected_result, decoded_input)
1601

1602
    def test_prepare_for_model(self):
1603
        tokenizers = self.get_tokenizers(do_lower_case=False)
1604
        for tokenizer in tokenizers:
1605
            # only test prepare_for_model for the slow tokenizer
1606
            if tokenizer.__class__.__name__ == "LayoutLMv3TokenizerFast":
1607
                continue
1608
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1609
                words, boxes = self.get_words_and_boxes()
1610
                prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True)
1611

1612
                input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
1613

1614
                self.assertEqual(input_dict, prepared_input_dict)
1615

1616
    def test_padding_different_model_input_name(self):
1617
        if not self.test_slow_tokenizer:
1618
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
1619
            return
1620

1621
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1622
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1623
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1624
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1625
                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
1626
                pad_token_id = tokenizer_p.pad_token_id
1627

1628
                words, boxes = self.get_words_and_boxes_batch()
1629

1630
                input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes)
1631
                input_p = tokenizer_r.batch_encode_plus(words, boxes=boxes)
1632

1633
                # rename encoded batch to "inputs"
1634
                input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
1635
                del input_r[tokenizer_r.model_input_names[0]]
1636

1637
                input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
1638
                del input_p[tokenizer_p.model_input_names[0]]
1639

1640
                # Renaming `input_ids` to `inputs`
1641
                tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
1642
                tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]
1643

1644
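                # Padding should now be applied through the renamed "inputs" key instead of "input_ids".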
                input_r = tokenizer_r.pad(input_r, padding="longest")
1645
                input_p = tokenizer_r.pad(input_p, padding="longest")
1646

1647
                max_length = len(input_p["inputs"][0])
1648
                self.assert_batch_padded_input_match(
1649
                    input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
1650
                )
1651

1652
    def test_batch_encode_dynamic_overflowing(self):
1653
        """
1654
        When calling batch_encode with multiple sequences, it can return a different number of
1655
        overflowing encodings for each sequence:
1656
        [
1657
          Sequence 1: [Encoding 1, Encoding 2],
1658
          Sequence 2: [Encoding 1],
1659
          Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
1660
        ]
1661
        This needs to be padded so that it can be represented as a tensor
1662
        """
1663
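        # For example, with max_length=6 and return_overflowing_tokens=True (as used below), the
        # 9-word sequence is split into two encodings while the 3-word one yields a single encoding,
        # e.g. roughly:
        #   input_ids -> [[...6 ids...], [...6 ids...], [...3 ids + padding...]]
        # The fast tokenizer pads these so they can be stacked into one tensor.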
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1664
            tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1665

1666
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"):
1667
                if is_torch_available():
1668
                    returned_tensor = "pt"
1669
                elif is_tf_available():
1670
                    returned_tensor = "tf"
1671
                else:
1672
                    returned_tensor = "jax"
1673

1674
                # Single example
1675
                words = ["HuggingFace", "is", "solving", "NLP", "one", "commit", "at", "a", "time"]
1676
                boxes = [[i, i, i, i] for i in range(len(words))]
1677
                tokens = tokenizer.encode_plus(
1678
                    words,
1679
                    boxes=boxes,
1680
                    max_length=6,
1681
                    padding=True,
1682
                    truncation=True,
1683
                    return_tensors=returned_tensor,
1684
                    return_overflowing_tokens=True,
1685
                )
1686

1687
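                # Every output should gain an extra overflow dimension; bbox also carries 4 coordinates
                # per token, hence rank 3 instead of 2.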
                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
1688
                    if key != "bbox":
1689
                        self.assertEqual(len(tokens[key].shape), 2)
1690
                    else:
1691
                        self.assertEqual(len(tokens[key].shape), 3)
1692

1693
                # Batch of examples
1694
                # For these 2 examples, 3 training examples will be created
1695
                words_batched = [
1696
                    ["HuggingFace", "is", "solving", "NLP", "one", "commit", "at", "a", "time"],
1697
                    ["Very", "tiny", "input"],
1698
                ]
1699
                boxes_batched = [[[i, i, i, i] for i in range(len(words_item))] for words_item in words_batched]
1700
                tokens = tokenizer.batch_encode_plus(
1701
                    words_batched,
1702
                    boxes=boxes_batched,
1703
                    max_length=6,
1704
                    padding=True,
1705
                    truncation="only_first",
1706
                    return_tensors=returned_tensor,
1707
                    return_overflowing_tokens=True,
1708
                )
1709

1710
                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
1711
                    if key != "bbox":
1712
                        self.assertEqual(len(tokens[key].shape), 2)
1713
                        self.assertEqual(tokens[key].shape[-1], 6)
1714
                    else:
1715
                        self.assertEqual(len(tokens[key].shape), 3)
1716
                        self.assertEqual(tokens[key].shape[-1], 4)
1717

1718
    @unittest.skip("TO DO: overwrite this very extensive test.")
1719
    def test_alignement_methods(self):
1720
        pass
1721

1722
    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5):
1723
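        # Keep only (id, token) pairs made of plain letters/spaces whose token encodes back to exactly
        # that single id, so the sequence round-trips cleanly through encode/decode.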
        toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
1724
        toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
1725
        toks = list(
1726
            filter(
1727
                lambda t: [t[0]]
1728
                == tokenizer.encode(t[1].split(" "), boxes=len(t[1]) * [[1, 1, 1, 1]], add_special_tokens=False),
1729
                toks,
1730
            )
1731
        )
1732
        if max_length is not None and len(toks) > max_length:
1733
            toks = toks[:max_length]
1734
        if min_length is not None and len(toks) < min_length and len(toks) > 0:
1735
            while len(toks) < min_length:
1736
                toks = toks + toks
1737
        # toks_str = [t[1] for t in toks]
1738
        toks_ids = [t[0] for t in toks]
1739

1740
        # Ensure consistency
1741
        output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False)
1742
        if " " not in output_txt and len(toks_ids) > 1:
1743
            output_txt = (
1744
                tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False)
1745
                + " "
1746
                + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False)
1747
            )
1748
        if with_prefix_space:
1749
            output_txt = " " + output_txt
1750
        words = output_txt.split(" ")
1751
        boxes = [[i, i, i, i] for i in range(len(words))]
1752
        output_ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1753

1754
        return words, boxes, output_ids
1755

1756
    def test_added_token_with_space_before(self):
1757
        tokenizer_s = self.get_tokenizer()
1758
        tokenizer_f = self.get_rust_tokenizer()
1759

1760
        tokens_to_add = ["AAA", "bbb"]
1761

1762
        words_with_space = [f" {token}" for token in tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys())]
1763
        words_without_space = tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys())
1764
        boxes = [[i, i, i, i] for i in range(len(words_with_space))]
1765

1766
        tokens_to_add_formatted = [
1767
            AddedToken(token, rstrip=True, lstrip=True, single_word=False) for token in tokens_to_add
1768
        ]
1769
        tokenizer_s.add_tokens(tokens_to_add_formatted)
1770
        tokenizer_f.add_tokens(tokens_to_add_formatted)
1771

1772
        ids_s = tokenizer_s(words_with_space, boxes=boxes).input_ids
1773
        ids_f = tokenizer_f(words_with_space, boxes=boxes).input_ids
1774

1775
        tokens_s = tokenizer_s.convert_ids_to_tokens(ids_s)
1776
        tokens_f = tokenizer_f.convert_ids_to_tokens(ids_f)
1777

1778
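        # Encode the added tokens without a leading space as well; both backends should agree on the tokens.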
        ids_s = tokenizer_s(words_without_space, boxes=boxes).input_ids
1779
        ids_f = tokenizer_f(words_without_space, boxes=boxes).input_ids
1780

1781
        tokens_s = tokenizer_s.convert_ids_to_tokens(ids_s)
1782
        tokens_f = tokenizer_f.convert_ids_to_tokens(ids_f)
1783

1784
        self.assertEqual(tokens_s, tokens_f)
1785

1786
    def test_maximum_encoding_length_pair_input(self):
1787
        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
1788
        for tokenizer in tokenizers:
1789
            with self.subTest(f"{tokenizer.__class__.__name__}"):
1790
                # Build a sequence from our model's vocabulary
1791
                stride = 2
1792
                seq_0, boxes_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
1793
                question_0 = " ".join(map(str, seq_0))
1794
                if len(ids) <= 2 + stride:
1795
                    seq_0 = (seq_0 + " ") * (2 + stride)
1796
                    ids = None
1797

1798
                seq0_tokens = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
1799
                seq0_input_ids = seq0_tokens["input_ids"]
1800

1801
                self.assertGreater(len(seq0_input_ids), 2 + stride)
1802
                question_1 = "This is another sentence to be encoded."
1803
                seq_1 = ["what", "a", "weird", "test", "weirdly", "weird"]
1804
                boxes_1 = [[i, i, i, i] for i in range(1, len(seq_1) + 1)]
1805
                seq1_tokens = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
1806
                if abs(len(seq0_input_ids) - len(seq1_tokens["input_ids"])) <= 2:
1807
                    seq1_tokens_input_ids = seq1_tokens["input_ids"] + seq1_tokens["input_ids"]
1808
                    seq_1 = tokenizer.decode(seq1_tokens_input_ids, clean_up_tokenization_spaces=False)
1809
                    seq_1 = seq_1.split(" ")
1810
                    boxes_1 = [[i, i, i, i] for i in range(1, len(seq_1) + 1)]
1811
                seq1_tokens = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
1812
                seq1_input_ids = seq1_tokens["input_ids"]
1813

1814
                self.assertGreater(len(seq1_input_ids), 2 + stride)
1815

1816
                smallest = seq1_input_ids if len(seq0_input_ids) > len(seq1_input_ids) else seq0_input_ids
1817

1818
                # We are not using the special tokens - a bit too hard to test all the tokenizers with this
1819
                # TODO try this again later
1820
                sequence = tokenizer(
1821
                    question_0, seq_1, boxes=boxes_1, add_special_tokens=False
1822
                )  # , add_prefix_space=False)
1823

1824
                # Test with max model input length
1825
                model_max_length = tokenizer.model_max_length
1826
                self.assertEqual(model_max_length, 100)
1827
                seq_2 = seq_0 * model_max_length
1828
                question_2 = " ".join(map(str, seq_2))
1829
                boxes_2 = boxes_0 * model_max_length
1830
                self.assertGreater(len(seq_2), model_max_length)
1831

1832
                sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
1833
                total_length1 = len(sequence1["input_ids"])
1834
                sequence2 = tokenizer(question_2, seq_1, boxes=boxes_1, add_special_tokens=False)
1835
                total_length2 = len(sequence2["input_ids"])
1836
                self.assertLess(total_length1, model_max_length, "Issue with the testing sequence, please update it.")
1837
                self.assertGreater(
1838
                    total_length2, model_max_length, "Issue with the testing sequence, please update it."
1839
                )
1840

1841
                # Check every padding strategy in combination with the different truncation strategies
1842
                padding_strategies = (
1843
                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
1844
                )
1845
                for padding_state in padding_strategies:
1846
                    with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"):
1847
                        for truncation_state in [True, "longest_first", "only_first"]:
1848
                            with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"):
1849
                                output = tokenizer(
1850
                                    question_2,
1851
                                    seq_1,
1852
                                    boxes=boxes_1,
1853
                                    padding=padding_state,
1854
                                    truncation=truncation_state,
1855
                                )
1856
                                self.assertEqual(len(output["input_ids"]), model_max_length)
1857
                                self.assertEqual(len(output["bbox"]), model_max_length)
1858

1859
                                output = tokenizer(
1860
                                    [question_2],
1861
                                    [seq_1],
1862
                                    boxes=[boxes_1],
1863
                                    padding=padding_state,
1864
                                    truncation=truncation_state,
1865
                                )
1866
                                self.assertEqual(len(output["input_ids"][0]), model_max_length)
1867
                                self.assertEqual(len(output["bbox"][0]), model_max_length)
1868

1869
                        # Truncate only the second sequence
1870
                        output = tokenizer(
1871
                            question_1, seq_2, boxes=boxes_2, padding=padding_state, truncation="only_second"
1872
                        )
1873
                        self.assertEqual(len(output["input_ids"]), model_max_length)
1874
                        self.assertEqual(len(output["bbox"]), model_max_length)
1875

1876
                        output = tokenizer(
1877
                            [question_1], [seq_2], boxes=[boxes_2], padding=padding_state, truncation="only_second"
1878
                        )
1879
                        self.assertEqual(len(output["input_ids"][0]), model_max_length)
1880
                        self.assertEqual(len(output["bbox"][0]), model_max_length)
1881

1882
                        # Simple with no truncation
1883
                        # Reset warnings
1884
                        tokenizer.deprecation_warnings = {}
1885
                        with self.assertLogs("transformers", level="WARNING") as cm:
1886
                            output = tokenizer(
1887
                                question_1, seq_2, boxes=boxes_2, padding=padding_state, truncation=False
1888
                            )
1889
                            self.assertNotEqual(len(output["input_ids"]), model_max_length)
1890
                            self.assertNotEqual(len(output["bbox"]), model_max_length)
1891
                        self.assertEqual(len(cm.records), 1)
1892
                        self.assertTrue(
1893
                            cm.records[0].message.startswith(
1894
                                "Token indices sequence length is longer than the specified maximum sequence length"
1895
                                " for this model"
1896
                            )
1897
                        )
1898

1899
                        tokenizer.deprecation_warnings = {}
1900
                        with self.assertLogs("transformers", level="WARNING") as cm:
1901
                            output = tokenizer(
1902
                                [question_1], [seq_2], boxes=[boxes_2], padding=padding_state, truncation=False
1903
                            )
1904
                            self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
1905
                            self.assertNotEqual(len(output["bbox"][0]), model_max_length)
1906
                        self.assertEqual(len(cm.records), 1)
1907
                        self.assertTrue(
1908
                            cm.records[0].message.startswith(
1909
                                "Token indices sequence length is longer than the specified maximum sequence length"
1910
                                " for this model"
1911
                            )
1912
                        )
1913
                # Check the order of the input ids, overflowing tokens and bbox sequences under truncation
1914
                truncated_first_sequence = (
1915
                    tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"][:-2]
1916
                    + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"]
1917
                )
1918
                truncated_second_sequence = (
1919
                    tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"]
1920
                    + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"][:-2]
1921
                )
1922
                truncated_longest_sequence = (
1923
                    truncated_first_sequence
1924
                    if len(seq0_input_ids) > len(seq1_input_ids)
1925
                    else truncated_second_sequence
1926
                )
1927

1928
                overflow_first_sequence = (
1929
                    tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"][-(2 + stride) :]
1930
                    + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"]
1931
                )
1932
                overflow_second_sequence = (
1933
                    tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"]
1934
                    + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"][-(2 + stride) :]
1935
                )
1936
                overflow_longest_sequence = (
1937
                    overflow_first_sequence if len(seq0_input_ids) > len(seq1_input_ids) else overflow_second_sequence
1938
                )
1939

1940
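                # In a (question, words) pair, the question tokens are expected to get the default
                # [0, 0, 0, 0] box, while the word tokens keep their real boxes.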
                bbox_first = [[0, 0, 0, 0]] * (len(seq0_input_ids) - 2)
1941
                bbox_first_sequence = bbox_first + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["bbox"]
1942
                overflowing_token_bbox_first_sequence_slow = [[0, 0, 0, 0]] * (2 + stride)
1943
                overflowing_token_bbox_first_sequence_fast = [[0, 0, 0, 0]] * (2 + stride) + tokenizer(
1944
                    seq_1, boxes=boxes_1, add_special_tokens=False
1945
                )["bbox"]
1946

1947
                bbox_second = [[0, 0, 0, 0]] * len(seq0_input_ids)
1948
                bbox_second_sequence = (
1949
                    bbox_second + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["bbox"][:-2]
1950
                )
1951
                overflowing_token_bbox_second_sequence_slow = tokenizer(
1952
                    seq_1, boxes=boxes_1, add_special_tokens=False
1953
                )["bbox"][-(2 + stride) :]
1954
                overflowing_token_bbox_second_sequence_fast = [[0, 0, 0, 0]] * len(seq0_input_ids) + tokenizer(
1955
                    seq_1, boxes=boxes_1, add_special_tokens=False
1956
                )["bbox"][-(2 + stride) :]
1957

1958
                bbox_longest_sequence = (
1959
                    bbox_first_sequence if len(seq0_tokens) > len(seq1_tokens) else bbox_second_sequence
1960
                )
1961
                overflowing_token_bbox_longest_sequence_fast = (
1962
                    overflowing_token_bbox_first_sequence_fast
1963
                    if len(seq0_tokens) > len(seq1_tokens)
1964
                    else overflowing_token_bbox_second_sequence_fast
1965
                )
1966

1967
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
1968
                if isinstance(tokenizer, LayoutLMv3TokenizerFast):
1969
                    information = tokenizer(
1970
                        question_0,
1971
                        seq_1,
1972
                        boxes=boxes_1,
1973
                        max_length=len(sequence["input_ids"]) - 2,
1974
                        add_special_tokens=False,
1975
                        stride=stride,
1976
                        truncation="longest_first",
1977
                        return_overflowing_tokens=True,
1978
                        # add_prefix_space=False,
1979
                    )
1980
                    truncated_sequence = information["input_ids"][0]
1981
                    overflowing_tokens = information["input_ids"][1]
1982
                    bbox = information["bbox"][0]
1983
                    overflowing_bbox = information["bbox"][1]
1984
                    self.assertEqual(len(information["input_ids"]), 2)
1985

1986
                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
1987
                    self.assertEqual(truncated_sequence, truncated_longest_sequence)
1988

1989
                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
1990
                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
1991
                    self.assertEqual(bbox, bbox_longest_sequence)
1992

1993
                    self.assertEqual(len(overflowing_bbox), 2 + stride + len(smallest))
1994
                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_longest_sequence_fast)
1995
                else:
1996
                    # Python (slow) tokenizers cannot return overflowing tokens for a pair with the 'longest_first' strategy
1997
                    with self.assertRaises(ValueError) as context:
1998
                        information = tokenizer(
1999
                            question_0,
2000
                            seq_1,
2001
                            boxes=boxes_1,
2002
                            max_length=len(sequence["input_ids"]) - 2,
2003
                            add_special_tokens=False,
2004
                            stride=stride,
2005
                            truncation="longest_first",
2006
                            return_overflowing_tokens=True,
2007
                            # add_prefix_space=False,
2008
                        )
2009

2010
                    self.assertTrue(
2011
                        context.exception.args[0].startswith(
2012
                            "Not possible to return overflowing tokens for pair of sequences with the "
2013
                            "`longest_first`. Please select another truncation strategy than `longest_first`, "
2014
                            "for instance `only_second` or `only_first`."
2015
                        )
2016
                    )
2017

2018
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
2019
                if isinstance(tokenizer, LayoutLMv3TokenizerFast):
2020
                    information = tokenizer(
2021
                        question_0,
2022
                        seq_1,
2023
                        boxes=boxes_1,
2024
                        max_length=len(sequence["input_ids"]) - 2,
2025
                        add_special_tokens=False,
2026
                        stride=stride,
2027
                        truncation=True,
2028
                        return_overflowing_tokens=True,
2029
                        # add_prefix_space=False,
2030
                    )
2031
                    truncated_sequence = information["input_ids"][0]
2032
                    overflowing_tokens = information["input_ids"][1]
2033
                    bbox = information["bbox"][0]
2034
                    overflowing_bbox = information["bbox"][1]
2035
                    self.assertEqual(len(information["input_ids"]), 2)
2036

2037
                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
2038
                    self.assertEqual(truncated_sequence, truncated_longest_sequence)
2039

2040
                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
2041
                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
2042
                    self.assertEqual(bbox, bbox_longest_sequence)
2043
                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_longest_sequence_fast)
2044
                else:
2045
                    # Same for truncation=True, which maps to the 'longest_first' strategy for slow tokenizers
2046
                    with self.assertRaises(ValueError) as context:
2047
                        information = tokenizer(
2048
                            question_0,
2049
                            seq_1,
2050
                            boxes=boxes_1,
2051
                            max_length=len(sequence["input_ids"]) - 2,
2052
                            add_special_tokens=False,
2053
                            stride=stride,
2054
                            truncation=True,
2055
                            return_overflowing_tokens=True,
2056
                            # add_prefix_space=False,
2057
                        )
2058

2059
                    self.assertTrue(
2060
                        context.exception.args[0].startswith(
2061
                            "Not possible to return overflowing tokens for pair of sequences with the "
2062
                            "`longest_first`. Please select another truncation strategy than `longest_first`, "
2063
                            "for instance `only_second` or `only_first`."
2064
                        )
2065
                    )
2066

2067
                information_first_truncated = tokenizer(
2068
                    question_0,
2069
                    seq_1,
2070
                    boxes=boxes_1,
2071
                    max_length=len(sequence["input_ids"]) - 2,
2072
                    add_special_tokens=False,
2073
                    stride=stride,
2074
                    truncation="only_first",
2075
                    return_overflowing_tokens=True,
2076
                    # add_prefix_space=False,
2077
                )
2078
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, LayoutLMv3TokenizerFast):
                    truncated_sequence = information_first_truncated["input_ids"][0]
                    overflowing_tokens = information_first_truncated["input_ids"][1]
                    bbox = information_first_truncated["bbox"][0]
                    overflowing_bbox = information_first_truncated["bbox"][0]
                    self.assertEqual(len(information_first_truncated["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
                    self.assertEqual(truncated_sequence, truncated_first_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_input_ids))
                    self.assertEqual(overflowing_tokens, overflow_first_sequence)
                    self.assertEqual(bbox, bbox_first_sequence)
                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_first_sequence_fast)
                else:
                    truncated_sequence = information_first_truncated["input_ids"]
                    overflowing_tokens = information_first_truncated["overflowing_tokens"]
                    overflowing_bbox = information_first_truncated["overflowing_token_boxes"]
                    bbox = information_first_truncated["bbox"]

                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
                    self.assertEqual(truncated_sequence, truncated_first_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, seq0_input_ids[-(2 + stride) :])
                    self.assertEqual(bbox, bbox_first_sequence)
                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_first_sequence_slow)

                information_second_truncated = tokenizer(
                    question_0,
                    seq_1,
                    boxes=boxes_1,
                    max_length=len(sequence["input_ids"]) - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation="only_second",
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )
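                # truncation="only_second" only truncates the second sequence of the pair, so the
                # overflow checked below comes from the end of the second sequence.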
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, LayoutLMv3TokenizerFast):
                    truncated_sequence = information_second_truncated["input_ids"][0]
                    overflowing_tokens = information_second_truncated["input_ids"][1]
                    bbox = information_second_truncated["bbox"][0]
                    overflowing_bbox = information_second_truncated["bbox"][1]

                    self.assertEqual(len(information_second_truncated["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
                    self.assertEqual(truncated_sequence, truncated_second_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_input_ids))
                    self.assertEqual(overflowing_tokens, overflow_second_sequence)
                    self.assertEqual(bbox, bbox_second_sequence)
                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_fast)
                else:
                    truncated_sequence = information_second_truncated["input_ids"]
                    overflowing_tokens = information_second_truncated["overflowing_tokens"]
                    bbox = information_second_truncated["bbox"]
                    overflowing_bbox = information_second_truncated["overflowing_token_boxes"]

                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
                    self.assertEqual(truncated_sequence, truncated_second_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, seq1_input_ids[-(2 + stride) :])
                    self.assertEqual(bbox, bbox_second_sequence)
                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_slow)

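    # Single-sequence counterpart of the pair test above: checks padding/truncation against
    # model_max_length, the "too long" warning, and the split between truncated and overflowing
    # tokens (and their bounding boxes) for both slow and fast tokenizers.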
    def test_maximum_encoding_length_single_input(self):
        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                seq_0, boxes_0, ids = self.get_clean_sequence(tokenizer, max_length=20)

                sequence = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
                total_length = len(sequence["input_ids"])

                self.assertGreater(
                    total_length, 4, "Issue with the testing sequence, please update it, it's too short"
                )

                # Test with max model input length
                model_max_length = tokenizer.model_max_length
                self.assertEqual(model_max_length, 100)
                seq_1 = seq_0 * model_max_length
                boxes_1 = boxes_0 * model_max_length
                sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
                total_length1 = len(sequence1["input_ids"])
                self.assertGreater(
                    total_length1,
                    model_max_length,
                    "Issue with the testing sequence, please update it, it's too short",
                )

                # Simple
                padding_strategies = (
                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
                )
                for padding_state in padding_strategies:
                    with self.subTest(f"Padding: {padding_state}"):
                        for truncation_state in [True, "longest_first", "only_first"]:
                            with self.subTest(f"Truncation: {truncation_state}"):
                                output = tokenizer(
                                    seq_1,
                                    boxes=boxes_1,
                                    padding=padding_state,
                                    truncation=truncation_state,
                                )

                                self.assertEqual(len(output["input_ids"]), model_max_length)
                                self.assertEqual(len(output["bbox"]), model_max_length)

                                output = tokenizer(
                                    [seq_1],
                                    boxes=[boxes_1],
                                    padding=padding_state,
                                    truncation=truncation_state,
                                )
                                self.assertEqual(len(output["input_ids"][0]), model_max_length)
                                self.assertEqual(len(output["bbox"][0]), model_max_length)

                        # Simple with no truncation
                        # Reset warnings
                        tokenizer.deprecation_warnings = {}
                        with self.assertLogs("transformers", level="WARNING") as cm:
                            output = tokenizer(seq_1, boxes=boxes_1, padding=padding_state, truncation=False)
                            self.assertNotEqual(len(output["input_ids"]), model_max_length)
                            self.assertNotEqual(len(output["bbox"]), model_max_length)
                        self.assertEqual(len(cm.records), 1)
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "Token indices sequence length is longer than the specified maximum sequence length"
                                " for this model"
                            )
                        )

                        tokenizer.deprecation_warnings = {}
                        with self.assertLogs("transformers", level="WARNING") as cm:
                            output = tokenizer([seq_1], boxes=[boxes_1], padding=padding_state, truncation=False)
                            self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
                            self.assertNotEqual(len(output["bbox"][0]), model_max_length)
                        self.assertEqual(len(cm.records), 1)
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "Token indices sequence length is longer than the specified maximum sequence length"
                                " for this model"
                            )
                        )
                # Check the order of the sequence of input ids, overflowing tokens and bbox sequence with truncation
                stride = 2
                information = tokenizer(
                    seq_0,
                    boxes=boxes_0,
                    max_length=total_length - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation=True,
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )

                # Overflowing tokens are handled quite differently in slow and fast tokenizers
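                # Fast tokenizers return the overflow as a second sequence in "input_ids"/"bbox",
                # while slow tokenizers expose it under "overflowing_tokens"/"overflowing_token_boxes".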
                if isinstance(tokenizer, LayoutLMv3TokenizerFast):
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    # bbox = information["bbox"][0]
                    # overflowing_bbox = information["bbox"][1]
                    self.assertEqual(len(information["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), total_length - 2)
                    self.assertEqual(truncated_sequence, sequence["input_ids"][:-2])

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, sequence["input_ids"][-(2 + stride) :])

                    # self.assertEqual(bbox, sequence["bbox"][:-2])
                    # self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :])
                else:
                    truncated_sequence = information["input_ids"]
                    overflowing_tokens = information["overflowing_tokens"]
                    # bbox = information["bbox"]
                    # overflowing_bbox = information["overflowing_token_boxes"]
                    self.assertEqual(len(truncated_sequence), total_length - 2)
                    self.assertEqual(truncated_sequence, sequence["input_ids"][:-2])

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, sequence["input_ids"][-(2 + stride) :])
                    # self.assertEqual(bbox, sequence["bbox"][:-2])
                    # self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :])

    @unittest.skip("LayoutLMv3 tokenizer requires boxes besides sequences.")
    def test_pretokenized_inputs(self):
        pass

    @unittest.skip("LayoutLMv3 tokenizer always expects pretokenized inputs.")
    def test_compare_pretokenized_inputs(self):
        pass

    @unittest.skip("LayoutLMv3 fast tokenizer does not support prepare_for_model")
    def test_compare_prepare_for_model(self):
        pass

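    # With only_label_first_subword=True (the default) only the first subword of each word keeps the
    # word label and the remaining subwords get -100; with False the label is repeated on every subword.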
    @slow
    def test_only_label_first_subword(self):
        words = ["hello", "niels", "0000000000000000"]
        boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
        word_labels = [0, 1, 2]

        # test slow tokenizer
        tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
        encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])

        tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
            "microsoft/layoutlmv3-base",
            only_label_first_subword=False,
            add_visual_labels=False,
        )
        encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])

        # test fast tokenizer
        tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
        encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])

        tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained(
            "microsoft/layoutlmv3-base",
2308
            only_label_first_subword=False,
2309
            add_visual_labels=False,
2310
        )
2311
        encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
2312
        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
2313

2314
    @slow
    def test_layoutlmv3_integration_test(self):
        tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
        tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")

        # There are 3 cases:
        # CASE 1: document image classification (training + inference), document image token classification (inference),
        # in which case only words and normalized bounding boxes are provided to the tokenizer
        # CASE 2: document image token classification (training),
        # in which case one also provides word labels to the tokenizer
        # CASE 3: document image visual question answering (inference),
        # in which case one also provides a question to the tokenizer

        # We need to test all 3 cases both on batched and non-batched inputs.

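        # The expected encodings below correspond to the microsoft/layoutlmv3-base checkpoint with
        # padding to max_length=20: padded positions get input id 1 (<pad>), bbox [0, 0, 0, 0],
        # label -100, and attention_mask 0.
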
        # CASE 1: not batched
        words, boxes = self.get_words_and_boxes()

        expected_results = {'input_ids': [0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip

        encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
        encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
        self.assertDictEqual(dict(encoding_p), expected_results)
        self.assertDictEqual(dict(encoding_r), expected_results)

        # CASE 1: batched
        words, boxes = self.get_words_and_boxes_batch()

        expected_results = {'input_ids': [[0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 92, 614, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip

        encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
        encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
        self.assertDictEqual(dict(encoding_p), expected_results)
        self.assertDictEqual(dict(encoding_r), expected_results)

        # CASE 2: not batched
        words, boxes = self.get_words_and_boxes()
        word_labels = [1, 2]

        expected_results = {'input_ids': [0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip

        encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
        encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
        self.assertDictEqual(dict(encoding_p), expected_results)
        self.assertDictEqual(dict(encoding_r), expected_results)

        # CASE 2: batched
        words, boxes = self.get_words_and_boxes_batch()
        word_labels = [[1, 2], [2, 46]]

        expected_results = {'input_ids': [[0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 92, 614, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, 46, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip

        encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
        encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
        self.assertDictEqual(dict(encoding_p), expected_results)
        self.assertDictEqual(dict(encoding_r), expected_results)

        # CASE 3: not batched
        question, words, boxes = self.get_question_words_and_boxes()

        expected_results = {'input_ids': [0, 99, 18, 39, 766, 116, 2, 2, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip

        encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20)
        encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20)
        self.assertDictEqual(dict(encoding_p), expected_results)
        self.assertDictEqual(dict(encoding_r), expected_results)

        # CASE 3: batched
        questions, words, boxes = self.get_question_words_and_boxes_batch()

        expected_results = {'input_ids': [[0, 99, 18, 39, 766, 116, 2, 2, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 141, 16, 37, 373, 116, 2, 2, 13964, 795, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [256, 38, 330, 58], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip

        encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20)
        encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20)
        self.assertDictEqual(dict(encoding_p), expected_results)
        self.assertDictEqual(dict(encoding_r), expected_results)

    @unittest.skip("Doesn't support another framework than PyTorch")
    def test_np_encode_plus_sent_to_model(self):
        pass

    @require_tf
    @slow
    def test_tf_encode_plus_sent_to_model(self):
        from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING

        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING)

        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
                    return

                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
                config = config_class()

                if config.is_encoder_decoder or config.pad_token_id is None:
                    return

                model = model_class(config)

                # Make sure the model contains at least the full vocabulary size in its embedding matrix
                self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))

                # Build sequence
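                # LayoutLMv3 expects one bounding box per input word, so dummy boxes are paired
                # with the first ten vocabulary tokens.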
                first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(first_ten_tokens))]
                encoded_sequence = tokenizer.encode_plus(first_ten_tokens, boxes=boxes, return_tensors="tf")
                batch_encoded_sequence = tokenizer.batch_encode_plus(
                    [first_ten_tokens, first_ten_tokens], boxes=[boxes, boxes], return_tensors="tf"
                )

                # This should not fail
                model(encoded_sequence)
                model(batch_encoded_sequence)

    @unittest.skip("Chat is not supported")
    def test_chat_template(self):
        pass
