# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import json
import os
import re
import shutil
import tempfile
import unittest
from typing import List

from transformers import (
    AddedToken,
    LayoutLMv3TokenizerFast,
    SpecialTokensMixin,
    is_tf_available,
    is_torch_available,
    logging,
)
from transformers.models.layoutlmv3.tokenization_layoutlmv3 import VOCAB_FILES_NAMES, LayoutLMv3Tokenizer
from transformers.testing_utils import (
    is_pt_tf_cross_test,
    require_pandas,
    require_tf,
    require_tokenizers,
    require_torch,
    slow,
)

from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin, merge_model_tokenizer_mappings


logger = logging.get_logger(__name__)


@require_tokenizers
@require_pandas
class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = LayoutLMv3Tokenizer
    rust_tokenizer_class = LayoutLMv3TokenizerFast
    test_rust_tokenizer = True
    # determined by the tokenization algorithm and the way it's decoded by the fast tokenizers
    space_between_special_tokens = False
    test_seq2seq = False
    from_pretrained_kwargs = {"cls_token": "<s>"}

    def get_words_and_boxes(self):
        words = ["lower", "newer"]
        boxes = [[423, 237, 440, 251], [427, 272, 441, 287]]

        return words, boxes

    def get_words_and_boxes_batch(self):
        words = [["lower", "newer"], ["new", "low"]]
        boxes = [
            [[423, 237, 440, 251], [427, 272, 441, 287]],
            [[961, 885, 992, 912], [256, 38, 330, 58]],
        ]

        return words, boxes

    def get_question_words_and_boxes(self):
        question = "what's his name?"
        words = ["lower", "newer"]
        boxes = [[423, 237, 440, 251], [427, 272, 441, 287]]

        return question, words, boxes

    def get_question_words_and_boxes_batch(self):
        questions = ["what's his name?", "how is he called?"]
        words = [["lower", "newer"], ["newer", "lower"]]
        boxes = [
            [[423, 237, 440, 251], [427, 272, 441, 287]],
            [[256, 38, 330, 58], [256, 38, 330, 58]],
        ]

        return questions, words, boxes

    def setUp(self):
        super().setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
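        # Note: "\u0120" renders as "Ġ" and is the byte-level BPE marker for a token that starts with a space.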
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "\u0120",
            "\u0120l",
            "\u0120n",
            "\u0120lo",
            "\u0120low",
            "er",
            "\u0120lowest",
            "\u0120newer",
            "\u0120wider",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return LayoutLMv3TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["Ġlow", "er", "Ġ", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text)  # , add_prefix_space=True)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
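        # The expected ids are the positions of the BPE tokens in the toy vocab from setUp; the trailing 19 is "<unk>".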
        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutlmv3-base")

        question, words, boxes = self.get_question_words_and_boxes()

        text = tokenizer.encode(
            question.split(),
            boxes=[tokenizer.pad_token_box for _ in range(len(question.split()))],
            add_special_tokens=False,
        )
        text_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)

        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

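        # LayoutLMv3 uses the RoBERTa pair template <s> A </s></s> B </s>, where 0 is <s> and 2 is </s>.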
        assert encoded_pair == [0] + text + [2] + [2] + text_2 + [2]

    def test_add_special_tokens(self):
        tokenizers: List[LayoutLMv3Tokenizer] = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                special_token = "[SPECIAL_TOKEN]"
                special_token_box = [1000, 1000, 1000, 1000]

                tokenizer.add_special_tokens({"cls_token": special_token})
                encoded_special_token = tokenizer.encode(
                    [special_token], boxes=[special_token_box], add_special_tokens=False
                )
                self.assertEqual(len(encoded_special_token), 1)

                decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
                self.assertTrue(special_token not in decoded)

    def test_add_tokens_tokenizer(self):
        tokenizers: List[LayoutLMv3Tokenizer] = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                # We usually have added tokens from the start in tests because our vocab fixtures are
                # smaller than the original vocabs - let's not assert this
                # self.assertEqual(vocab_size, all_size)

                new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                words = "aaaaa bbbbbb low cccccccccdddddddd l".split()
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]

                tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)

                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                vocab_size_3 = tokenizer.vocab_size
                all_size_3 = len(tokenizer)

                self.assertNotEqual(vocab_size_3, 0)
                self.assertEqual(vocab_size, vocab_size_3)
                self.assertEqual(added_toks_2, len(new_toks_2))
                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

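                # The newly added eos/pad strings should each encode to a single id above the original vocab size,
                # which is what the assertions below check.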
                words = ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l".split()
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]

                tokens = tokenizer.encode(
                    words,
                    boxes=boxes,
                    add_special_tokens=False,
                )

                self.assertGreaterEqual(len(tokens), 6)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[0], tokens[1])
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokens[-3])
                self.assertEqual(tokens[0], tokenizer.eos_token_id)
                self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    @require_tokenizers
    def test_encode_decode_with_spaces(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
                tokenizer.add_tokens(new_toks)
                input = "[ABC][DEF][ABC][DEF]"
                if self.space_between_special_tokens:
                    output = "[ABC] [DEF] [ABC] [DEF]"
                else:
                    output = input
                encoded = tokenizer.encode(input.split(), boxes=boxes, add_special_tokens=False)
                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
                self.assertIn(decoded, [output, output.lower()])

    @unittest.skip("Not implemented")
    def test_right_and_left_truncation(self):
        pass

    @unittest.skip("Not implemented")
    def test_split_special_tokens(self):
        pass

    def test_encode_plus_with_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                padding_size = 10
                padding_idx = tokenizer.pad_token_id

                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_special_tokens_mask=True)
                input_ids = encoded_sequence["input_ids"]
                special_tokens_mask = encoded_sequence["special_tokens_mask"]
                sequence_length = len(input_ids)

                # Test 'longest' and 'no_padding' don't do anything
                tokenizer.padding_side = "right"

                not_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertTrue(sequence_length == not_padded_sequence_length)
                self.assertTrue(input_ids == not_padded_input_ids)
                self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)

                not_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertTrue(sequence_length == not_padded_sequence_length)
                self.assertTrue(input_ids == not_padded_input_ids)
                self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)

                # Test right padding
                tokenizer.padding_side = "right"

                right_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                right_padded_input_ids = right_padded_sequence["input_ids"]

                right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
                right_padded_sequence_length = len(right_padded_input_ids)

                self.assertTrue(sequence_length + padding_size == right_padded_sequence_length)
                self.assertTrue(input_ids + [padding_idx] * padding_size == right_padded_input_ids)
                self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)

                # Test left padding
                tokenizer.padding_side = "left"
                left_padded_sequence = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                left_padded_input_ids = left_padded_sequence["input_ids"]
                left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                left_padded_sequence_length = len(left_padded_input_ids)

                self.assertTrue(sequence_length + padding_size == left_padded_sequence_length)
                self.assertTrue([padding_idx] * padding_size + input_ids == left_padded_input_ids)
                self.assertTrue([1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask)

                if "token_type_ids" in tokenizer.model_input_names:
                    token_type_ids = encoded_sequence["token_type_ids"]
                    left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                    right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

                    assert token_type_ids + [0] * padding_size == right_padded_token_type_ids
                    assert [0] * padding_size + token_type_ids == left_padded_token_type_ids

                if "attention_mask" in tokenizer.model_input_names:
                    attention_mask = encoded_sequence["attention_mask"]
                    right_padded_attention_mask = right_padded_sequence["attention_mask"]
                    left_padded_attention_mask = left_padded_sequence["attention_mask"]

                    self.assertTrue(attention_mask + [0] * padding_size == right_padded_attention_mask)
                    self.assertTrue([0] * padding_size + attention_mask == left_padded_attention_mask)

    def test_internal_consistency(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                tokens = []
                for word in words:
                    tokens.extend(tokenizer.tokenize(word))
                ids = tokenizer.convert_tokens_to_ids(tokens)
                ids_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                self.assertListEqual(ids, ids_2)

                tokens_2 = tokenizer.convert_ids_to_tokens(ids)
                self.assertNotEqual(len(tokens_2), 0)
                text_2 = tokenizer.decode(ids)
                self.assertIsInstance(text_2, str)

                output_text = " lower newer"
                self.assertEqual(text_2, output_text)

    def test_mask_output(self):
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()

                if (
                    tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
                    and "token_type_ids" in tokenizer.model_input_names
                ):
                    information = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
                    sequences, mask = information["input_ids"], information["token_type_ids"]
                    self.assertEqual(len(sequences), len(mask))

    def test_number_of_added_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # test 1: single sequence
                words, boxes = self.get_words_and_boxes()

                sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                attached_sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)

                # Method is implemented (e.g. not GPT-2)
                if len(attached_sequences) != 2:
                    self.assertEqual(
                        tokenizer.num_special_tokens_to_add(pair=False), len(attached_sequences) - len(sequences)
                    )

                # test 2: two sequences
                question, words, boxes = self.get_question_words_and_boxes()

                sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=False)
                attached_sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=True)

                # Method is implemented (e.g. not GPT-2)
                if len(attached_sequences) != 2:
                    self.assertEqual(
                        tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                    )

    def test_padding_to_max_length(self):
        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` is deprecated."""
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                padding_idx = tokenizer.pad_token_id

                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)
                # FIXME: the next line should be padding(max_length) to avoid warning
                padded_sequence = tokenizer.encode(
                    words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

                # Check that nothing is done when a maximum length is not specified
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

    def test_padding(self, max_length=50):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                pad_token_id = tokenizer_p.pad_token_id

                # Encode - Simple input
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.encode(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode(words, boxes=boxes, padding=True)
                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                # Encode - Pair input
                question, words, boxes = self.get_question_words_and_boxes()
                input_r = tokenizer_r.encode(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                input_r = tokenizer_r.encode(question, words, boxes=boxes, padding=True)
                input_p = tokenizer_p.encode(question, words, boxes=boxes, padding="longest")
                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                # Encode_plus - Simple input
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, padding=True)
                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )

                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Encode_plus - Pair input
                question, words, boxes = self.get_question_words_and_boxes()
                input_r = tokenizer_r.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    question, words, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(question, words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.encode_plus(question, words, boxes=boxes, padding=True)
                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Batch_encode_plus - Simple input
                words, boxes = self.get_words_and_boxes_batch()

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="max_length",
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding="longest",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    words,
                    boxes=boxes,
                    max_length=max_length,
                    padding=True,
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes, padding="longest")
                input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes, padding=True)
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Batch_encode_plus - Pair input
                questions, words, boxes = self.get_question_words_and_boxes_batch()

                input_r = tokenizer_r.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    padding=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    list(zip(questions, words)),
                    is_pair=True,
                    boxes=boxes,
                    padding="longest",
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Using pad on single examples after tokenization
                words, boxes = self.get_words_and_boxes()
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.encode_plus(words, boxes=boxes)
                input_p = tokenizer_r.pad(input_p)

                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )

                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.encode_plus(words, boxes=boxes)
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)

                # Using pad after tokenization
                words, boxes = self.get_words_and_boxes_batch()
                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_p = tokenizer_r.pad(input_p)

                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Using pad after tokenization
                words, boxes = self.get_words_and_boxes_batch()
                input_r = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.batch_encode_plus(
                    words,
                    boxes=boxes,
                )
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

    def test_padding_warning_message_fast_tokenizer(self):
        if not self.test_rust_tokenizer:
            return

        words, boxes = self.get_words_and_boxes_batch()

        tokenizer_fast = self.get_rust_tokenizer()

        encoding_fast = tokenizer_fast(
            words,
            boxes=boxes,
        )

        with self.assertLogs("transformers", level="WARNING") as cm:
            tokenizer_fast.pad(encoding_fast)
            self.assertEqual(len(cm.records), 1)
            self.assertIn(
                "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
                " encode the text followed by a call to the `pad` method to get a padded encoding.",
                cm.records[0].message,
            )

        if not self.test_slow_tokenizer:
            return

        tokenizer_slow = self.get_tokenizer()

        encoding_slow = tokenizer_slow(
            words,
            boxes=boxes,
        )

        with self.assertLogs(level="WARNING") as cm:
            # We want to assert there are no warnings, but the 'assertLogs' method does not support that.
            # Therefore, we are adding a dummy warning, and then we will assert it is the only warning.
            logger.warning("Dummy warning")
            tokenizer_slow.pad(encoding_slow)
            self.assertEqual(len(cm.records), 1)
            self.assertIn(
                "Dummy warning",
                cm.records[0].message,
            )

    def test_call(self):
        # Tests that all calls wrap to encode_plus and batch_encode_plus
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Test not batched
                words, boxes = self.get_words_and_boxes()
                encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test not batched pairs
                question, words, boxes = self.get_question_words_and_boxes()
                encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test batched
                words, boxes = self.get_words_and_boxes_batch()
                encoded_sequences_1 = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes)
                encoded_sequences_2 = tokenizer(words, boxes=boxes)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

    def test_batch_encode_plus_batch_sequence_length(self):
        # Tests that all encoded values have the correct size
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes_batch()

                encoded_sequences = [
                    tokenizer.encode_plus(words_example, boxes=boxes_example)
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes, padding=False)
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

                maximum_length = len(
                    max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
                )

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences_padded = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]

                encoded_sequences_batch_padded = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=True
                )
                self.assertListEqual(
                    encoded_sequences_padded,
                    self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
                )

                # check 'longest' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=True
                )
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest"
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

                # check 'no_padding' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, padding=False
                )
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

    @unittest.skip("batch_encode_plus does not handle overflowing tokens.")
    def test_batch_encode_plus_overflowing_tokens(self):
        pass

    def test_batch_encode_plus_padding(self):
        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus

        # Right padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes_batch()

                max_length = 100

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

        # Left padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                tokenizer.padding_side = "left"
                words, boxes = self.get_words_and_boxes_batch()

                max_length = 100

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, words)

                encoded_sequences = [
                    tokenizer.encode_plus(
                        words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
                    )
                    for words_example, boxes_example in zip(words, boxes)
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(
                    words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length"
                )
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.pad_token is None:
                    self.skipTest("No padding token.")
                else:
                    words, boxes = self.get_words_and_boxes()

                    # empty_tokens = tokenizer([""], [[]], padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer(words, boxes=boxes, padding=True, pad_to_multiple_of=8)
                    # for key, value in empty_tokens.items():
                    #     self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

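                    # Without padding enabled, pad_to_multiple_of alone is not expected to pad anything,
                    # so the lengths below should not be multiples of 8.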
                    normal_tokens = tokenizer(words, boxes=boxes, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # Should also work with truncation
                    normal_tokens = tokenizer(words, boxes=boxes, padding=True, truncation=True, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # truncation to something which is not a multiple of pad_to_multiple_of raises an error
                    self.assertRaises(
                        ValueError,
                        tokenizer.__call__,
                        words,
                        boxes=boxes,
                        padding=True,
                        truncation=True,
                        max_length=12,
                        pad_to_multiple_of=8,
                    )

    def test_tokenizer_slow_store_full_signature(self):
        signature = inspect.signature(self.tokenizer_class.__init__)
        tokenizer = self.get_tokenizer()

        for parameter_name, parameter in signature.parameters.items():
            if parameter.default != inspect.Parameter.empty:
                self.assertIn(parameter_name, tokenizer.init_kwargs)

    def test_build_inputs_with_special_tokens(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # Input tokens id
                words, boxes = self.get_words_and_boxes()
                input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
                input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)

                # Generate output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
                self.assertEqual(output_p, output_r)

                # Generate pair output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                self.assertEqual(output_p, output_r)

    def test_special_tokens_mask_input_pairs(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    words,
                    boxes=boxes,
                    add_special_tokens=True,
                    return_special_tokens_mask=True,
                    # add_prefix_space=False,
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [
                    (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
                ]
                filtered_sequence = [x for x in filtered_sequence if x is not None]
                self.assertEqual(encoded_sequence, filtered_sequence)

    def test_special_tokens_mask(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                # Testing single inputs
                encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
                self.assertEqual(encoded_sequence, filtered_sequence)

    def test_save_and_load_tokenizer(self):
        # safety check on max_len default value so we are sure the test works
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                self.assertNotEqual(tokenizer.model_max_length, 42)

        # Now let's start the test
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Isolate this from the other tests because we save additional tokens/etc
                words, boxes = self.get_words_and_boxes()
                tmpdirname = tempfile.mkdtemp()

                before_tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                before_vocab = tokenizer.get_vocab()
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()
                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)

                shutil.rmtree(tmpdirname)

    def test_right_and_left_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes()
                sequence = "Sequence"
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_idx = tokenizer.pad_token_id

                # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

                # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "left"
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

                # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
                encoded_sequence = tokenizer.encode(words, boxes=boxes)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(words, boxes=boxes, padding=True)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding="longest")
                padded_sequence_left_length = len(padded_sequence_left)
                assert sequence_length == padded_sequence_left_length
                assert encoded_sequence == padded_sequence_left

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(words, boxes=boxes)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding=False)
                padded_sequence_left_length = len(padded_sequence_left)
                assert sequence_length == padded_sequence_left_length
                assert encoded_sequence == padded_sequence_left

    def test_token_type_ids(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # test 1: single sequence
                words, boxes = self.get_words_and_boxes()

                output = tokenizer(words, boxes=boxes, return_token_type_ids=True)

                # Assert that the token type IDs have the same length as the input IDs
                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))

                # Assert that the token type IDs have the same length as the attention mask
                self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"]))

                self.assertIn(0, output["token_type_ids"])
                self.assertNotIn(1, output["token_type_ids"])
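                # LayoutLMv3's tokenizer, like RoBERTa's, does not use segment ids, so all token_type_ids
                # are expected to be 0, which is what the two assertions above check.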

                # test 2: two sequences (question + words)
                question, words, boxes = self.get_question_words_and_boxes()

                output = tokenizer(question, words, boxes, return_token_type_ids=True)

                # Assert that the token type IDs have the same length as the input IDs
                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))

                # Assert that the token type IDs have the same length as the attention mask
                self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"]))

                self.assertIn(0, output["token_type_ids"])

    def test_offsets_mapping(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                text = ["a", "wonderful", "test"]
                boxes = [[1, 8, 12, 20] for _ in range(len(text))]

                # No pair
                tokens_with_offsets = tokenizer_r.encode_plus(
                    text,
                    boxes=boxes,
                    return_special_tokens_mask=True,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )
                added_tokens = tokenizer_r.num_special_tokens_to_add(False)
                offsets = tokens_with_offsets["offset_mapping"]

                # Assert there is the same number of tokens and offsets
                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))

                # Assert there are only `added_tokens` special tokens
                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)

                # Pairs
                text = "what's his name"
                pair = ["a", "wonderful", "test"]
                boxes = [[1, 8, 12, 20] for _ in range(len(pair))]
                tokens_with_offsets = tokenizer_r.encode_plus(
                    text,
                    pair,
                    boxes=boxes,
                    return_special_tokens_mask=True,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )
                added_tokens = tokenizer_r.num_special_tokens_to_add(True)
                offsets = tokens_with_offsets["offset_mapping"]

                # Assert there is the same number of tokens and offsets
                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))

                # Assert there are only `added_tokens` special tokens
                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)

    @require_torch
    @slow
    def test_torch_encode_plus_sent_to_model(self):
        import torch

        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING

        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)

        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
                    return

                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
                config = config_class()

                if config.is_encoder_decoder or config.pad_token_id is None:
                    return

                model = model_class(config)

                # Make sure the model contains at least the full vocabulary size in its embedding matrix
                is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
                assert (
                    (model.get_input_embeddings().weight.shape[0] >= len(tokenizer))
                    if is_using_common_embeddings
                    else True
                )

                # Build sequence
                words, boxes = self.get_words_and_boxes()
                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt")
                batch_encoded_sequence = tokenizer.batch_encode_plus(
                    [words, words], boxes=[boxes, boxes], return_tensors="pt"
                )

                # We add dummy pixel_values keys (as LayoutLMv3 actually also requires a feature extractor
                # to prepare the image input)
                encoded_sequence["pixel_values"] = torch.randn(1, 3, 224, 224)
                batch_encoded_sequence["pixel_values"] = torch.randn(2, 3, 224, 224)
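                # The (3, 224, 224) shape corresponds to LayoutLMv3's default 224x224 RGB image input.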

                # This should not fail
                with torch.no_grad():  # saves some time
                    model(**encoded_sequence)
                    model(**batch_encoded_sequence)

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        words, boxes = self.get_words_and_boxes()

        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
        self.assertListEqual(ids, rust_ids)

    def test_tokenization_python_rust_equals(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                words, boxes = self.get_words_and_boxes()

                # Ensure basic input match
                input_p = tokenizer_p.encode_plus(words, boxes=boxes)
                input_r = tokenizer_r.encode_plus(words, boxes=boxes)

                for key in filter(
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
                ):
                    self.assertSequenceEqual(input_p[key], input_r[key])

                input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes)
                input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes)

                for key in filter(
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
                ):
                    self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])

                words = ["hello" for _ in range(1000)]
                boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)]

                # Ensure truncation match
                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True)

                for key in filter(
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
                ):
                    self.assertSequenceEqual(input_p[key], input_r[key])

                # Ensure truncation with stride match
                input_p = tokenizer_p.encode_plus(
                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
                )
                input_r = tokenizer_r.encode_plus(
                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
                )

                for key in filter(
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
                ):
                    self.assertSequenceEqual(input_p[key], input_r[key][0])

    def test_embeded_special_tokens(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                words, boxes = self.get_words_and_boxes()
                tokens_r = tokenizer_r.encode_plus(
                    words,
                    boxes=boxes,
                    add_special_tokens=True,
                )
                tokens_p = tokenizer_p.encode_plus(
                    words,
                    boxes=boxes,
                    add_special_tokens=True,
                )

                for key in tokens_p.keys():
                    self.assertEqual(tokens_r[key], tokens_p[key])

                if "token_type_ids" in tokens_r:
                    self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))

                tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
                tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
                self.assertSequenceEqual(tokens_r, tokens_p)

    def test_compare_add_special_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)

                words, boxes = self.get_words_and_boxes()
                # tokenize()
                no_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=False)
                with_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=True)
                self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)

                # encode()
                no_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False)
                with_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=True)
                self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)

                # encode_plus()
                no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False)
                with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True)
                for key in no_special_tokens.keys():
                    self.assertEqual(
                        len(no_special_tokens[key]),
                        len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
                    )

                # # batch_encode_plus
                words, boxes = self.get_words_and_boxes_batch()

                no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False)
                with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True)
                for key in no_special_tokens.keys():
                    for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                        self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)

    @slow
    def test_layoutlmv3_truncation_integration_test(self):
        words, boxes = self.get_words_and_boxes()

        tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", model_max_length=512)

        for i in range(12, 512):
            new_encoded_inputs = tokenizer.encode(words, boxes=boxes, max_length=i, truncation=True)

            # Ensure that the input IDs are less than the max length defined.
            self.assertLessEqual(len(new_encoded_inputs), i)

        tokenizer.model_max_length = 20
        new_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True)
        dropped_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True)

        # Ensure that the input IDs are still truncated when no max_length is specified
        self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs)
        self.assertLessEqual(len(new_encoded_inputs), 20)

    @is_pt_tf_cross_test
    def test_batch_encode_plus_tensors(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                words, boxes = self.get_words_and_boxes_batch()

                # A Tensor cannot be built from sequences which are not all the same size
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="pt")
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="tf")

                if tokenizer.pad_token_id is None:
                    self.assertRaises(
                        ValueError,
                        tokenizer.batch_encode_plus,
                        words,
                        boxes=boxes,
                        padding=True,
                        return_tensors="pt",
                    )
                    self.assertRaises(
                        ValueError,
                        tokenizer.batch_encode_plus,
                        words,
                        boxes=boxes,
                        padding="longest",
                        return_tensors="tf",
                    )
                else:
                    pytorch_tensor = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True, return_tensors="pt")
                    tensorflow_tensor = tokenizer.batch_encode_plus(
                        words, boxes=boxes, padding="longest", return_tensors="tf"
                    )
                    encoded_sequences = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True)

                    for key in encoded_sequences.keys():
                        pytorch_value = pytorch_tensor[key].tolist()
                        tensorflow_value = tensorflow_tensor[key].numpy().tolist()
                        encoded_value = encoded_sequences[key]

                        self.assertEqual(pytorch_value, tensorflow_value, encoded_value)

    def test_sequence_ids(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            if not tokenizer.is_fast:
                continue
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                seq_0 = "Test this method."
                seq_1 = ["With", "these", "inputs."]
                boxes = [[1000, 1000, 1000, 1000] for _ in range(len(seq_1))]

                # We want sequence 0 and sequence 1 to be tagged with sequence ids 0 and 1, respectively,
                # regardless of whether the model uses token type ids.
                # We rely on this assumption in the QA pipeline, among other places.
                output = tokenizer(seq_0.split(), boxes=boxes)
                self.assertIn(0, output.sequence_ids())

                output = tokenizer(seq_0, seq_1, boxes=boxes)
                self.assertIn(0, output.sequence_ids())
                self.assertIn(1, output.sequence_ids())

                if tokenizer.num_special_tokens_to_add(pair=True):
                    self.assertIn(None, output.sequence_ids())

1429def test_special_tokens_initialization(self):
1430for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1431with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1432added_tokens = [AddedToken("<special>", lstrip=True)]
1433
1434tokenizer_r = self.rust_tokenizer_class.from_pretrained(
1435pretrained_name, additional_special_tokens=added_tokens, **kwargs
1436)
1437words = "Hey this is a <special> token".split()
1438boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
1439r_output = tokenizer_r.encode(words, boxes=boxes)
1440
1441special_token_id = tokenizer_r.encode(
1442["<special>"], boxes=[1000, 1000, 1000, 1000], add_special_tokens=False
1443)[0]
1444
1445self.assertTrue(special_token_id in r_output)
1446
1447if self.test_slow_tokenizer:
1448tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
1449pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
1450)
1451tokenizer_p = self.tokenizer_class.from_pretrained(
1452pretrained_name, additional_special_tokens=added_tokens, **kwargs
1453)
1454
1455words = "Hey this is a <special> token".split()
1456boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
1457
1458p_output = tokenizer_p.encode(words, boxes=boxes)
1459cr_output = tokenizer_cr.encode(words, boxes=boxes)
1460
1461self.assertEqual(p_output, r_output)
1462self.assertEqual(cr_output, r_output)
1463self.assertTrue(special_token_id in p_output)
1464self.assertTrue(special_token_id in cr_output)
1465
1466def test_training_new_tokenizer(self):
1467# This feature only exists for fast tokenizers
1468if not self.test_rust_tokenizer:
1469return
1470
1471tokenizer = self.get_rust_tokenizer()
1472new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)
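# train_new_from_iterator retrains the underlying tokenizers model on the corpus; the
# second argument (100 here) is the target vocabulary size of the new tokenizer.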
1473
1474# Test we can use the new tokenizer with something not seen during training
1475text = [["this", "is", "the"], ["how", "are", "you"]]
1476boxes = [[[1, 2, 3, 4], [5, 6, 7, 8], [1, 3, 4, 8]], [[5, 6, 7, 8], [4, 5, 6, 7], [3, 9, 2, 7]]]
1477inputs = new_tokenizer(text, boxes=boxes)
1478self.assertEqual(len(inputs["input_ids"]), 2)
1479decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
1480expected_result = " this is the"
1481
1482if tokenizer.backend_tokenizer.normalizer is not None:
1483expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
1484self.assertEqual(expected_result, decoded_input)
1485
1486# We check that the parameters of the tokenizer remained the same
1487# Check we have the same number of added_tokens for both pair and non-pair inputs.
1488self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False))
1489self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True))
1490
1491# Check we have the correct max_length for both pair and non-pair inputs.
1492self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence)
1493self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair)
1494
1495# Assert that the set of special tokens matches, as we didn't ask to change them
1496self.assertSequenceEqual(
1497tokenizer.all_special_tokens_extended,
1498new_tokenizer.all_special_tokens_extended,
1499)
1500
1501self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
1502
1503def test_training_new_tokenizer_with_special_tokens_change(self):
1504# This feature only exists for fast tokenizers
1505if not self.test_rust_tokenizer:
1506return
1507
1508tokenizer = self.get_rust_tokenizer()
1509# Test with a special tokens map
1510class_signature = inspect.signature(tokenizer.__class__)
1511if "cls_token" in class_signature.parameters:
1512new_tokenizer = tokenizer.train_new_from_iterator(
1513SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: "<cls>"}
1514)
1515cls_id = new_tokenizer.get_vocab()["<cls>"]
1516self.assertEqual(new_tokenizer.cls_token, "<cls>")
1517self.assertEqual(new_tokenizer.cls_token_id, cls_id)
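# e.g. passing special_tokens_map={tokenizer.cls_token: "<cls>"} should make the retrained
# tokenizer use "<cls>" as its cls token, with cls_token_id pointing at the new vocabulary
# entry checked above.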
1518
1519# Create a new mapping from the special tokens defined in the original tokenizer
1520special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
1521special_tokens_list.remove("additional_special_tokens")
1522special_tokens_map = {}
1523for token in special_tokens_list:
1524# Get the private one to avoid unnecessary warnings.
1525if getattr(tokenizer, f"_{token}") is not None:
1526special_token = getattr(tokenizer, token)
1527special_tokens_map[special_token] = f"{special_token}a"
1528
1529# Train new tokenizer
1530new_tokenizer = tokenizer.train_new_from_iterator(
1531SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map
1532)
1533
1534# Check the changes
1535for token in special_tokens_list:
1536# Get the private one to avoid unnecessary warnings.
1537if getattr(tokenizer, f"_{token}") is None:
1538continue
1539special_token = getattr(tokenizer, token)
1540if special_token in special_tokens_map:
1541new_special_token = getattr(new_tokenizer, token)
1542self.assertEqual(special_tokens_map[special_token], new_special_token)
1543
1544new_id = new_tokenizer.get_vocab()[new_special_token]
1545self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id)
1546
1547# Check if the AddedToken / string format has been kept
1548for special_token in tokenizer.all_special_tokens_extended:
1549if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map:
1550# The special token must appear identically in the list of the new tokenizer.
1551self.assertTrue(
1552special_token in new_tokenizer.all_special_tokens_extended,
1553f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
1554)
1555elif isinstance(special_token, AddedToken):
1556# The special token must appear in the list of the new tokenizer as an object of type AddedToken with
1557# the same parameters as the old AddedToken except the content that the user has requested to change.
1558special_token_str = special_token.content
1559new_special_token_str = special_tokens_map[special_token_str]
1560
1561find = False
1562for candidate in new_tokenizer.all_special_tokens_extended:
1563if (
1564isinstance(candidate, AddedToken)
1565and candidate.content == new_special_token_str
1566and candidate.lstrip == special_token.lstrip
1567and candidate.rstrip == special_token.rstrip
1568and candidate.normalized == special_token.normalized
1569and candidate.single_word == special_token.single_word
1570):
1571find = True
1572break
1573self.assertTrue(
1574find,
1575f"'{new_special_token_str}' doesn't appear in the list "
1576f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
1577f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
1578)
1579elif special_token not in special_tokens_map:
1580# The special token must appear identically in the list of the new tokenizer.
1581self.assertTrue(
1582special_token in new_tokenizer.all_special_tokens_extended,
1583f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
1584)
1585
1586else:
1587# The special token must appear in the list of the new tokenizer as an object of type string.
1588self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended)
1589
1590# Test we can use the new tokenizer with something not seen during training
1591words = [["this", "is"], ["hello", "🤗"]]
1592boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[1, 2, 3, 4], [5, 6, 7, 8]]]
1593inputs = new_tokenizer(words, boxes=boxes)
1594self.assertEqual(len(inputs["input_ids"]), 2)
1595decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
1596expected_result = " this is"
1597
1598if tokenizer.backend_tokenizer.normalizer is not None:
1599expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
1600self.assertEqual(expected_result, decoded_input)
1601
1602def test_prepare_for_model(self):
1603tokenizers = self.get_tokenizers(do_lower_case=False)
1604for tokenizer in tokenizers:
1605# only test prepare_for_model for the slow tokenizer
1606if tokenizer.__class__.__name__ == "LayoutLMv3TokenizerFast":
1607continue
1608with self.subTest(f"{tokenizer.__class__.__name__}"):
1609words, boxes = self.get_words_and_boxes()
1610prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True)
1611
1612input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
1613
1614self.assertEqual(input_dict, prepared_input_dict)
1615
1616def test_padding_different_model_input_name(self):
1617if not self.test_slow_tokenizer:
1618# as we don't have a slow version, we can't compare the outputs between slow and fast versions
1619return
1620
1621for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1622with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1623tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1624tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1625self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
1626pad_token_id = tokenizer_p.pad_token_id
1627
1628words, boxes = self.get_words_and_boxes_batch()
1629
1630input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes)
1631input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes)
1632
1633# rename encoded batch to "inputs"
1634input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
1635del input_r[tokenizer_r.model_input_names[0]]
1636
1637input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
1638del input_p[tokenizer_p.model_input_names[0]]
1639
1640# Renaming `input_ids` to `inputs`
1641tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
1642tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]
1643
1644input_r = tokenizer_r.pad(input_r, padding="longest")
1645input_p = tokenizer_p.pad(input_p, padding="longest")
1646
1647max_length = len(input_p["inputs"][0])
1648self.assert_batch_padded_input_match(
1649input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
1650)
1651
1652def test_batch_encode_dynamic_overflowing(self):
1653"""
1654When calling batch_encode with multiple sequences, it can return a different number of
1655overflowing encodings for each sequence:
1656[
1657Sequence 1: [Encoding 1, Encoding 2],
1658Sequence 2: [Encoding 1],
1659Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
1660]
1661These need to be padded so that they can be represented as a tensor.
1662"""
1663for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
1664tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
1665
1666with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
1667if is_torch_available():
1668returned_tensor = "pt"
1669elif is_tf_available():
1670returned_tensor = "tf"
1671else:
1672returned_tensor = "jax"
1673
1674# Single example
1675words = ["HuggingFace", "is", "solving", "NLP", "one", "commit", "at", "a", "time"]
1676boxes = [[i, i, i, i] for i in range(len(words))]
1677tokens = tokenizer.encode_plus(
1678words,
1679boxes=boxes,
1680max_length=6,
1681padding=True,
1682truncation=True,
1683return_tensors=returned_tensor,
1684return_overflowing_tokens=True,
1685)
1686
1687for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
1688if key != "bbox":
1689self.assertEqual(len(tokens[key].shape), 2)
1690else:
1691self.assertEqual(len(tokens[key].shape), 3)
1692
1693# Batch of examples
1694# For these 2 examples, 3 training examples will be created
1695words_batched = [
1696["HuggingFace", "is", "solving", "NLP", "one", "commit", "at", "a", "time"],
1697["Very", "tiny", "input"],
1698]
1699boxes_batched = [[[i, i, i, i] for i in range(len(words_item))] for words_item in words_batched]
1700tokens = tokenizer.batch_encode_plus(
1701words_batched,
1702boxes=boxes_batched,
1703max_length=6,
1704padding=True,
1705truncation="only_first",
1706return_tensors=returned_tensor,
1707return_overflowing_tokens=True,
1708)
1709
1710for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
1711if key != "bbox":
1712self.assertEqual(len(tokens[key].shape), 2)
1713self.assertEqual(tokens[key].shape[-1], 6)
1714else:
1715self.assertEqual(len(tokens[key].shape), 3)
1716self.assertEqual(tokens[key].shape[-1], 4)
1717
1718@unittest.skip("TO DO: overwrite this very extensive test.")
1719def test_alignement_methods(self):
1720pass
1721
1722def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5):
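# Helper: builds a small list of "clean" words (each of which decodes from / re-encodes to a
# single id), dummy bounding boxes of the same length, and the corresponding ids.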
1723toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
1724toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
1725toks = list(
1726filter(
1727lambda t: [t[0]]
1728== tokenizer.encode(t[1].split(" "), boxes=len(t[1]) * [[1, 1, 1, 1]], add_special_tokens=False),
1729toks,
1730)
1731)
1732if max_length is not None and len(toks) > max_length:
1733toks = toks[:max_length]
1734if min_length is not None and len(toks) < min_length and len(toks) > 0:
1735while len(toks) < min_length:
1736toks = toks + toks
1737# toks_str = [t[1] for t in toks]
1738toks_ids = [t[0] for t in toks]
1739
1740# Ensure consistency
1741output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False)
1742if " " not in output_txt and len(toks_ids) > 1:
1743output_txt = (
1744tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False)
1745+ " "
1746+ tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False)
1747)
1748if with_prefix_space:
1749output_txt = " " + output_txt
1750words = output_txt.split(" ")
1751boxes = [[i, i, i, i] for i in range(len(words))]
1752output_ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
1753
1754return words, boxes, output_ids
1755
1756def test_added_token_with_space_before(self):
1757tokenizer_s = self.get_tokenizer()
1758tokenizer_f = self.get_rust_tokenizer()
1759
1760tokens_to_add = ["AAA", "bbb"]
1761
1762words_with_space = [f" {token}" for token in tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys())]
1763words_without_space = tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys())
1764boxes = [[i, i, i, i] for i in range(len(words_with_space))]
1765
1766tokens_to_add_formated = [
1767AddedToken(token, rstrip=True, lstrip=True, single_word=False) for token in tokens_to_add
1768]
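# lstrip=True / rstrip=True let the added tokens absorb surrounding whitespace, so the slow
# and fast tokenizers should agree on both the " AAA"-style and "AAA"-style inputs below.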
1769tokenizer_s.add_tokens(tokens_to_add_formated)
1770tokenizer_f.add_tokens(tokens_to_add_formated)
1771
1772ids_s = tokenizer_s(words_with_space, boxes=boxes).input_ids
1773ids_f = tokenizer_f(words_with_space, boxes=boxes).input_ids
1774
1775tokens_s = tokenizer_s.convert_ids_to_tokens(ids_s)
1776tokens_f = tokenizer_f.convert_ids_to_tokens(ids_f)
1777
1778ids_s = tokenizer_s(words_without_space, boxes=boxes).input_ids
1779ids_f = tokenizer_f(words_without_space, boxes=boxes).input_ids
1780
1781tokens_s = tokenizer_s.convert_ids_to_tokens(ids_s)
1782tokens_f = tokenizer_f.convert_ids_to_tokens(ids_f)
1783
1784self.assertEqual(tokens_s, tokens_f)
1785
1786def test_maximum_encoding_length_pair_input(self):
1787tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
1788for tokenizer in tokenizers:
1789with self.subTest(f"{tokenizer.__class__.__name__}"):
1790# Build a sequence from our model's vocabulary
1791stride = 2
1792seq_0, boxes_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
1793question_0 = " ".join(map(str, seq_0))
1794if len(ids) <= 2 + stride:
1795seq_0 = (seq_0 + " ") * (2 + stride)
1796ids = None
1797
1798seq0_tokens = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
1799seq0_input_ids = seq0_tokens["input_ids"]
1800
1801self.assertGreater(len(seq0_input_ids), 2 + stride)
1802question_1 = "This is another sentence to be encoded."
1803seq_1 = ["what", "a", "weird", "test", "weirdly", "weird"]
1804boxes_1 = [[i, i, i, i] for i in range(1, len(seq_1) + 1)]
1805seq1_tokens = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
1806if abs(len(seq0_input_ids) - len(seq1_tokens["input_ids"])) <= 2:
1807seq1_tokens_input_ids = seq1_tokens["input_ids"] + seq1_tokens["input_ids"]
1808seq_1 = tokenizer.decode(seq1_tokens_input_ids, clean_up_tokenization_spaces=False)
1809seq_1 = seq_1.split(" ")
1810boxes_1 = [[i, i, i, i] for i in range(1, len(seq_1) + 1)]
1811seq1_tokens = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
1812seq1_input_ids = seq1_tokens["input_ids"]
1813
1814self.assertGreater(len(seq1_input_ids), 2 + stride)
1815
1816smallest = seq1_input_ids if len(seq0_input_ids) > len(seq1_input_ids) else seq0_input_ids
1817
1818# We are not using the special tokens - a bit too hard to test all the tokenizers with this
1819# TODO try this again later
1820sequence = tokenizer(
1821question_0, seq_1, boxes=boxes_1, add_special_tokens=False
1822) # , add_prefix_space=False)
1823
1824# Test with max model input length
1825model_max_length = tokenizer.model_max_length
1826self.assertEqual(model_max_length, 100)
1827seq_2 = seq_0 * model_max_length
1828question_2 = " ".join(map(str, seq_2))
1829boxes_2 = boxes_0 * model_max_length
1830self.assertGreater(len(seq_2), model_max_length)
1831
1832sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
1833total_length1 = len(sequence1["input_ids"])
1834sequence2 = tokenizer(question_2, seq_1, boxes=boxes_1, add_special_tokens=False)
1835total_length2 = len(sequence2["input_ids"])
1836self.assertLess(total_length1, model_max_length, "Issue with the testing sequence, please update it.")
1837self.assertGreater(
1838total_length2, model_max_length, "Issue with the testing sequence, please update it."
1839)
1840
1841# Simple
1842padding_strategies = (
1843[False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
1844)
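# Only exercise the padded variants when the tokenizer actually defines a pad token;
# otherwise padding=True / "longest" would not be meaningful.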
1845for padding_state in padding_strategies:
1846with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"):
1847for truncation_state in [True, "longest_first", "only_first"]:
1848with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"):
1849output = tokenizer(
1850question_2,
1851seq_1,
1852boxes=boxes_1,
1853padding=padding_state,
1854truncation=truncation_state,
1855)
1856self.assertEqual(len(output["input_ids"]), model_max_length)
1857self.assertEqual(len(output["bbox"]), model_max_length)
1858
1859output = tokenizer(
1860[question_2],
1861[seq_1],
1862boxes=[boxes_1],
1863padding=padding_state,
1864truncation=truncation_state,
1865)
1866self.assertEqual(len(output["input_ids"][0]), model_max_length)
1867self.assertEqual(len(output["bbox"][0]), model_max_length)
1868
1869# Simple
1870output = tokenizer(
1871question_1, seq_2, boxes=boxes_2, padding=padding_state, truncation="only_second"
1872)
1873self.assertEqual(len(output["input_ids"]), model_max_length)
1874self.assertEqual(len(output["bbox"]), model_max_length)
1875
1876output = tokenizer(
1877[question_1], [seq_2], boxes=[boxes_2], padding=padding_state, truncation="only_second"
1878)
1879self.assertEqual(len(output["input_ids"][0]), model_max_length)
1880self.assertEqual(len(output["bbox"][0]), model_max_length)
1881
1882# Simple with no truncation
1883# Reset warnings
1884tokenizer.deprecation_warnings = {}
1885with self.assertLogs("transformers", level="WARNING") as cm:
1886output = tokenizer(
1887question_1, seq_2, boxes=boxes_2, padding=padding_state, truncation=False
1888)
1889self.assertNotEqual(len(output["input_ids"]), model_max_length)
1890self.assertNotEqual(len(output["bbox"]), model_max_length)
1891self.assertEqual(len(cm.records), 1)
1892self.assertTrue(
1893cm.records[0].message.startswith(
1894"Token indices sequence length is longer than the specified maximum sequence length"
1895" for this model"
1896)
1897)
1898
1899tokenizer.deprecation_warnings = {}
1900with self.assertLogs("transformers", level="WARNING") as cm:
1901output = tokenizer(
1902[question_1], [seq_2], boxes=[boxes_2], padding=padding_state, truncation=False
1903)
1904self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
1905self.assertNotEqual(len(output["bbox"][0]), model_max_length)
1906self.assertEqual(len(cm.records), 1)
1907self.assertTrue(
1908cm.records[0].message.startswith(
1909"Token indices sequence length is longer than the specified maximum sequence length"
1910" for this model"
1911)
1912)
1913# Check the order of the sequences of input ids, overflowing tokens and bboxes with truncation
1914truncated_first_sequence = (
1915tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"][:-2]
1916+ tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"]
1917)
1918truncated_second_sequence = (
1919tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"]
1920+ tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"][:-2]
1921)
1922truncated_longest_sequence = (
1923truncated_first_sequence
1924if len(seq0_input_ids) > len(seq1_input_ids)
1925else truncated_second_sequence
1926)
1927
1928overflow_first_sequence = (
1929tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"][-(2 + stride) :]
1930+ tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"]
1931)
1932overflow_second_sequence = (
1933tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"]
1934+ tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"][-(2 + stride) :]
1935)
1936overflow_longest_sequence = (
1937overflow_first_sequence if len(seq0_input_ids) > len(seq1_input_ids) else overflow_second_sequence
1938)
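# Note on the expectations below: fast tokenizers return the overflow as a full second
# encoding (the stride window plus the non-truncated sequence), while slow tokenizers only
# return the tokens that were actually removed from the truncated sequence.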
1939
1940bbox_first = [[0, 0, 0, 0]] * (len(seq0_input_ids) - 2)
1941bbox_first_sequence = bbox_first + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["bbox"]
1942overflowing_token_bbox_first_sequence_slow = [[0, 0, 0, 0]] * (2 + stride)
1943overflowing_token_bbox_first_sequence_fast = [[0, 0, 0, 0]] * (2 + stride) + tokenizer(
1944seq_1, boxes=boxes_1, add_special_tokens=False
1945)["bbox"]
1946
1947bbox_second = [[0, 0, 0, 0]] * len(seq0_input_ids)
1948bbox_second_sequence = (
1949bbox_second + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["bbox"][:-2]
1950)
1951overflowing_token_bbox_second_sequence_slow = tokenizer(
1952seq_1, boxes=boxes_1, add_special_tokens=False
1953)["bbox"][-(2 + stride) :]
1954overflowing_token_bbox_second_sequence_fast = [[0, 0, 0, 0]] * len(seq0_input_ids) + tokenizer(
1955seq_1, boxes=boxes_1, add_special_tokens=False
1956)["bbox"][-(2 + stride) :]
1957
1958bbox_longest_sequence = (
1959bbox_first_sequence if len(seq0_tokens) > len(seq1_tokens) else bbox_second_sequence
1960)
1961overflowing_token_bbox_longest_sequence_fast = (
1962overflowing_token_bbox_first_sequence_fast
1963if len(seq0_tokens) > len(seq1_tokens)
1964else overflowing_token_bbox_second_sequence_fast
1965)
1966
1967# Overflowing tokens are handled quite differently in slow and fast tokenizers
1968if isinstance(tokenizer, LayoutLMv3TokenizerFast):
1969information = tokenizer(
1970question_0,
1971seq_1,
1972boxes=boxes_1,
1973max_length=len(sequence["input_ids"]) - 2,
1974add_special_tokens=False,
1975stride=stride,
1976truncation="longest_first",
1977return_overflowing_tokens=True,
1978# add_prefix_space=False,
1979)
1980truncated_sequence = information["input_ids"][0]
1981overflowing_tokens = information["input_ids"][1]
1982bbox = information["bbox"][0]
1983overflowing_bbox = information["bbox"][1]
1984self.assertEqual(len(information["input_ids"]), 2)
1985
1986self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
1987self.assertEqual(truncated_sequence, truncated_longest_sequence)
1988
1989self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
1990self.assertEqual(overflowing_tokens, overflow_longest_sequence)
1991self.assertEqual(bbox, bbox_longest_sequence)
1992
1993self.assertEqual(len(overflowing_bbox), 2 + stride + len(smallest))
1994self.assertEqual(overflowing_bbox, overflowing_token_bbox_longest_sequence_fast)
1995else:
1996# Python (slow) tokenizers cannot return overflowing tokens for a pair of sequences with 'longest_first' truncation
1997with self.assertRaises(ValueError) as context:
1998information = tokenizer(
1999question_0,
2000seq_1,
2001boxes=boxes_1,
2002max_length=len(sequence["input_ids"]) - 2,
2003add_special_tokens=False,
2004stride=stride,
2005truncation="longest_first",
2006return_overflowing_tokens=True,
2007# add_prefix_space=False,
2008)
2009
2010self.assertTrue(
2011context.exception.args[0].startswith(
2012"Not possible to return overflowing tokens for pair of sequences with the "
2013"`longest_first`. Please select another truncation strategy than `longest_first`, "
2014"for instance `only_second` or `only_first`."
2015)
2016)
2017
2018# Overflowing tokens are handled quite differently in slow and fast tokenizers
2019if isinstance(tokenizer, LayoutLMv3TokenizerFast):
2020information = tokenizer(
2021question_0,
2022seq_1,
2023boxes=boxes_1,
2024max_length=len(sequence["input_ids"]) - 2,
2025add_special_tokens=False,
2026stride=stride,
2027truncation=True,
2028return_overflowing_tokens=True,
2029# add_prefix_space=False,
2030)
2031truncated_sequence = information["input_ids"][0]
2032overflowing_tokens = information["input_ids"][1]
2033bbox = information["bbox"][0]
2034overflowing_bbox = information["bbox"][1]
2035self.assertEqual(len(information["input_ids"]), 2)
2036
2037self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
2038self.assertEqual(truncated_sequence, truncated_longest_sequence)
2039
2040self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
2041self.assertEqual(overflowing_tokens, overflow_longest_sequence)
2042self.assertEqual(bbox, bbox_longest_sequence)
2043self.assertEqual(overflowing_bbox, overflowing_token_bbox_longest_sequence_fast)
2044else:
2045# Python (slow) tokenizers cannot return overflowing tokens for a pair of sequences with 'longest_first' truncation
2046with self.assertRaises(ValueError) as context:
2047information = tokenizer(
2048question_0,
2049seq_1,
2050boxes=boxes_1,
2051max_length=len(sequence["input_ids"]) - 2,
2052add_special_tokens=False,
2053stride=stride,
2054truncation=True,
2055return_overflowing_tokens=True,
2056# add_prefix_space=False,
2057)
2058
2059self.assertTrue(
2060context.exception.args[0].startswith(
2061"Not possible to return overflowing tokens for pair of sequences with the "
2062"`longest_first`. Please select another truncation strategy than `longest_first`, "
2063"for instance `only_second` or `only_first`."
2064)
2065)
2066
2067information_first_truncated = tokenizer(
2068question_0,
2069seq_1,
2070boxes=boxes_1,
2071max_length=len(sequence["input_ids"]) - 2,
2072add_special_tokens=False,
2073stride=stride,
2074truncation="only_first",
2075return_overflowing_tokens=True,
2076# add_prefix_space=False,
2077)
2078# Overflowing tokens are handled quite differently in slow and fast tokenizers
2079if isinstance(tokenizer, LayoutLMv3TokenizerFast):
2080truncated_sequence = information_first_truncated["input_ids"][0]
2081overflowing_tokens = information_first_truncated["input_ids"][1]
2082bbox = information_first_truncated["bbox"][0]
2083overflowing_bbox = information_first_truncated["bbox"][1]
2084self.assertEqual(len(information_first_truncated["input_ids"]), 2)
2085
2086self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
2087self.assertEqual(truncated_sequence, truncated_first_sequence)
2088
2089self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_input_ids))
2090self.assertEqual(overflowing_tokens, overflow_first_sequence)
2091self.assertEqual(bbox, bbox_first_sequence)
2092self.assertEqual(overflowing_bbox, overflowing_token_bbox_first_sequence_fast)
2093else:
2094truncated_sequence = information_first_truncated["input_ids"]
2095overflowing_tokens = information_first_truncated["overflowing_tokens"]
2096overflowing_bbox = information_first_truncated["overflowing_token_boxes"]
2097bbox = information_first_truncated["bbox"]
2098
2099self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
2100self.assertEqual(truncated_sequence, truncated_first_sequence)
2101
2102self.assertEqual(len(overflowing_tokens), 2 + stride)
2103self.assertEqual(overflowing_tokens, seq0_input_ids[-(2 + stride) :])
2104self.assertEqual(bbox, bbox_first_sequence)
2105self.assertEqual(overflowing_bbox, overflowing_token_bbox_first_sequence_slow)
2106
2107information_second_truncated = tokenizer(
2108question_0,
2109seq_1,
2110boxes=boxes_1,
2111max_length=len(sequence["input_ids"]) - 2,
2112add_special_tokens=False,
2113stride=stride,
2114truncation="only_second",
2115return_overflowing_tokens=True,
2116# add_prefix_space=False,
2117)
2118# Overflowing tokens are handled quite differently in slow and fast tokenizers
2119if isinstance(tokenizer, LayoutLMv3TokenizerFast):
2120truncated_sequence = information_second_truncated["input_ids"][0]
2121overflowing_tokens = information_second_truncated["input_ids"][1]
2122bbox = information_second_truncated["bbox"][0]
2123overflowing_bbox = information_second_truncated["bbox"][1]
2124
2125self.assertEqual(len(information_second_truncated["input_ids"]), 2)
2126
2127self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
2128self.assertEqual(truncated_sequence, truncated_second_sequence)
2129
2130self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_input_ids))
2131self.assertEqual(overflowing_tokens, overflow_second_sequence)
2132self.assertEqual(bbox, bbox_second_sequence)
2133self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_fast)
2134else:
2135truncated_sequence = information_second_truncated["input_ids"]
2136overflowing_tokens = information_second_truncated["overflowing_tokens"]
2137bbox = information_second_truncated["bbox"]
2138overflowing_bbox = information_second_truncated["overflowing_token_boxes"]
2139
2140self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
2141self.assertEqual(truncated_sequence, truncated_second_sequence)
2142
2143self.assertEqual(len(overflowing_tokens), 2 + stride)
2144self.assertEqual(overflowing_tokens, seq1_input_ids[-(2 + stride) :])
2145self.assertEqual(bbox, bbox_second_sequence)
2146self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_slow)
2147
2148def test_maximum_encoding_length_single_input(self):
2149tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
2150for tokenizer in tokenizers:
2151with self.subTest(f"{tokenizer.__class__.__name__}"):
2152seq_0, boxes_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
2153
2154sequence = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
2155total_length = len(sequence["input_ids"])
2156
2157self.assertGreater(
2158total_length, 4, "Issue with the testing sequence, please update it, it's too short"
2159)
2160
2161# Test with max model input length
2162model_max_length = tokenizer.model_max_length
2163self.assertEqual(model_max_length, 100)
2164seq_1 = seq_0 * model_max_length
2165boxes_1 = boxes_0 * model_max_length
2166sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
2167total_length1 = len(sequence1["input_ids"])
2168self.assertGreater(
2169total_length1,
2170model_max_length,
2171"Issue with the testing sequence, please update it, it's too short",
2172)
2173
2174# Simple
2175padding_strategies = (
2176[False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
2177)
2178for padding_state in padding_strategies:
2179with self.subTest(f"Padding: {padding_state}"):
2180for truncation_state in [True, "longest_first", "only_first"]:
2181with self.subTest(f"Truncation: {truncation_state}"):
2182output = tokenizer(
2183seq_1,
2184boxes=boxes_1,
2185padding=padding_state,
2186truncation=truncation_state,
2187)
2188
2189self.assertEqual(len(output["input_ids"]), model_max_length)
2190self.assertEqual(len(output["bbox"]), model_max_length)
2191
2192output = tokenizer(
2193[seq_1],
2194boxes=[boxes_1],
2195padding=padding_state,
2196truncation=truncation_state,
2197)
2198self.assertEqual(len(output["input_ids"][0]), model_max_length)
2199self.assertEqual(len(output["bbox"][0]), model_max_length)
2200
2201# Simple with no truncation
2202# Reset warnings
2203tokenizer.deprecation_warnings = {}
2204with self.assertLogs("transformers", level="WARNING") as cm:
2205output = tokenizer(seq_1, boxes=boxes_1, padding=padding_state, truncation=False)
2206self.assertNotEqual(len(output["input_ids"]), model_max_length)
2207self.assertNotEqual(len(output["bbox"]), model_max_length)
2208self.assertEqual(len(cm.records), 1)
2209self.assertTrue(
2210cm.records[0].message.startswith(
2211"Token indices sequence length is longer than the specified maximum sequence length"
2212" for this model"
2213)
2214)
2215
2216tokenizer.deprecation_warnings = {}
2217with self.assertLogs("transformers", level="WARNING") as cm:
2218output = tokenizer([seq_1], boxes=[boxes_1], padding=padding_state, truncation=False)
2219self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
2220self.assertNotEqual(len(output["bbox"][0]), model_max_length)
2221self.assertEqual(len(cm.records), 1)
2222self.assertTrue(
2223cm.records[0].message.startswith(
2224"Token indices sequence length is longer than the specified maximum sequence length"
2225" for this model"
2226)
2227)
2228# Check the order of the sequence of input ids, overflowing tokens and bboxes with truncation
2229stride = 2
2230information = tokenizer(
2231seq_0,
2232boxes=boxes_0,
2233max_length=total_length - 2,
2234add_special_tokens=False,
2235stride=stride,
2236truncation=True,
2237return_overflowing_tokens=True,
2238# add_prefix_space=False,
2239)
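# With stride=2 the overflowing slice should contain the 2 truncated tokens plus a 2-token
# overlap with the kept part, i.e. sequence["input_ids"][-(2 + stride):].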
2240
2241# Overflowing tokens are handled quite differently in slow and fast tokenizers
2242if isinstance(tokenizer, LayoutLMv3TokenizerFast):
2243truncated_sequence = information["input_ids"][0]
2244overflowing_tokens = information["input_ids"][1]
2245# bbox = information["bbox"][0]
2246# overflowing_bbox = information["bbox"][1]
2247self.assertEqual(len(information["input_ids"]), 2)
2248
2249self.assertEqual(len(truncated_sequence), total_length - 2)
2250self.assertEqual(truncated_sequence, sequence["input_ids"][:-2])
2251
2252self.assertEqual(len(overflowing_tokens), 2 + stride)
2253self.assertEqual(overflowing_tokens, sequence["input_ids"][-(2 + stride) :])
2254
2255# self.assertEqual(bbox, sequence["bbox"][:-2])
2256# self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :])
2257else:
2258truncated_sequence = information["input_ids"]
2259overflowing_tokens = information["overflowing_tokens"]
2260# bbox = information["bbox"]
2261# overflowing_bbox = information["overflowing_token_boxes"]
2262self.assertEqual(len(truncated_sequence), total_length - 2)
2263self.assertEqual(truncated_sequence, sequence["input_ids"][:-2])
2264
2265self.assertEqual(len(overflowing_tokens), 2 + stride)
2266self.assertEqual(overflowing_tokens, sequence["input_ids"][-(2 + stride) :])
2267# self.assertEqual(bbox, sequence["bbox"][:-2])
2268# self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :])
2269
2270@unittest.skip("LayoutLMv3 tokenizer requires boxes besides sequences.")
2271def test_pretokenized_inputs(self):
2272pass
2273
2274@unittest.skip("LayoutLMv3 tokenizer always expects pretokenized inputs.")
2275def test_compare_pretokenized_inputs(self):
2276pass
2277
2278@unittest.skip("LayoutLMv3 fast tokenizer does not support prepare_for_model")
2279def test_compare_prepare_for_model(self):
2280pass
2281
2282@slow
2283def test_only_label_first_subword(self):
2284words = ["hello", "niels", "0000000000000000"]
2285boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
2286word_labels = [0, 1, 2]
2287
2288# test slow tokenizer
2289tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
2290encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
2291self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
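# The long "0000000000000000" word is split into multiple subword tokens; with the default
# only_label_first_subword=True only the first subword keeps the word label (2 here) and the
# remaining subwords get -100, which is conventionally ignored by the loss.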
2292
2293tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
2294"microsoft/layoutlmv3-base",
2295only_label_first_subword=False,
2296add_visual_labels=False,
2297)
2298encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
2299self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
2300
2301# test fast tokenizer
2302tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
2303encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
2304self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
2305
2306tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained(
2307"microsoft/layoutlmv3-base",
2308only_label_first_subword=False,
2309add_visual_labels=False,
2310)
2311encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
2312self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
2313
2314@slow
2315def test_layoutlmv3_integration_test(self):
2316tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
2317tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
2318
2319# There are 3 cases:
2320# CASE 1: document image classification (training + inference), document image token classification (inference),
2321# in which case only words and normalized bounding boxes are provided to the tokenizer
2322# CASE 2: document image token classification (training),
2323# in which case one also provides word labels to the tokenizer
2324# CASE 3: document image visual question answering (inference),
2325# in which case one also provides a question to the tokenizer
2326
2327# We need to test all 3 cases both on batched and non-batched inputs.
2328
2329# CASE 1: not batched
2330words, boxes = self.get_words_and_boxes()
2331
2332expected_results = {'input_ids': [0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # fmt: skip
2333
2334encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
2335encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
2336self.assertDictEqual(dict(encoding_p), expected_results)
2337self.assertDictEqual(dict(encoding_r), expected_results)
2338
2339# CASE 1: batched
2340words, boxes = self.get_words_and_boxes_batch()
2341
2342expected_results = {'input_ids': [[0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 92, 614, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
2343
2344encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
2345encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
2346self.assertDictEqual(dict(encoding_p), expected_results)
2347self.assertDictEqual(dict(encoding_r), expected_results)
2348
2349# CASE 2: not batched
2350words, boxes = self.get_words_and_boxes()
2351word_labels = [1, 2]
2352
2353expected_results = {'input_ids': [0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # fmt: skip
2354
2355encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
2356encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
2357self.assertDictEqual(dict(encoding_p), expected_results)
2358self.assertDictEqual(dict(encoding_r), expected_results)
2359
2360# CASE 2: batched
2361words, boxes = self.get_words_and_boxes_batch()
2362word_labels = [[1, 2], [2, 46]]
2363
2364expected_results = {'input_ids': [[0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 92, 614, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, 46, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
2365
2366encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
2367encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
2368self.assertDictEqual(dict(encoding_p), expected_results)
2369self.assertDictEqual(dict(encoding_r), expected_results)
2370
2371# CASE 3: not batched
2372question, words, boxes = self.get_question_words_and_boxes()
2373
2374expected_results = {'input_ids': [0, 99, 18, 39, 766, 116, 2, 2, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # fmt: skip
2375
2376encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20)
2377encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20)
2378self.assertDictEqual(dict(encoding_p), expected_results)
2379self.assertDictEqual(dict(encoding_r), expected_results)
2380
2381# CASE 3: batched
2382questions, words, boxes = self.get_question_words_and_boxes_batch()
2383
2384expected_results = {'input_ids': [[0, 99, 18, 39, 766, 116, 2, 2, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 141, 16, 37, 373, 116, 2, 2, 13964, 795, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [256, 38, 330, 58], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
2385
2386encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20)
2387encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20)
2388self.assertDictEqual(dict(encoding_p), expected_results)
2389self.assertDictEqual(dict(encoding_r), expected_results)
2390
2391@unittest.skip("Doesn't support another framework than PyTorch")
2392def test_np_encode_plus_sent_to_model(self):
2393pass
2394
2395@require_tf
2396@slow
2397def test_tf_encode_plus_sent_to_model(self):
2398from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING
2399
2400MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING)
2401
2402tokenizers = self.get_tokenizers(do_lower_case=False)
2403for tokenizer in tokenizers:
2404with self.subTest(f"{tokenizer.__class__.__name__}"):
2405if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
2406return
2407
2408config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
2409config = config_class()
2410
2411if config.is_encoder_decoder or config.pad_token_id is None:
2412return
2413
2414model = model_class(config)
2415
2416# Make sure the model contains at least the full vocabulary size in its embedding matrix
2417self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))
2418
2419# Build sequence
2420first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
2421boxes = [[1000, 1000, 1000, 1000] for _ in range(len(first_ten_tokens))]
2422encoded_sequence = tokenizer.encode_plus(first_ten_tokens, boxes=boxes, return_tensors="tf")
2423batch_encoded_sequence = tokenizer.batch_encode_plus(
2424[first_ten_tokens, first_ten_tokens], boxes=[boxes, boxes], return_tensors="tf"
2425)
2426
2427# This should not fail
2428model(encoded_sequence)
2429model(batch_encoded_sequence)
2430
2431@unittest.skip("Chat is not supported")
2432def test_chat_template(self):
2433pass
2434