transformers

test_processor_layoutxlm.py
480 строк · 23.4 Кб
Перенос по словам
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14

15
import json
16
import os
17
import shutil
18
import tempfile
19
import unittest
20
from typing import List
21

22
import numpy as np
23

24
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
25
from transformers.models.layoutxlm import LayoutXLMTokenizer, LayoutXLMTokenizerFast
26
from transformers.testing_utils import (
27
    require_pytesseract,
28
    require_sentencepiece,
29
    require_tokenizers,
30
    require_torch,
31
    slow,
32
)
33
from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
34

35

36
if is_pytesseract_available():
37
    from PIL import Image
38

39
    from transformers import LayoutLMv2ImageProcessor, LayoutXLMProcessor
40

41

42
@require_pytesseract
@require_sentencepiece
@require_tokenizers
class LayoutXLMProcessorTest(unittest.TestCase):
    """Tests for building, saving, reloading and calling `LayoutXLMProcessor`.

    Every test works from a temporary directory created in `setUp`, which holds an
    image-processor configuration file, and from the tiny tokenizer checkpoint named
    in `self.tokenizer_pretrained_name`.
    """

    tokenizer_class = LayoutXLMTokenizer
    rust_tokenizer_class = LayoutXLMTokenizerFast

    def setUp(self):
        """Create the temporary directory and write the image-processor config into it."""
        image_processor_map = {
            "do_resize": True,
            "size": 224,
            "apply_ocr": True,
        }

        self.tmpdirname = tempfile.mkdtemp()
        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(image_processor_map) + "\n")

        # taken from `test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_save_pretrained`
        self.tokenizer_pretrained_name = "hf-internal-testing/tiny-random-layoutxlm"

    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
        """Load the slow tokenizer from the test checkpoint, forwarding `kwargs` to `from_pretrained`."""
        return self.tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)

    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
        """Load the fast tokenizer from the test checkpoint, forwarding `kwargs` to `from_pretrained`."""
        return self.rust_tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)

    def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
        """Return `[slow_tokenizer, fast_tokenizer]`, both loaded with the same `kwargs`."""
        return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]

    def get_image_processor(self, **kwargs):
        """Load a `LayoutLMv2ImageProcessor` from the temporary directory, forwarding `kwargs`."""
        return LayoutLMv2ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)

    def tearDown(self):
        """Remove the temporary directory created in `setUp`."""
        shutil.rmtree(self.tmpdirname)

    def prepare_image_inputs(self):
        """Return a list containing one random PIL image.

        The image is built from a random `uint8` array of shape `(3, 30, 400)` whose
        channel axis is moved last before it is handed to `Image.fromarray`.
        """

        image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]

        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]

        return image_inputs

    def test_save_load_pretrained_default(self):
        """A processor saved and reloaded without overrides keeps its tokenizer vocab and image-processor config."""
        image_processor = self.get_image_processor()
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(tokenizer=type(tokenizer).__name__):
                processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)

                processor.save_pretrained(self.tmpdirname)
                processor = LayoutXLMProcessor.from_pretrained(self.tmpdirname)

                self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
                self.assertIsInstance(processor.tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast))

                self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
                self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)

    def test_save_load_pretrained_additional_features(self):
        """Extra keyword arguments given to `from_pretrained` reach both the tokenizer and the image processor.

        The check is run once asking for the slow tokenizer and once asking for the fast one.
        """
        processor = LayoutXLMProcessor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
        processor.save_pretrained(self.tmpdirname)

        # (value of `use_fast`, loader for the reference tokenizer, tokenizer class expected on the processor)
        variants = (
            (False, self.get_tokenizer, LayoutXLMTokenizer),
            (True, self.get_rust_tokenizer, LayoutXLMTokenizerFast),
        )

        for use_fast, load_reference_tokenizer, expected_tokenizer_class in variants:
            with self.subTest(use_fast=use_fast):
                tokenizer_add_kwargs = load_reference_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
                image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)

                # `use_fast` is passed explicitly in both variants so each one states which tokenizer it expects
                processor = LayoutXLMProcessor.from_pretrained(
                    self.tmpdirname,
                    use_fast=use_fast,
                    bos_token="(BOS)",
                    eos_token="(EOS)",
                    do_resize=False,
                    size=30,
                )

                self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
                self.assertIsInstance(processor.tokenizer, expected_tokenizer_class)

                self.assertEqual(
                    processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()
                )
                self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)

    def test_model_input_names(self):
        """The keys returned by the processor match `processor.model_input_names`, in the same order."""
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()

        processor = LayoutXLMProcessor(tokenizer=tokenizer, image_processor=image_processor)

        input_str = "lower newer"
        image_input = self.prepare_image_inputs()

        # add extra args
        inputs = processor(text=input_str, images=image_input, return_codebook_pixels=False, return_image_mask=False)

        self.assertListEqual(list(inputs.keys()), processor.model_input_names)

    @slow
    def test_overflowing_tokens(self):
        """With overflowing tokens enabled, the number of returned images equals the number of `input_ids` rows."""
        # In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences).

        from datasets import load_dataset

        # set up
        datasets = load_dataset("nielsr/funsd")
        processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)

        def preprocess_data(examples):
            """Encode a batch of examples (images, words, boxes, labels) with the processor."""
            images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
            words = examples["words"]
            boxes = examples["bboxes"]
            word_labels = examples["ner_tags"]
            encoded_inputs = processor(
                images,
                words,
                boxes=boxes,
                word_labels=word_labels,
                max_length=512,
                padding="max_length",
                truncation=True,
                return_overflowing_tokens=True,
                stride=50,
                return_offsets_mapping=True,
                return_tensors="pt",
            )
            return encoded_inputs

        train_data = preprocess_data(datasets["train"])

        self.assertEqual(len(train_data["image"]), len(train_data["input_ids"]))
189

190

191
# different use cases tests
192
@require_sentencepiece
193
@require_torch
194
@require_pytesseract
195
class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
196
    @cached_property
197
    def get_images(self):
198
        # we verify our implementation on 2 document images from the DocVQA dataset
199
        from datasets import load_dataset
200

201
        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
202

203
        image_1 = Image.open(ds[0]["file"]).convert("RGB")
204
        image_2 = Image.open(ds[1]["file"]).convert("RGB")
205

206
        return image_1, image_2
207

208
    @cached_property
209
    def get_tokenizers(self):
210
        slow_tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
211
        fast_tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
212
        return [slow_tokenizer, fast_tokenizer]
213

214
    @slow
215
    def test_processor_case_1(self):
216
        # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
217

218
        image_processor = LayoutLMv2ImageProcessor()
219
        tokenizers = self.get_tokenizers
220
        images = self.get_images
221

222
        for tokenizer in tokenizers:
223
            processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
224

225
            # not batched
226
            input_feat_extract = image_processor(images[0], return_tensors="pt")
227
            input_processor = processor(images[0], return_tensors="pt")
228

229
            # verify keys
230
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
231
            actual_keys = sorted(input_processor.keys())
232
            self.assertListEqual(actual_keys, expected_keys)
233

234
            # verify image
235
            self.assertAlmostEqual(
236
                input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
237
            )
238

239
            # verify input_ids
240
            # this was obtained with Tesseract 4.1.1
241
            expected_decoding = "<s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>"  # fmt: skip
242
            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
243
            self.assertSequenceEqual(decoding, expected_decoding)
244

245
            # batched
246
            input_feat_extract = image_processor(images, return_tensors="pt")
247
            input_processor = processor(images, padding=True, return_tensors="pt")
248

249
            # verify keys
250
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
251
            actual_keys = sorted(input_processor.keys())
252
            self.assertListEqual(actual_keys, expected_keys)
253

254
            # verify images
255
            self.assertAlmostEqual(
256
                input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
257
            )
258

259
            # verify input_ids
260
            # this was obtained with Tesseract 4.1.1
261
            expected_decoding = "<s> 7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"  # fmt: skip
262
            decoding = processor.decode(input_processor.input_ids[1].tolist())
263
            self.assertSequenceEqual(decoding, expected_decoding)
264

265
    @slow
266
    def test_processor_case_2(self):
267
        # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
268

269
        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
270
        tokenizers = self.get_tokenizers
271
        images = self.get_images
272

273
        for tokenizer in tokenizers:
274
            processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
275

276
            # not batched
277
            words = ["hello", "world"]
278
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
279
            input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")
280

281
            # verify keys
282
            expected_keys = ["input_ids", "bbox", "attention_mask", "image"]
283
            actual_keys = list(input_processor.keys())
284
            for key in expected_keys:
285
                self.assertIn(key, actual_keys)
286

287
            # verify input_ids
288
            expected_decoding = "<s> hello world</s>"
289
            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
290
            self.assertSequenceEqual(decoding, expected_decoding)
291

292
            # batched
293
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
294
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
295
            input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")
296

297
            # verify keys
298
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
299
            actual_keys = sorted(input_processor.keys())
300
            self.assertListEqual(actual_keys, expected_keys)
301

302
            # verify input_ids
303
            expected_decoding = "<s> hello world</s><pad><pad>"
304
            decoding = processor.decode(input_processor.input_ids[0].tolist())
305
            self.assertSequenceEqual(decoding, expected_decoding)
306

307
            # verify bbox
308
            expected_bbox = [
309
                [0, 0, 0, 0],
310
                [3, 2, 5, 1],
311
                [6, 7, 4, 2],
312
                [3, 9, 2, 4],
313
                [1, 1, 2, 3],
314
                [1, 1, 2, 3],
315
                [1000, 1000, 1000, 1000],
316
            ]
317
            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
318

319
    @slow
320
    def test_processor_case_3(self):
321
        # case 3: token classification (training), apply_ocr=False
322

323
        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
324
        tokenizers = self.get_tokenizers
325
        images = self.get_images
326

327
        for tokenizer in tokenizers:
328
            processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
329

330
            # not batched
331
            words = ["weirdly", "world"]
332
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
333
            word_labels = [1, 2]
334
            input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
335

336
            # verify keys
337
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"]
338
            actual_keys = sorted(input_processor.keys())
339
            self.assertListEqual(actual_keys, expected_keys)
340

341
            # verify input_ids
342
            expected_decoding = "<s> weirdly world</s>"
343
            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
344
            self.assertSequenceEqual(decoding, expected_decoding)
345

346
            # verify labels
347
            expected_labels = [-100, 1, -100, 2, -100]
348
            self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels)
349

350
            # batched
351
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
352
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
353
            word_labels = [[1, 2], [6, 3, 10, 2]]
354
            input_processor = processor(
355
                images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt"
356
            )
357

358
            # verify keys
359
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"]
360
            actual_keys = sorted(input_processor.keys())
361
            self.assertListEqual(actual_keys, expected_keys)
362

363
            # verify input_ids
364
            expected_decoding = "<s> my name is niels</s>"
365
            decoding = processor.decode(input_processor.input_ids[1].tolist())
366
            self.assertSequenceEqual(decoding, expected_decoding)
367

368
            # verify bbox
369
            expected_bbox = [
370
                [0, 0, 0, 0],
371
                [3, 2, 5, 1],
372
                [6, 7, 4, 2],
373
                [3, 9, 2, 4],
374
                [1, 1, 2, 3],
375
                [1, 1, 2, 3],
376
                [1000, 1000, 1000, 1000],
377
            ]
378
            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
379

380
            # verify labels
381
            expected_labels = [-100, 6, 3, 10, 2, -100, -100]
382
            self.assertListEqual(input_processor.labels[1].tolist(), expected_labels)
383

384
    @slow
385
    def test_processor_case_4(self):
386
        # case 4: visual question answering (inference), apply_ocr=True
387

388
        image_processor = LayoutLMv2ImageProcessor()
389
        tokenizers = self.get_tokenizers
390
        images = self.get_images
391

392
        for tokenizer in tokenizers:
393
            processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
394

395
            # not batched
396
            question = "What's his name?"
397
            input_processor = processor(images[0], question, return_tensors="pt")
398

399
            # verify keys
400
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
401
            actual_keys = sorted(input_processor.keys())
402
            self.assertListEqual(actual_keys, expected_keys)
403

404
            # verify input_ids
405
            # this was obtained with Tesseract 4.1.1
406
            expected_decoding = "<s> What's his name?</s></s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>"  # fmt: skip
407
            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
408
            self.assertSequenceEqual(decoding, expected_decoding)
409

410
            # batched
411
            questions = ["How old is he?", "what's the time"]
412
            input_processor = processor(
413
                images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt"
414
            )
415

416
            # verify keys
417
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
418
            actual_keys = sorted(input_processor.keys())
419
            self.assertListEqual(actual_keys, expected_keys)
420

421
            # verify input_ids
422
            # this was obtained with Tesseract 4.1.1
423
            expected_decoding = "<s> what's the time</s></s> 7 ITC Limited REPORT AND ACCOUNTS 2013</s>"
424
            decoding = processor.decode(input_processor.input_ids[1].tolist())
425
            self.assertSequenceEqual(decoding, expected_decoding)
426

427
            # verify bbox
428
            expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [1000, 1000, 1000, 1000]]  # fmt: skip
429
            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
430

431
    @slow
432
    def test_processor_case_5(self):
433
        # case 5: visual question answering (inference), apply_ocr=False
434

435
        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
436
        tokenizers = self.get_tokenizers
437
        images = self.get_images
438

439
        for tokenizer in tokenizers:
440
            processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
441

442
            # not batched
443
            question = "What's his name?"
444
            words = ["hello", "world"]
445
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
446
            input_processor = processor(images[0], question, words, boxes, return_tensors="pt")
447

448
            # verify keys
449
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
450
            actual_keys = sorted(input_processor.keys())
451
            self.assertListEqual(actual_keys, expected_keys)
452

453
            # verify input_ids
454
            expected_decoding = "<s> What's his name?</s></s> hello world</s>"
455
            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
456
            self.assertSequenceEqual(decoding, expected_decoding)
457

458
            # batched
459
            questions = ["How old is he?", "what's the time"]
460
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
461
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
462
            input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt")
463

464
            # verify keys
465
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
466
            actual_keys = sorted(input_processor.keys())
467
            self.assertListEqual(actual_keys, expected_keys)
468

469
            # verify input_ids
470
            expected_decoding = "<s> How old is he?</s></s> hello world</s><pad><pad>"
471
            decoding = processor.decode(input_processor.input_ids[0].tolist())
472
            self.assertSequenceEqual(decoding, expected_decoding)
473

474
            expected_decoding = "<s> what's the time</s></s> my name is niels</s>"
475
            decoding = processor.decode(input_processor.input_ids[1].tolist())
476
            self.assertSequenceEqual(decoding, expected_decoding)
477

478
            # verify bbox
479
            expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
480
            self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox)
481
transformers

Использование cookies