transformers
480 строк · 23.4 Кб
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
15import json
16import os
17import shutil
18import tempfile
19import unittest
20from typing import List
21
22import numpy as np
23
24from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
25from transformers.models.layoutxlm import LayoutXLMTokenizer, LayoutXLMTokenizerFast
26from transformers.testing_utils import (
27require_pytesseract,
28require_sentencepiece,
29require_tokenizers,
30require_torch,
31slow,
32)
33from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
34
35
36if is_pytesseract_available():
37from PIL import Image
38
39from transformers import LayoutLMv2ImageProcessor, LayoutXLMProcessor
40
41
@require_pytesseract
@require_sentencepiece
@require_tokenizers
class LayoutXLMProcessorTest(unittest.TestCase):
    """Save/load and call tests for `LayoutXLMProcessor`.

    Each test works against a temporary directory created in `setUp` that holds an image-processor
    config file; tokenizers are loaded from `self.tokenizer_pretrained_name`.
    """

    tokenizer_class = LayoutXLMTokenizer
    rust_tokenizer_class = LayoutXLMTokenizerFast

    def setUp(self):
        """Create a temp directory containing an image-processor config and record the tokenizer checkpoint."""
        image_processor_map = {
            "do_resize": True,
            "size": 224,
            "apply_ocr": True,
        }

        self.tmpdirname = tempfile.mkdtemp()
        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(image_processor_map) + "\n")

        # taken from `test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_save_pretrained`
        self.tokenizer_pretrained_name = "hf-internal-testing/tiny-random-layoutxlm"

    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
        """Load the slow tokenizer from `self.tokenizer_pretrained_name`, forwarding `kwargs`."""
        return self.tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)

    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
        """Load the fast tokenizer from `self.tokenizer_pretrained_name`, forwarding `kwargs`."""
        return self.rust_tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)

    def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
        """Return `[slow_tokenizer, fast_tokenizer]`, both built with the same `kwargs`."""
        return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]

    def get_image_processor(self, **kwargs):
        """Load a `LayoutLMv2ImageProcessor` from the temp directory, forwarding `kwargs`."""
        return LayoutLMv2ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)

    def tearDown(self):
        """Delete the temp directory created in `setUp`."""
        shutil.rmtree(self.tmpdirname)

    def prepare_image_inputs(self):
        """Return a list holding a single random PIL image.

        One `uint8` array of shape `(3, 30, 400)` is drawn with `np.random.randint(255, ...)`, its
        first axis is moved to the last position and the result is wrapped with `Image.fromarray`.
        """

        image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]

        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]

        return image_inputs

    def test_save_load_pretrained_default(self):
        """A processor saved and reloaded keeps the same vocab and image-processor JSON (slow and fast tokenizer)."""
        image_processor = self.get_image_processor()
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)

            processor.save_pretrained(self.tmpdirname)
            processor = LayoutXLMProcessor.from_pretrained(self.tmpdirname)

            self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
            self.assertIsInstance(processor.tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast))

            self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
            self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)

    def test_save_load_pretrained_additional_features(self):
        """Extra kwargs given to `from_pretrained` must match components built directly with those kwargs."""
        processor = LayoutXLMProcessor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
        processor.save_pretrained(self.tmpdirname)

        # slow tokenizer
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)

        processor = LayoutXLMProcessor.from_pretrained(
            self.tmpdirname,
            use_fast=False,
            bos_token="(BOS)",
            eos_token="(EOS)",
            do_resize=False,
            size=30,
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizer)

        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)

        # fast tokenizer
        tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)

        # NOTE(review): `use_xlm` is forwarded as-is and `use_fast` is not set here, yet a fast tokenizer
        # is asserted below - confirm `use_xlm` is still honoured by `from_pretrained`.
        processor = LayoutXLMProcessor.from_pretrained(
            self.tmpdirname, use_xlm=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizerFast)

        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)

    def test_model_input_names(self):
        """The keys returned by a processor call must equal `processor.model_input_names`, in order."""
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()

        processor = LayoutXLMProcessor(tokenizer=tokenizer, image_processor=image_processor)

        input_str = "lower newer"
        image_input = self.prepare_image_inputs()

        # add extra args
        inputs = processor(text=input_str, images=image_input, return_codebook_pixels=False, return_image_mask=False)

        self.assertListEqual(list(inputs.keys()), processor.model_input_names)

    @slow
    def test_overflowing_tokens(self):
        """With `return_overflowing_tokens=True` there must be as many `image` entries as `input_ids` rows."""
        # In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences).

        from datasets import load_dataset

        # set up
        datasets = load_dataset("nielsr/funsd")
        processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)

        def preprocess_data(examples):
            images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
            words = examples["words"]
            boxes = examples["bboxes"]
            word_labels = examples["ner_tags"]
            encoded_inputs = processor(
                images,
                words,
                boxes=boxes,
                word_labels=word_labels,
                max_length=512,
                padding="max_length",
                truncation=True,
                return_overflowing_tokens=True,
                stride=50,
                return_offsets_mapping=True,
                return_tensors="pt",
            )
            return encoded_inputs

        train_data = preprocess_data(datasets["train"])

        self.assertEqual(len(train_data["image"]), len(train_data["input_ids"]))
189
190
191# different use cases tests
192@require_sentencepiece
193@require_torch
194@require_pytesseract
195class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
196@cached_property
197def get_images(self):
198# we verify our implementation on 2 document images from the DocVQA dataset
199from datasets import load_dataset
200
201ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
202
203image_1 = Image.open(ds[0]["file"]).convert("RGB")
204image_2 = Image.open(ds[1]["file"]).convert("RGB")
205
206return image_1, image_2
207
208@cached_property
209def get_tokenizers(self):
210slow_tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
211fast_tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
212return [slow_tokenizer, fast_tokenizer]
213
214@slow
215def test_processor_case_1(self):
216# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
217
218image_processor = LayoutLMv2ImageProcessor()
219tokenizers = self.get_tokenizers
220images = self.get_images
221
222for tokenizer in tokenizers:
223processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
224
225# not batched
226input_feat_extract = image_processor(images[0], return_tensors="pt")
227input_processor = processor(images[0], return_tensors="pt")
228
229# verify keys
230expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
231actual_keys = sorted(input_processor.keys())
232self.assertListEqual(actual_keys, expected_keys)
233
234# verify image
235self.assertAlmostEqual(
236input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
237)
238
239# verify input_ids
240# this was obtained with Tesseract 4.1.1
241expected_decoding = "<s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # fmt: skip
242decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
243self.assertSequenceEqual(decoding, expected_decoding)
244
245# batched
246input_feat_extract = image_processor(images, return_tensors="pt")
247input_processor = processor(images, padding=True, return_tensors="pt")
248
249# verify keys
250expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
251actual_keys = sorted(input_processor.keys())
252self.assertListEqual(actual_keys, expected_keys)
253
254# verify images
255self.assertAlmostEqual(
256input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
257)
258
259# verify input_ids
260# this was obtained with Tesseract 4.1.1
261expected_decoding = "<s> 7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>" # fmt: skip
262decoding = processor.decode(input_processor.input_ids[1].tolist())
263self.assertSequenceEqual(decoding, expected_decoding)
264
265@slow
266def test_processor_case_2(self):
267# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
268
269image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
270tokenizers = self.get_tokenizers
271images = self.get_images
272
273for tokenizer in tokenizers:
274processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
275
276# not batched
277words = ["hello", "world"]
278boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
279input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")
280
281# verify keys
282expected_keys = ["input_ids", "bbox", "attention_mask", "image"]
283actual_keys = list(input_processor.keys())
284for key in expected_keys:
285self.assertIn(key, actual_keys)
286
287# verify input_ids
288expected_decoding = "<s> hello world</s>"
289decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
290self.assertSequenceEqual(decoding, expected_decoding)
291
292# batched
293words = [["hello", "world"], ["my", "name", "is", "niels"]]
294boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
295input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")
296
297# verify keys
298expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
299actual_keys = sorted(input_processor.keys())
300self.assertListEqual(actual_keys, expected_keys)
301
302# verify input_ids
303expected_decoding = "<s> hello world</s><pad><pad>"
304decoding = processor.decode(input_processor.input_ids[0].tolist())
305self.assertSequenceEqual(decoding, expected_decoding)
306
307# verify bbox
308expected_bbox = [
309[0, 0, 0, 0],
310[3, 2, 5, 1],
311[6, 7, 4, 2],
312[3, 9, 2, 4],
313[1, 1, 2, 3],
314[1, 1, 2, 3],
315[1000, 1000, 1000, 1000],
316]
317self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
318
319@slow
320def test_processor_case_3(self):
321# case 3: token classification (training), apply_ocr=False
322
323image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
324tokenizers = self.get_tokenizers
325images = self.get_images
326
327for tokenizer in tokenizers:
328processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
329
330# not batched
331words = ["weirdly", "world"]
332boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
333word_labels = [1, 2]
334input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
335
336# verify keys
337expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"]
338actual_keys = sorted(input_processor.keys())
339self.assertListEqual(actual_keys, expected_keys)
340
341# verify input_ids
342expected_decoding = "<s> weirdly world</s>"
343decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
344self.assertSequenceEqual(decoding, expected_decoding)
345
346# verify labels
347expected_labels = [-100, 1, -100, 2, -100]
348self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels)
349
350# batched
351words = [["hello", "world"], ["my", "name", "is", "niels"]]
352boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
353word_labels = [[1, 2], [6, 3, 10, 2]]
354input_processor = processor(
355images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt"
356)
357
358# verify keys
359expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"]
360actual_keys = sorted(input_processor.keys())
361self.assertListEqual(actual_keys, expected_keys)
362
363# verify input_ids
364expected_decoding = "<s> my name is niels</s>"
365decoding = processor.decode(input_processor.input_ids[1].tolist())
366self.assertSequenceEqual(decoding, expected_decoding)
367
368# verify bbox
369expected_bbox = [
370[0, 0, 0, 0],
371[3, 2, 5, 1],
372[6, 7, 4, 2],
373[3, 9, 2, 4],
374[1, 1, 2, 3],
375[1, 1, 2, 3],
376[1000, 1000, 1000, 1000],
377]
378self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
379
380# verify labels
381expected_labels = [-100, 6, 3, 10, 2, -100, -100]
382self.assertListEqual(input_processor.labels[1].tolist(), expected_labels)
383
384@slow
385def test_processor_case_4(self):
386# case 4: visual question answering (inference), apply_ocr=True
387
388image_processor = LayoutLMv2ImageProcessor()
389tokenizers = self.get_tokenizers
390images = self.get_images
391
392for tokenizer in tokenizers:
393processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
394
395# not batched
396question = "What's his name?"
397input_processor = processor(images[0], question, return_tensors="pt")
398
399# verify keys
400expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
401actual_keys = sorted(input_processor.keys())
402self.assertListEqual(actual_keys, expected_keys)
403
404# verify input_ids
405# this was obtained with Tesseract 4.1.1
406expected_decoding = "<s> What's his name?</s></s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # fmt: skip
407decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
408self.assertSequenceEqual(decoding, expected_decoding)
409
410# batched
411questions = ["How old is he?", "what's the time"]
412input_processor = processor(
413images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt"
414)
415
416# verify keys
417expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
418actual_keys = sorted(input_processor.keys())
419self.assertListEqual(actual_keys, expected_keys)
420
421# verify input_ids
422# this was obtained with Tesseract 4.1.1
423expected_decoding = "<s> what's the time</s></s> 7 ITC Limited REPORT AND ACCOUNTS 2013</s>"
424decoding = processor.decode(input_processor.input_ids[1].tolist())
425self.assertSequenceEqual(decoding, expected_decoding)
426
427# verify bbox
428expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [1000, 1000, 1000, 1000]] # fmt: skip
429self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
430
431@slow
432def test_processor_case_5(self):
433# case 5: visual question answering (inference), apply_ocr=False
434
435image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
436tokenizers = self.get_tokenizers
437images = self.get_images
438
439for tokenizer in tokenizers:
440processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
441
442# not batched
443question = "What's his name?"
444words = ["hello", "world"]
445boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
446input_processor = processor(images[0], question, words, boxes, return_tensors="pt")
447
448# verify keys
449expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
450actual_keys = sorted(input_processor.keys())
451self.assertListEqual(actual_keys, expected_keys)
452
453# verify input_ids
454expected_decoding = "<s> What's his name?</s></s> hello world</s>"
455decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
456self.assertSequenceEqual(decoding, expected_decoding)
457
458# batched
459questions = ["How old is he?", "what's the time"]
460words = [["hello", "world"], ["my", "name", "is", "niels"]]
461boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
462input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt")
463
464# verify keys
465expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
466actual_keys = sorted(input_processor.keys())
467self.assertListEqual(actual_keys, expected_keys)
468
469# verify input_ids
470expected_decoding = "<s> How old is he?</s></s> hello world</s><pad><pad>"
471decoding = processor.decode(input_processor.input_ids[0].tolist())
472self.assertSequenceEqual(decoding, expected_decoding)
473
474expected_decoding = "<s> what's the time</s></s> my name is niels</s>"
475decoding = processor.decode(input_processor.input_ids[1].tolist())
476self.assertSequenceEqual(decoding, expected_decoding)
477
478# verify bbox
479expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
480self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox)
481