# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch ViTMAE model. """


import math
import tempfile
import unittest

import numpy as np

from transformers import ViTMAEConfig
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin


if is_torch_available():
    import torch
    from torch import nn

    from transformers import ViTMAEForPreTraining, ViTMAEModel
    from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST


if is_vision_available():
    from PIL import Image

    from transformers import ViTImageProcessor


class ViTMAEModelTester:
    def __init__(
        self,
        parent,
        batch_size=13,
        image_size=30,
        patch_size=2,
        num_channels=3,
        is_training=True,
        use_labels=True,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        type_sequence_label_size=10,
        initializer_range=0.02,
        num_labels=3,
        mask_ratio=0.6,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.is_training = is_training
        self.use_labels = use_labels
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.mask_ratio = mask_ratio
        self.scope = scope

        # in ViTMAE, the expected sequence length = (num_patches + 1) * (1 - config.mask_ratio), rounded above
        # (we add 1 for the [CLS] token)
        num_patches = (image_size // patch_size) ** 2
        self.seq_length = int(math.ceil((1 - mask_ratio) * (num_patches + 1)))
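        # e.g. with the defaults above: num_patches = (30 // 2) ** 2 = 225, so
        # seq_length = ceil(0.4 * 226) = 91 visible tokens (including [CLS])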

    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)

        config = self.get_config()

        return config, pixel_values, labels

    def get_config(self):
        return ViTMAEConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
            mask_ratio=self.mask_ratio,
            decoder_hidden_size=self.hidden_size,
            decoder_intermediate_size=self.intermediate_size,
            decoder_num_attention_heads=self.num_attention_heads,
            decoder_num_hidden_layers=self.num_hidden_layers,
        )

    def create_and_check_model(self, config, pixel_values, labels):
        model = ViTMAEModel(config=config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

    def create_and_check_for_pretraining(self, config, pixel_values, labels):
        model = ViTMAEForPreTraining(config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)
        num_patches = (self.image_size // self.patch_size) ** 2
        expected_num_channels = self.patch_size**2 * self.num_channels
        self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels))
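        # the decoder reconstructs raw pixel values, so the logits carry
        # patch_size**2 * num_channels predicted values per patch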

        # test greyscale images
        config.num_channels = 1
        model = ViTMAEForPreTraining(config)
        model.to(torch_device)
        model.eval()
        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
        result = model(pixel_values)
        expected_num_channels = self.patch_size**2
        self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, labels = config_and_inputs
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict


@require_torch
class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_common.py, as ViTMAE does not use input_ids, inputs_embeds,
    attention_mask and seq_length.
    """

    all_model_classes = (ViTMAEModel, ViTMAEForPreTraining) if is_torch_available() else ()
    pipeline_model_mapping = {"image-feature-extraction": ViTMAEModel} if is_torch_available() else {}

    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = ViTMAEModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ViTMAEConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="ViTMAE does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, nn.Linear))

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_pretraining(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)

    # overwrite from common since ViTMAEForPreTraining performs random masking; we need to fix the noise
    # used to generate the masks during the test
    def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict):
        # make masks reproducible
        np.random.seed(2)

        num_patches = int((pt_model.config.image_size // pt_model.config.patch_size) ** 2)
        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
        pt_noise = torch.from_numpy(noise)
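        # ViTMAE's random_masking sorts this noise per sample and keeps the patches with the
        # smallest values, so fixing the noise fixes which patches get masked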

        # Add `noise` argument.
        # PT inputs will be prepared in `super().check_pt_tf_models()` with this added `noise` argument
        pt_inputs_dict["noise"] = pt_noise

        super().check_pt_tf_models(tf_model, pt_model, pt_inputs_dict)

    def test_save_load(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            # make random mask reproducible
            torch.manual_seed(2)
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            out_2 = outputs[0].cpu().numpy()
            out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                model = model_class.from_pretrained(tmpdirname)
                model.to(torch_device)
                # make random mask reproducible
                torch.manual_seed(2)
                with torch.no_grad():
                    after_outputs = model(**self._prepare_for_class(inputs_dict, model_class))

                # Make sure we don't have nans
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)

    @unittest.skip(
        reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load
        to get deterministic results."""
    )
    def test_determinism(self):
        pass

    @unittest.skip(
        reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load
        to get deterministic results."""
    )
    def test_save_load_fast_init_from_base(self):
        pass

    @unittest.skip(
        reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load
        to get deterministic results."""
    )
    def test_save_load_fast_init_to_base(self):
        pass

    @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load""")
    def test_model_outputs_equivalence(self):
        pass

    @slow
    def test_model_from_pretrained(self):
        for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = ViTMAEModel.from_pretrained(model_name)
            self.assertIsNotNone(model)


# We will verify our results on an image of cute cats
def prepare_img():
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    return image


@require_torch
@require_vision
class ViTMAEModelIntegrationTest(unittest.TestCase):
    @cached_property
    def default_image_processor(self):
        return ViTImageProcessor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None

    @slow
    def test_inference_for_pretraining(self):
        # make random mask reproducible across the PT and TF model
        np.random.seed(2)

        model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()
        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)

        # prepare a noise vector that will also be used for testing the TF model
        # (this way we can ensure that the PT and TF models operate on the same inputs)
        vit_mae_config = ViTMAEConfig()
        num_patches = int((vit_mae_config.image_size // vit_mae_config.patch_size) ** 2)
        noise = np.random.uniform(size=(1, num_patches))

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs, noise=torch.from_numpy(noise).to(device=torch_device))

        # verify the logits
        expected_shape = torch.Size((1, 196, 768))
        self.assertEqual(outputs.logits.shape, expected_shape)
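        # (196 = (224 // 16) ** 2 patches and 768 = 16 ** 2 * 3 pixel values per patch,
        # following the default ViTMAEConfig image_size, patch_size and num_channels)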

        expected_slice = torch.tensor(
            [[-0.0548, -1.7023, -0.9325], [0.3721, -0.5670, -0.2233], [0.8235, -1.3878, -0.3524]]
        )

        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice.to(torch_device), atol=1e-4))
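

# Note: to run this suite locally (path assumed to match the transformers repo layout):
#   RUN_SLOW=1 python -m pytest tests/models/vit_mae/test_modeling_vit_mae.py
# the @slow-decorated tests above are skipped unless RUN_SLOW=1 is set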