# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch ViTMAE model. """


import math
import tempfile
import unittest

import numpy as np

from transformers import ViTMAEConfig
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin


if is_torch_available():
    import torch
    from torch import nn

    from transformers import ViTMAEForPreTraining, ViTMAEModel
    from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST


if is_vision_available():
    from PIL import Image

    from transformers import ViTImageProcessor


class ViTMAEModelTester:
    def __init__(
        self,
        parent,
        batch_size=13,
        image_size=30,
        patch_size=2,
        num_channels=3,
        is_training=True,
        use_labels=True,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        type_sequence_label_size=10,
        initializer_range=0.02,
        num_labels=3,
        mask_ratio=0.6,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.is_training = is_training
        self.use_labels = use_labels
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.mask_ratio = mask_ratio
        self.scope = scope

        # in ViTMAE, the expected sequence length = (num_patches + 1) * (1 - config.mask_ratio), rounded above
        # (we add 1 for the [CLS] token)
        num_patches = (image_size // patch_size) ** 2
        self.seq_length = int(math.ceil((1 - mask_ratio) * (num_patches + 1)))
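        # e.g. with the defaults above: num_patches = (30 // 2) ** 2 = 225, so
        # seq_length = ceil(0.4 * 226) = 91 visible tokens (including [CLS])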

    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)

        config = self.get_config()

        return config, pixel_values, labels

    def get_config(self):
        return ViTMAEConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
            mask_ratio=self.mask_ratio,
            decoder_hidden_size=self.hidden_size,
            decoder_intermediate_size=self.intermediate_size,
            decoder_num_attention_heads=self.num_attention_heads,
            decoder_num_hidden_layers=self.num_hidden_layers,
        )

    def create_and_check_model(self, config, pixel_values, labels):
        model = ViTMAEModel(config=config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

    def create_and_check_for_pretraining(self, config, pixel_values, labels):
        model = ViTMAEForPreTraining(config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)
        num_patches = (self.image_size // self.patch_size) ** 2
        expected_num_channels = self.patch_size**2 * self.num_channels
        self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels))
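        # the decoder reconstructs raw pixel values, so the logits carry
        # patch_size**2 * num_channels predicted values per patch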

        # test greyscale images
        config.num_channels = 1
        model = ViTMAEForPreTraining(config)
        model.to(torch_device)
        model.eval()
        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
        result = model(pixel_values)
        expected_num_channels = self.patch_size**2
        self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, labels = config_and_inputs
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict


@require_torch
class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_common.py, as ViTMAE does not use input_ids, inputs_embeds,
    attention_mask and seq_length.
    """

    all_model_classes = (ViTMAEModel, ViTMAEForPreTraining) if is_torch_available() else ()
    pipeline_model_mapping = {"image-feature-extraction": ViTMAEModel} if is_torch_available() else {}

    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = ViTMAEModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ViTMAEConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="ViTMAE does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, nn.Linear))

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_pretraining(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)

    # overwrite from common since ViTMAEForPreTraining performs random masking; we need to fix the noise
    # used to generate the masks during the test
    def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict):
        # make masks reproducible
        np.random.seed(2)

        num_patches = int((pt_model.config.image_size // pt_model.config.patch_size) ** 2)
        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
        pt_noise = torch.from_numpy(noise)
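        # ViTMAE's random_masking sorts this noise per sample and keeps the patches with the
        # smallest values, so fixing the noise fixes which patches get masked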

        # Add `noise` argument.
        # PT inputs will be prepared in `super().check_pt_tf_models()` with this added `noise` argument
        pt_inputs_dict["noise"] = pt_noise

        super().check_pt_tf_models(tf_model, pt_model, pt_inputs_dict)

    def test_save_load(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            # make random mask reproducible
            torch.manual_seed(2)
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            out_2 = outputs[0].cpu().numpy()
            out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                model = model_class.from_pretrained(tmpdirname)
                model.to(torch_device)
                # make random mask reproducible
                torch.manual_seed(2)
                with torch.no_grad():
                    after_outputs = model(**self._prepare_for_class(inputs_dict, model_class))

                # Make sure we don't have nans
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)

    @unittest.skip(
        reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load
        to get deterministic results."""
    )
    def test_determinism(self):
        pass

    @unittest.skip(
        reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load
        to get deterministic results."""
    )
    def test_save_load_fast_init_from_base(self):
        pass

    @unittest.skip(
        reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load
        to get deterministic results."""
    )
    def test_save_load_fast_init_to_base(self):
        pass

    @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load""")
    def test_model_outputs_equivalence(self):
        pass

    @slow
    def test_model_from_pretrained(self):
        for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = ViTMAEModel.from_pretrained(model_name)
            self.assertIsNotNone(model)


# We will verify our results on an image of cute cats
def prepare_img():
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    return image


@require_torch
@require_vision
class ViTMAEModelIntegrationTest(unittest.TestCase):
    @cached_property
    def default_image_processor(self):
        return ViTImageProcessor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None

    @slow
    def test_inference_for_pretraining(self):
        # make random mask reproducible across the PT and TF model
        np.random.seed(2)

        model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()
        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)

        # prepare a noise vector that will also be used for testing the TF model
        # (this way we can ensure that the PT and TF models operate on the same inputs)
        vit_mae_config = ViTMAEConfig()
        num_patches = int((vit_mae_config.image_size // vit_mae_config.patch_size) ** 2)
        noise = np.random.uniform(size=(1, num_patches))

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs, noise=torch.from_numpy(noise).to(device=torch_device))

        # verify the logits
        expected_shape = torch.Size((1, 196, 768))
        self.assertEqual(outputs.logits.shape, expected_shape)
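        # (196 = (224 // 16) ** 2 patches and 768 = 16 ** 2 * 3 pixel values per patch,
        # following the default ViTMAEConfig image_size, patch_size and num_channels)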

        expected_slice = torch.tensor(
            [[-0.0548, -1.7023, -0.9325], [0.3721, -0.5670, -0.2233], [0.8235, -1.3878, -0.3524]]
        )

        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice.to(torch_device), atol=1e-4))
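

# Note: to run this suite locally (path assumed to match the transformers repo layout):
#   RUN_SLOW=1 python -m pytest tests/models/vit_mae/test_modeling_vit_mae.py
# the @slow-decorated tests above are skipped unless RUN_SLOW=1 is set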