transformers
671 lines · 25.0 KB
1# coding=utf-8
2# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15""" Testing suite for the TensorFlow SAM model. """
16
17
18from __future__ import annotations
19
20import inspect
21import unittest
22
23import numpy as np
24import requests
25
26from transformers import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig
27from transformers.testing_utils import require_tf, slow
28from transformers.utils import is_tf_available, is_vision_available
29
30from ...test_configuration_common import ConfigTester
31from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor
32from ...test_pipeline_mixin import PipelineTesterMixin
33
34
35if is_tf_available():
36import tensorflow as tf
37
38from transformers import SamProcessor, TFSamModel
39from transformers.modeling_tf_utils import keras
40
41if is_vision_available():
42from PIL import Image
43
44
class TFSamPromptEncoderTester:
    """Builds a tiny `SamPromptEncoderConfig` and dummy point prompts for tests.

    Bug fix: `prepare_config_and_inputs` reads `self.batch_size`, but the
    attribute was never assigned in `__init__`, so calling it raised
    `AttributeError`. A backward-compatible `batch_size` keyword (default 2,
    matching `TFSamModelTester`) is now stored on the instance.
    """

    def __init__(
        self,
        hidden_size=32,
        input_image_size=24,
        patch_size=2,
        mask_input_channels=4,
        num_point_embeddings=4,
        hidden_act="gelu",
        batch_size=2,
    ):
        self.hidden_size = hidden_size
        self.input_image_size = input_image_size
        self.patch_size = patch_size
        self.mask_input_channels = mask_input_channels
        self.num_point_embeddings = num_point_embeddings
        self.hidden_act = hidden_act
        # Used by prepare_config_and_inputs; previously missing (AttributeError).
        self.batch_size = batch_size

    def get_config(self):
        """Return a `SamPromptEncoderConfig` built from the tester's attributes."""
        return SamPromptEncoderConfig(
            image_size=self.input_image_size,
            patch_size=self.patch_size,
            mask_input_channels=self.mask_input_channels,
            hidden_size=self.hidden_size,
            num_point_embeddings=self.num_point_embeddings,
            hidden_act=self.hidden_act,
        )

    def prepare_config_and_inputs(self):
        """Return the config plus random dummy points of shape (batch, 3, 2)."""
        dummy_points = floats_tensor([self.batch_size, 3, 2])
        config = self.get_config()

        return config, dummy_points
77
78
class TFSamMaskDecoderTester:
    """Builds a tiny `SamMaskDecoderConfig` and dummy decoder inputs for tests.

    Bug fix: `prepare_config_and_inputs` reads `self.batch_size`, but the
    attribute was never assigned in `__init__`, so calling it raised
    `AttributeError`. A backward-compatible `batch_size` keyword (default 2,
    matching `TFSamModelTester`) is now stored on the instance.
    """

    def __init__(
        self,
        hidden_size=32,
        hidden_act="relu",
        mlp_dim=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        attention_downsample_rate=2,
        num_multimask_outputs=3,
        iou_head_depth=3,
        iou_head_hidden_dim=32,
        layer_norm_eps=1e-6,
        batch_size=2,
    ):
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_dim = mlp_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_downsample_rate = attention_downsample_rate
        self.num_multimask_outputs = num_multimask_outputs
        self.iou_head_depth = iou_head_depth
        self.iou_head_hidden_dim = iou_head_hidden_dim
        self.layer_norm_eps = layer_norm_eps
        # Used by prepare_config_and_inputs; previously missing (AttributeError).
        self.batch_size = batch_size

    def get_config(self):
        """Return a `SamMaskDecoderConfig` built from the tester's attributes."""
        return SamMaskDecoderConfig(
            hidden_size=self.hidden_size,
            hidden_act=self.hidden_act,
            mlp_dim=self.mlp_dim,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            attention_downsample_rate=self.attention_downsample_rate,
            num_multimask_outputs=self.num_multimask_outputs,
            iou_head_depth=self.iou_head_depth,
            iou_head_hidden_dim=self.iou_head_hidden_dim,
            layer_norm_eps=self.layer_norm_eps,
        )

    def prepare_config_and_inputs(self):
        """Return the config plus a dummy image embedding of shape (batch, hidden)."""
        config = self.get_config()

        dummy_inputs = {
            "image_embedding": floats_tensor([self.batch_size, self.hidden_size]),
        }

        return config, dummy_inputs
126
127
class TFSamModelTester:
    """Builds a small `SamConfig` (vision + prompt-encoder + mask-decoder sub-configs)
    and random pixel inputs so the common TF model tests run quickly on a tiny SAM.
    """

    def __init__(
        self,
        parent,
        hidden_size=36,
        intermediate_size=72,
        projection_dim=62,
        output_channels=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        num_channels=3,
        image_size=24,
        patch_size=2,
        hidden_act="gelu",
        layer_norm_eps=1e-06,
        dropout=0.0,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        qkv_bias=True,
        mlp_ratio=4.0,
        use_abs_pos=True,
        use_rel_pos=True,
        rel_pos_zero_init=False,
        window_size=14,
        global_attn_indexes=[2, 5, 8, 11],  # NOTE(review): mutable default — harmless here (never mutated), but a tuple would be safer
        num_pos_feats=16,
        mlp_dim=None,
        batch_size=2,
    ):
        # `parent` is the unittest.TestCase driving the checks (assertEqual calls below).
        self.parent = parent
        self.image_size = image_size
        self.patch_size = patch_size
        self.output_channels = output_channels
        self.num_channels = num_channels
        self.hidden_size = hidden_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.qkv_bias = qkv_bias
        self.mlp_ratio = mlp_ratio
        self.use_abs_pos = use_abs_pos
        self.use_rel_pos = use_rel_pos
        self.rel_pos_zero_init = rel_pos_zero_init
        self.window_size = window_size
        self.global_attn_indexes = global_attn_indexes
        self.num_pos_feats = num_pos_feats
        self.mlp_dim = mlp_dim
        self.batch_size = batch_size

        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
        num_patches = (image_size // patch_size) ** 2
        self.seq_length = num_patches + 1

        # Sub-testers supply the prompt-encoder and mask-decoder sub-configs for get_config().
        self.prompt_encoder_tester = TFSamPromptEncoderTester()
        self.mask_decoder_tester = TFSamMaskDecoderTester()

    def prepare_config_and_inputs(self):
        """Return a tiny `SamConfig` plus random pixel values of shape (batch, channels, H, W)."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
        config = self.get_config()

        return config, pixel_values

    def get_config(self):
        """Assemble a `SamConfig` from the vision settings plus the two sub-testers' configs."""
        vision_config = SamVisionConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            projection_dim=self.projection_dim,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            dropout=self.dropout,
            attention_dropout=self.attention_dropout,
            initializer_range=self.initializer_range,
            initializer_factor=self.initializer_factor,
            output_channels=self.output_channels,
            qkv_bias=self.qkv_bias,
            mlp_ratio=self.mlp_ratio,
            use_abs_pos=self.use_abs_pos,
            use_rel_pos=self.use_rel_pos,
            rel_pos_zero_init=self.rel_pos_zero_init,
            window_size=self.window_size,
            global_attn_indexes=self.global_attn_indexes,
            num_pos_feats=self.num_pos_feats,
            mlp_dim=self.mlp_dim,
        )

        prompt_encoder_config = self.prompt_encoder_tester.get_config()

        mask_decoder_config = self.mask_decoder_tester.get_config()

        return SamConfig(
            vision_config=vision_config,
            prompt_encoder_config=prompt_encoder_config,
            mask_decoder_config=mask_decoder_config,
        )

    def create_and_check_model(self, config, pixel_values):
        """Forward pass check: IoU scores and mask logits have the expected leading shape.

        The trailing 3 presumably corresponds to the mask decoder's
        `num_multimask_outputs=3` — TODO confirm against TFSamModel.
        """
        model = TFSamModel(config=config)
        result = model(pixel_values)
        self.parent.assertEqual(result.iou_scores.shape, (self.batch_size, 1, 3))
        self.parent.assertEqual(result.pred_masks.shape[:3], (self.batch_size, 1, 3))

    def create_and_check_get_image_features(self, config, pixel_values):
        """Check per-image embeddings: (output_channels, 12, 12); 12 = image_size // patch_size with the defaults (24 // 2)."""
        model = TFSamModel(config=config)
        result = model.get_image_embeddings(pixel_values)
        self.parent.assertEqual(result[0].shape, (self.output_channels, 12, 12))

    def create_and_check_get_image_hidden_states(self, config, pixel_values):
        """Check vision-encoder hidden states in both return_dict modes (same expectations)."""
        model = TFSamModel(config=config)
        result = model.vision_encoder(
            pixel_values,
            output_hidden_states=True,
            return_dict=True,
        )

        # after computing the convolutional features
        expected_hidden_states_shape = (self.batch_size, 12, 12, 36)
        self.parent.assertEqual(len(result[1]), self.num_hidden_layers + 1)
        self.parent.assertEqual(result[1][0].shape, expected_hidden_states_shape)

        result = model.vision_encoder(
            pixel_values,
            output_hidden_states=True,
            return_dict=False,
        )

        # after computing the convolutional features
        expected_hidden_states_shape = (self.batch_size, 12, 12, 36)
        self.parent.assertEqual(len(result[1]), self.num_hidden_layers + 1)
        self.parent.assertEqual(result[1][0].shape, expected_hidden_states_shape)

    def prepare_config_and_inputs_for_common(self):
        """Adapt prepare_config_and_inputs() to the (config, inputs_dict) shape the common tests expect."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values = config_and_inputs
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict
274
275
@require_tf
class TFSamModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """Common-model test suite for `TFSamModel`.

    Several generic checks from ``test_modeling_tf_common.py`` are overridden
    here because SAM's vision encoder consumes ``pixel_values`` rather than
    ``input_ids`` / ``inputs_embeds`` / ``attention_mask``.
    """

    all_model_classes = (TFSamModel,) if is_tf_available() else ()
    pipeline_model_mapping = (
        {"feature-extraction": TFSamModel, "mask-generation": TFSamModel} if is_tf_available() else {}
    )
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False
    test_onnx = False

    # TODO: Fix me @Arthur: `run_batch_test` in `tests/test_pipeline_mixin.py` not working
    def is_pipeline_test_to_skip(
        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
    ):
        # Skip every pipeline test for now (see TODO above).
        return True

    def setUp(self):
        self.model_tester = TFSamModelTester(self)
        self.vision_config_tester = ConfigTester(self, config_class=SamVisionConfig, has_text_modality=False)
        self.prompt_encoder_config_tester = ConfigTester(
            self,
            config_class=SamPromptEncoderConfig,
            has_text_modality=False,
            num_attention_heads=12,
            num_hidden_layers=2,
        )
        self.mask_decoder_config_tester = ConfigTester(
            self, config_class=SamMaskDecoderConfig, has_text_modality=False
        )

    def test_config(self):
        # Run the generic config round-trip tests on each SAM sub-config.
        for config_tester in (
            self.vision_config_tester,
            self.prompt_encoder_config_tester,
            self.mask_decoder_config_tester,
        ):
            config_tester.run_common_tests()

    @unittest.skip(reason="SAM's vision encoder does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            # The input embedding must be a Keras layer; the output embedding,
            # when present, must be a Dense layer.
            self.assertIsInstance(model.get_input_embeddings(), keras.layers.Layer)
            output_embeddings = model.get_output_embeddings()
            self.assertTrue(output_embeddings is None or isinstance(output_embeddings, keras.layers.Dense))

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            call_signature = inspect.signature(model.call)
            # signature.parameters preserves declaration order, so this check is deterministic
            argument_names = list(call_signature.parameters.keys())
            self.assertListEqual(argument_names[:1], ["pixel_values"])

    def test_model(self):
        self.model_tester.create_and_check_model(*self.model_tester.prepare_config_and_inputs())

    def test_get_image_features(self):
        self.model_tester.create_and_check_get_image_features(*self.model_tester.prepare_config_and_inputs())

    def test_image_hidden_states(self):
        self.model_tester.create_and_check_get_image_hidden_states(*self.model_tester.prepare_config_and_inputs())

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        expected_vision_attention_shape = (
            self.model_tester.batch_size * self.model_tester.num_attention_heads,
            196,
            196,
        )
        expected_mask_decoder_attention_shape = (self.model_tester.batch_size, 1, 144, 32)

        for model_class in self.all_model_classes:
            # First pass: request attentions explicitly through the inputs dict.
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)
            model_outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            vision_attn = model_outputs.vision_attentions
            self.assertEqual(len(vision_attn), self.model_tester.num_hidden_layers)

            decoder_attn = model_outputs.mask_decoder_attentions
            self.assertEqual(len(decoder_attn), self.model_tester.mask_decoder_tester.num_hidden_layers)

            # Second pass: check that output_attentions also works via the config.
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            model_outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            vision_attn = model_outputs.vision_attentions
            self.assertEqual(len(vision_attn), self.model_tester.num_hidden_layers)

            decoder_attn = model_outputs.mask_decoder_attentions
            self.assertEqual(len(decoder_attn), self.model_tester.mask_decoder_tester.num_hidden_layers)

            self.assertListEqual(
                list(vision_attn[0].shape[-4:]),
                list(expected_vision_attention_shape),
            )

            self.assertListEqual(
                list(decoder_attn[0].shape[-4:]),
                list(expected_mask_decoder_attention_shape),
            )

    @unittest.skip(reason="Hidden_states is tested in create_and_check_model tests")
    def test_hidden_states_output(self):
        pass

    @slow
    def test_model_from_pretrained(self):
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")  # sam-vit-huge blows out our memory
        self.assertIsNotNone(model)

    def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=5e-4, name="outputs", attributes=None):
        # Use a looser default tolerance (5e-4) than the common tester for PT/TF equivalence.
        super().check_pt_tf_outputs(
            tf_outputs=tf_outputs,
            pt_outputs=pt_outputs,
            model_class=model_class,
            tol=tol,
            name=name,
            attributes=attributes,
        )
417
418
def prepare_image():
    """Download and return the car test image as an RGB `PIL.Image`."""
    img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
    response = requests.get(img_url, stream=True)
    return Image.open(response.raw).convert("RGB")
423
424
def prepare_dog_img():
    """Download and return the dog test image as an RGB `PIL.Image`."""
    img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dog-sam.png"
    response = requests.get(img_url, stream=True)
    return Image.open(response.raw).convert("RGB")
429
430
@require_tf
@slow
class TFSamModelIntegrationTest(unittest.TestCase):
    """Slow integration tests running the pretrained `facebook/sam-vit-base` checkpoint
    on real images with various prompt combinations (points, labels, boxes, batches).

    Expected scores/masks are pinned regression values — presumably taken from the
    reference (PyTorch) SAM implementation; TODO confirm provenance.
    """

    def test_inference_mask_generation_no_point(self):
        """No prompt at all: check the top IoU score and the first mask logits."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()
        inputs = processor(images=raw_image, return_tensors="tf")

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)
        masks = outputs.pred_masks[0, 0, 0, 0, :3]
        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.4515), atol=2e-4))
        self.assertTrue(np.allclose(masks.numpy(), np.array([-4.1807, -3.4949, -3.4483]), atol=1e-2))

    def test_inference_mask_generation_one_point_one_bb(self):
        """One bounding box plus one point prompt on the same image."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()
        input_boxes = [[[650, 900, 1000, 1250]]]
        input_points = [[[820, 1080]]]

        inputs = processor(images=raw_image, input_boxes=input_boxes, input_points=input_points, return_tensors="tf")

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)
        masks = outputs.pred_masks[0, 0, 0, 0, :3]

        self.assertTrue(np.allclose(scores[-1], np.array(0.9566), atol=2e-4))
        self.assertTrue(np.allclose(masks.numpy(), np.array([-12.7657, -12.3683, -12.5985]), atol=2e-2))

    def test_inference_mask_generation_batched_points_batched_images(self):
        """Two images, four point prompts each: all 2x4x3 IoU scores are checked."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()
        input_points = [
            [[[820, 1080]], [[820, 1080]], [[820, 1080]], [[820, 1080]]],
            [[[510, 1080]], [[820, 1080]], [[820, 1080]], [[820, 1080]]],
        ]

        inputs = processor(images=[raw_image, raw_image], input_points=input_points, return_tensors="tf")

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)
        masks = outputs.pred_masks[0, 0, 0, 0, :3]

        EXPECTED_SCORES = np.array(
            [
                [
                    [0.6765, 0.9379, 0.8803],
                    [0.6765, 0.9379, 0.8803],
                    [0.6765, 0.9379, 0.8803],
                    [0.6765, 0.9379, 0.8803],
                ],
                [
                    [0.3317, 0.7264, 0.7646],
                    [0.6765, 0.9379, 0.8803],
                    [0.6765, 0.9379, 0.8803],
                    [0.6765, 0.9379, 0.8803],
                ],
            ]
        )
        EXPECTED_MASKS = np.array([-2.8552, -2.7990, -2.9612])
        self.assertTrue(np.allclose(scores.numpy(), EXPECTED_SCORES, atol=1e-3))
        self.assertTrue(np.allclose(masks.numpy(), EXPECTED_MASKS, atol=3e-2))

    def test_inference_mask_generation_one_point_one_bb_zero(self):
        """Box plus a point carrying a 0 ("background") label."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()
        input_boxes = [[[620, 900, 1000, 1255]]]
        input_points = [[[820, 1080]]]
        labels = [[0]]

        inputs = processor(
            images=raw_image,
            input_boxes=input_boxes,
            input_points=input_points,
            input_labels=labels,
            return_tensors="tf",
        )

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)
        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.7894), atol=1e-4))

    def test_inference_mask_generation_one_point(self):
        """One point prompt; with and without an explicit 1 label the score is identical."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()

        input_points = [[[400, 650]]]
        input_labels = [[1]]

        inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="tf")

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)

        self.assertTrue(np.allclose(scores[-1], np.array(0.9675), atol=1e-4))

        # With no label
        input_points = [[[400, 650]]]

        inputs = processor(images=raw_image, input_points=input_points, return_tensors="tf")

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)

        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.9675), atol=1e-4))

    def test_inference_mask_generation_two_points(self):
        """Two point prompts on one image; labels omitted should give the same score."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
        raw_image = prepare_image()

        input_points = [[[400, 650], [800, 650]]]
        input_labels = [[1, 1]]

        inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="tf")

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)

        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.9762), atol=1e-4))

        # no labels
        inputs = processor(images=raw_image, input_points=input_points, return_tensors="tf")

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)

        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.9762), atol=1e-4))

    def test_inference_mask_generation_two_points_batched(self):
        """Batch of two images with differing numbers of point prompts."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()

        input_points = [[[400, 650], [800, 650]], [[400, 650]]]
        input_labels = [[1, 1], [1]]

        inputs = processor(
            images=[raw_image, raw_image], input_points=input_points, input_labels=input_labels, return_tensors="tf"
        )

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)

        self.assertTrue(np.allclose(scores[0][-1].numpy(), np.array(0.9762), atol=1e-4))
        self.assertTrue(np.allclose(scores[1][-1], np.array(0.9637), atol=1e-4))

    def test_inference_mask_generation_one_box(self):
        """Single bounding-box prompt."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()

        input_boxes = [[[75, 275, 1725, 850]]]

        inputs = processor(images=raw_image, input_boxes=input_boxes, return_tensors="tf")

        outputs = model(**inputs)
        scores = tf.squeeze(outputs.iou_scores)

        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.7937), atol=1e-4))

    def test_inference_mask_generation_batched_image_one_point(self):
        """Batched inference must match the corresponding single-image inference."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()
        raw_dog_image = prepare_dog_img()

        input_points = [[[820, 1080]], [[220, 470]]]

        inputs = processor(images=[raw_image, raw_dog_image], input_points=input_points, return_tensors="tf")

        outputs = model(**inputs)
        scores_batched = tf.squeeze(outputs.iou_scores)

        input_points = [[[220, 470]]]

        inputs = processor(images=raw_dog_image, input_points=input_points, return_tensors="tf")

        outputs = model(**inputs)
        scores_single = tf.squeeze(outputs.iou_scores)
        # Row 1 of the batched scores corresponds to the dog image processed alone.
        self.assertTrue(np.allclose(scores_batched[1, :].numpy(), scores_single.numpy(), atol=1e-4))

    def test_inference_mask_generation_two_points_point_batch(self):
        """Two point prompts batched along the point-batch (second) dimension."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()

        input_points = tf.convert_to_tensor([[[400, 650]], [[220, 470]]])  # fmt: skip

        # Add a leading image-batch dimension: shape becomes (1, 2, 1, 2).
        input_points = tf.expand_dims(input_points, 0)

        inputs = processor(raw_image, input_points=input_points, return_tensors="tf")

        outputs = model(**inputs)

        iou_scores = outputs.iou_scores
        self.assertTrue(iou_scores.shape == (1, 2, 3))
        self.assertTrue(
            np.allclose(
                iou_scores.numpy(),
                np.array([[[0.9105, 0.9825, 0.9675], [0.7646, 0.7943, 0.7774]]]),
                atol=1e-4,
                rtol=1e-4,
            )
        )

    def test_inference_mask_generation_three_boxes_point_batch(self):
        """Three box prompts batched along the point-batch dimension (two identical)."""
        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

        raw_image = prepare_image()

        # fmt: off
        input_boxes = tf.convert_to_tensor([[[620, 900, 1000, 1255]], [[75, 275, 1725, 850]], [[75, 275, 1725, 850]]])
        EXPECTED_IOU = np.array([[[0.9773, 0.9881, 0.9522],
                                  [0.5996, 0.7661, 0.7937],
                                  [0.5996, 0.7661, 0.7937]]])
        # fmt: on
        # Add a leading image-batch dimension: shape becomes (1, 3, 1, 4).
        input_boxes = tf.expand_dims(input_boxes, 0)

        inputs = processor(raw_image, input_boxes=input_boxes, return_tensors="tf")

        outputs = model(**inputs)

        iou_scores = outputs.iou_scores
        self.assertTrue(iou_scores.shape == (1, 3, 3))
        self.assertTrue(np.allclose(iou_scores.numpy(), EXPECTED_IOU, atol=1e-4, rtol=1e-4))
672