# coding=utf-8
# Copyright 2021, The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch Marian model. """

import tempfile
import unittest

from huggingface_hub.hf_api import list_models

from transformers import MarianConfig, is_torch_available
from transformers.testing_utils import (
    require_sentencepiece,
    require_tokenizers,
    require_torch,
    require_torch_fp16,
    slow,
    torch_device,
)
from transformers.utils import cached_property

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin


if is_torch_available():
    import torch

    from transformers import (
        AutoConfig,
        AutoModelWithLMHead,
        AutoTokenizer,
        MarianModel,
        MarianMTModel,
        TranslationPipeline,
    )
    from transformers.models.marian.convert_marian_to_pytorch import (
        ORG_NAME,
        convert_hf_name_to_opus_name,
        convert_opus_name_to_hf_name,
    )
    from transformers.models.marian.modeling_marian import (
        MarianDecoder,
        MarianEncoder,
        MarianForCausalLM,
        shift_tokens_right,
    )


def prepare_marian_inputs_dict(
    config,
    input_ids,
    decoder_input_ids,
    attention_mask=None,
    decoder_attention_mask=None,
    head_mask=None,
    decoder_head_mask=None,
    cross_attn_head_mask=None,
):
    if attention_mask is None:
        attention_mask = input_ids.ne(config.pad_token_id)
    if decoder_attention_mask is None:
        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
    if head_mask is None:
        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device)
    if decoder_head_mask is None:
        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
    if cross_attn_head_mask is None:
        cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
    return {
        "input_ids": input_ids,
        "decoder_input_ids": decoder_input_ids,
        "attention_mask": attention_mask,
        "decoder_attention_mask": decoder_attention_mask,
        "head_mask": head_mask,
        "decoder_head_mask": decoder_head_mask,
        "cross_attn_head_mask": cross_attn_head_mask,
    }


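# MarianModelTester builds a tiny random-weight config plus matching input tensors
# so the shared model tests run quickly; typical use inside a test body is roughly:
#     config, inputs_dict = self.model_tester.prepare_config_and_inputs()
#     model = MarianModel(config).to(torch_device).eval()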
class MarianModelTester:
    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_labels=False,
        vocab_size=99,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=4,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=20,
        eos_token_id=2,
        pad_token_id=1,
        bos_token_id=0,
        decoder_start_token_id=3,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.decoder_start_token_id = decoder_start_token_id

        # Forcing a certain token to be generated sets all other tokens to -inf.
        # If the token to be generated is already at -inf, this can lead to `nan`
        # values and thus break generation.
        self.forced_bos_token_id = None
        self.forced_eos_token_id = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
        input_ids[:, -1] = self.eos_token_id  # Eos Token

        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        config = self.get_config()
        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict

    def get_config(self):
        return MarianConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            encoder_ffn_dim=self.intermediate_size,
            decoder_ffn_dim=self.intermediate_size,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
            forced_bos_token_id=self.forced_bos_token_id,
            forced_eos_token_id=self.forced_eos_token_id,
        )

    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
        return config, inputs_dict

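    # The next check asserts cache consistency: a decoder fed only the new tokens
    # plus past_key_values must reproduce (up to atol=1e-3) the hidden states of a
    # full forward pass over the concatenated sequence, i.e. roughly:
    #     full = decoder(torch.cat([ids, new], dim=-1)).last_hidden_state
    #     step = decoder(new, past_key_values=past).last_hidden_state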
    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
        model = MarianModel(config=config).get_decoder().to(torch_device).eval()
        input_ids = inputs_dict["input_ids"]
        attention_mask = inputs_dict["attention_mask"]
        head_mask = inputs_dict["head_mask"]

        # first forward pass
        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

        output, past_key_values = outputs.to_tuple()

        # create hypothetical multiple next tokens and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
        next_attn_mask = ids_tensor((self.batch_size, 3), 2)

        # append to next input_ids and next attention mask
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)

        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
            "last_hidden_state"
        ]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

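    # The standalone check saves the encoder and decoder halves separately and
    # reloads them via MarianEncoder/MarianDecoder.from_pretrained, asserting
    # that each reloaded half reproduces the full model's hidden states.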
    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
        model = MarianModel(config=config).to(torch_device).eval()
        outputs = model(**inputs_dict)

        encoder_last_hidden_state = outputs.encoder_last_hidden_state
        last_hidden_state = outputs.last_hidden_state

        with tempfile.TemporaryDirectory() as tmpdirname:
            encoder = model.get_encoder()
            encoder.save_pretrained(tmpdirname)
            encoder = MarianEncoder.from_pretrained(tmpdirname).to(torch_device)

        encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[
            0
        ]

        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)

        with tempfile.TemporaryDirectory() as tmpdirname:
            decoder = model.get_decoder()
            decoder.save_pretrained(tmpdirname)
            decoder = MarianDecoder.from_pretrained(tmpdirname).to(torch_device)

        last_hidden_state_2 = decoder(
            input_ids=inputs_dict["decoder_input_ids"],
            attention_mask=inputs_dict["decoder_attention_mask"],
            encoder_hidden_states=encoder_last_hidden_state,
            encoder_attention_mask=inputs_dict["attention_mask"],
        )[0]

        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)


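# MarianModelTest plugs the tester above into the shared ModelTesterMixin,
# GenerationTesterMixin and PipelineTesterMixin suites; only Marian-specific
# behaviour (embedding sharing, fp16 generation, decoder resizing, skips) is
# spelled out below.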
@require_torch
class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (MarianModel, MarianMTModel) if is_torch_available() else ()
    all_generative_model_classes = (MarianMTModel,) if is_torch_available() else ()
    pipeline_model_mapping = (
        {
            "conversational": MarianMTModel,
            "feature-extraction": MarianModel,
            "summarization": MarianMTModel,
            "text-generation": MarianForCausalLM,
            "text2text-generation": MarianMTModel,
            "translation": MarianMTModel,
        }
        if is_torch_available()
        else {}
    )
    is_encoder_decoder = True
    fx_compatible = True
    test_pruning = False
    test_missing_keys = False

    def setUp(self):
        self.model_tester = MarianModelTester(self)
        self.config_tester = ConfigTester(self, config_class=MarianConfig)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_save_load_strict(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
        for model_class in self.all_model_classes:
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
            self.assertEqual(info["missing_keys"], [])

    def test_decoder_model_past_with_large_inputs(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)

    def test_encoder_decoder_model_standalone(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
        self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)

    @require_torch_fp16
    def test_generate_fp16(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs()
        input_ids = input_dict["input_ids"]
        attention_mask = input_ids.ne(1).to(torch_device)
        model = MarianMTModel(config).eval().to(torch_device)
        model.half()
        model.generate(input_ids, attention_mask=attention_mask)
        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)

    def test_share_encoder_decoder_embeddings(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs()

        # check that embeddings are shared by default
        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIs(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
            self.assertIs(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)

        # check that embeddings are not shared when config.share_encoder_decoder_embeddings = False
        config.share_encoder_decoder_embeddings = False
        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsNot(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
            self.assertIsNot(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)

        # check that a model with shared embeddings can be saved and loaded with share_encoder_decoder_embeddings = False
        config, _ = self.model_tester.prepare_config_and_inputs()
        for model_class in self.all_model_classes:
            model = model_class(config)
            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                model = model_class.from_pretrained(tmpdirname, share_encoder_decoder_embeddings=False)
                self.assertIsNot(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
                self.assertIsNot(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)

    def test_resize_decoder_token_embeddings(self):
        config, _ = self.model_tester.prepare_config_and_inputs()

        # check that resize_decoder_token_embeddings raises an error when embeddings are shared
        for model_class in self.all_model_classes:
            model = model_class(config)
            with self.assertRaises(ValueError):
                model.resize_decoder_token_embeddings(config.vocab_size + 1)

        # check that decoder embeddings are resized when config.share_encoder_decoder_embeddings = False
        config.share_encoder_decoder_embeddings = False
        for model_class in self.all_model_classes:
            model = model_class(config)
            model.resize_decoder_token_embeddings(config.vocab_size + 1)
            self.assertEqual(model.get_decoder().embed_tokens.weight.shape, (config.vocab_size + 1, config.d_model))

        # check that lm_head is also resized
        config, _ = self.model_tester.prepare_config_and_inputs()
        config.share_encoder_decoder_embeddings = False
        model = MarianMTModel(config)
        model.resize_decoder_token_embeddings(config.vocab_size + 1)
        self.assertEqual(model.lm_head.weight.shape, (config.vocab_size + 1, config.d_model))

    def test_tie_word_embeddings_decoder(self):
        pass

    @unittest.skip("Skipping for now, to fix @ArthurZ or @ydshieh")
    def test_pipeline_conversational(self):
        pass

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing(self):
        pass

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing_use_reentrant(self):
        pass

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass


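# assert_tensors_close is a drop-in replacement for a bare torch.allclose
# assertion: on mismatch it raises an AssertionError that reports either the
# offending values or, for large tensors, the fraction of entries that differ.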
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
    """If tensors have different shapes or values, or a and b are not both tensors, raise a descriptive AssertionError."""
    if a is None and b is None:
        return True
    try:
        if torch.allclose(a, b, atol=atol):
            return True
        raise ValueError("tensors are not close")  # handled below to build a descriptive message
    except Exception:
        pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item()
        if a.numel() > 100:
            msg = f"tensor values are {pct_different:.1%} different."
        else:
            msg = f"{a} != {b}"
        if prefix:
            msg = prefix + ": " + msg
        raise AssertionError(msg)


def _long_tensor(tok_lst):
    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device)


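# ModelManagementTests queries the live Hugging Face Hub, so it only runs under
# the @slow decorator.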
class ModelManagementTests(unittest.TestCase):
    @slow
    @require_torch
    def test_model_names(self):
        model_list = list_models()
        model_ids = [x.modelId for x in model_list if x.modelId.startswith(ORG_NAME)]
        bad_model_ids = [mid for mid in model_ids if "+" in mid]
        self.assertListEqual([], bad_model_ids)
        self.assertGreater(len(model_ids), 500)


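# MarianIntegrationTest is a reusable base class: subclasses override src/tgt
# (and usually src_text/expected_text) to exercise one
# Helsinki-NLP/opus-mt-{src}-{tgt} checkpoint end to end.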
@require_torch
@require_sentencepiece
@require_tokenizers
class MarianIntegrationTest(unittest.TestCase):
    src = "en"
    tgt = "de"
    src_text = [
        "I am a small frog.",
        "Now I can forget the 100 words of german that I know.",
        "Tom asked his teacher for advice.",
        "That's how I would do it.",
        "Tom really admired Mary's courage.",
        "Turn around and close your eyes.",
    ]
    expected_text = [
        "Ich bin ein kleiner Frosch.",
        "Jetzt kann ich die 100 Wörter des Deutschen vergessen, die ich kenne.",
        "Tom bat seinen Lehrer um Rat.",
        "So würde ich das machen.",
        "Tom bewunderte Marias Mut wirklich.",
        "Drehen Sie sich um und schließen Sie die Augen.",
    ]
    # ^^ actual C++ output differs slightly: (1) des Deutschen removed, (2) ""-> "O", (3) tun -> machen

    @classmethod
    def setUpClass(cls) -> None:
        cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}"
        return cls

    @cached_property
    def tokenizer(self):
        return AutoTokenizer.from_pretrained(self.model_name)

    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_token_id

    @cached_property
    def model(self):
        model: MarianMTModel = AutoModelWithLMHead.from_pretrained(self.model_name).to(torch_device)
        c = model.config
        self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]])
        self.assertEqual(c.max_length, 512)
        self.assertEqual(c.decoder_start_token_id, c.pad_token_id)

        if torch_device == "cuda":
            return model.half()
        else:
            return model

    def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
        generated_words = self.translate_src_text(**tokenizer_kwargs)
        self.assertListEqual(self.expected_text, generated_words)

    def translate_src_text(self, **tokenizer_kwargs):
        model_inputs = self.tokenizer(self.src_text, padding=True, return_tensors="pt", **tokenizer_kwargs).to(
            torch_device
        )
        self.assertEqual(self.model.device, model_inputs.input_ids.device)
        generated_ids = self.model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            num_beams=2,
            max_length=128,
            renormalize_logits=True,  # Marian should always renormalize its logits. See #25459
        )
        generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        return generated_words


@require_sentencepiece
@require_tokenizers
class TestMarian_EN_DE_More(MarianIntegrationTest):
    @slow
    def test_forward(self):
        src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."]
        expected_ids = [38, 121, 14, 697, 38848, 0]

        model_inputs = self.tokenizer(src, text_target=tgt, return_tensors="pt").to(torch_device)

        self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist())

        desired_keys = {
            "input_ids",
            "attention_mask",
            "labels",
        }
        self.assertSetEqual(desired_keys, set(model_inputs.keys()))
        model_inputs["decoder_input_ids"] = shift_tokens_right(
            model_inputs.labels, self.tokenizer.pad_token_id, self.model.config.decoder_start_token_id
        )
        model_inputs["return_dict"] = True
        model_inputs["use_cache"] = False
        with torch.no_grad():
            outputs = self.model(**model_inputs)
        max_indices = outputs.logits.argmax(-1)
        self.tokenizer.batch_decode(max_indices)

    def test_unk_support(self):
        t = self.tokenizer
        ids = t(["||"], return_tensors="pt").to(torch_device).input_ids[0].tolist()
        expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id]
        self.assertEqual(expected, ids)

    def test_pad_not_split(self):
        input_ids_w_pad = self.tokenizer(["I am a small frog <pad>"], return_tensors="pt").input_ids[0].tolist()
        expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0]  # pad
        self.assertListEqual(expected_w_pad, input_ids_w_pad)

    @slow
    def test_batch_generation_en_de(self):
        self._assert_generated_batch_equal_expected()

    def test_auto_config(self):
        config = AutoConfig.from_pretrained(self.model_name)
        self.assertIsInstance(config, MarianConfig)


@require_sentencepiece
@require_tokenizers
class TestMarian_EN_FR(MarianIntegrationTest):
    src = "en"
    tgt = "fr"
    src_text = [
        "I am a small frog.",
        "Now I can forget the 100 words of german that I know.",
    ]
    expected_text = [
        "Je suis une petite grenouille.",
        "Maintenant, je peux oublier les 100 mots d'allemand que je connais.",
    ]

    @slow
    def test_batch_generation_en_fr(self):
        self._assert_generated_batch_equal_expected()


@require_sentencepiece
@require_tokenizers
class TestMarian_FR_EN(MarianIntegrationTest):
    src = "fr"
    tgt = "en"
    src_text = [
        "Donnez moi le micro.",
        "Tom et Mary étaient assis à une table.",  # Accents
    ]
    expected_text = [
        "Give me the microphone.",
        "Tom and Mary were sitting at a table.",
    ]

    @slow
    def test_batch_generation_fr_en(self):
        self._assert_generated_batch_equal_expected()


@require_sentencepiece
@require_tokenizers
class TestMarian_RU_FR(MarianIntegrationTest):
    src = "ru"
    tgt = "fr"
    src_text = ["Он показал мне рукопись своей новой пьесы."]
    expected_text = ["Il m'a montré le manuscrit de sa nouvelle pièce."]

    @slow
    def test_batch_generation_ru_fr(self):
        self._assert_generated_batch_equal_expected()


@require_sentencepiece
@require_tokenizers
class TestMarian_MT_EN(MarianIntegrationTest):
    """Cover the low-resource/high-perplexity setting. This breaks without adjust_logits_generation overridden."""

    src = "mt"
    tgt = "en"
    src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."]
    expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."]

    @slow
    def test_batch_generation_mt_en(self):
        self._assert_generated_batch_equal_expected()


@require_sentencepiece
@require_tokenizers
class TestMarian_en_zh(MarianIntegrationTest):
    src = "en"
    tgt = "zh"
    src_text = ["My name is Wolfgang and I live in Berlin"]
    expected_text = ["我叫沃尔夫冈 我住在柏林"]

    @slow
    def test_batch_generation_eng_zho(self):
        self._assert_generated_batch_equal_expected()


@require_sentencepiece
@require_tokenizers
class TestMarian_en_ROMANCE(MarianIntegrationTest):
    """Multilingual on target side."""

    src = "en"
    tgt = "ROMANCE"
    src_text = [
        ">>fr<< Don't spend so much time watching TV.",
        ">>pt<< Your message has been sent.",
        ">>es<< He's two years older than me.",
    ]
    expected_text = [
        "Ne passez pas autant de temps à regarder la télé.",
        "A sua mensagem foi enviada.",
        "Es dos años más viejo que yo.",
    ]

    @slow
    def test_batch_generation_en_ROMANCE_multi(self):
        self._assert_generated_batch_equal_expected()

    @slow
    @require_torch
    def test_pipeline(self):
        pipeline = TranslationPipeline(self.model, self.tokenizer, framework="pt", device=torch_device)
        output = pipeline(self.src_text)
        self.assertEqual(self.expected_text, [x["translation_text"] for x in output])


@require_sentencepiece
@require_tokenizers
class TestMarian_FI_EN_V2(MarianIntegrationTest):
    src = "fi"
    tgt = "en"
    src_text = [
        "minä tykkään kirjojen lukemisesta",
        "Pidän jalkapallon katsomisesta",
    ]
    expected_text = ["I like to read books", "I like watching football"]

    @classmethod
    def setUpClass(cls) -> None:
        cls.model_name = "hf-internal-testing/test-opus-tatoeba-fi-en-v2"
        return cls

    @slow
    def test_batch_generation_fi_en(self):
        self._assert_generated_batch_equal_expected()


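# TestConversionUtils covers the OPUS <-> HF checkpoint-name mapping used by
# convert_marian_to_pytorch: "+"-joined multilingual groups are collapsed to a
# group code (the large zh group becomes "ZH") or have "+" replaced by "_" so
# the result is a valid HF repo name, and the mapping is undone on the way back.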
@require_torch
class TestConversionUtils(unittest.TestCase):
    def test_renaming_multilingual(self):
        old_names = [
            "opus-mt-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi",
            "opus-mt-cmn+cn-fi",  # no group
            "opus-mt-en-de",  # standard name
            "opus-mt-en-de",  # standard name
        ]
        expected = ["opus-mt-ZH-fi", "opus-mt-cmn_cn-fi", "opus-mt-en-de", "opus-mt-en-de"]
        self.assertListEqual(expected, [convert_opus_name_to_hf_name(x) for x in old_names])

    def test_undoing_renaming(self):
        hf_names = ["opus-mt-ZH-fi", "opus-mt-cmn_cn-fi", "opus-mt-en-de", "opus-mt-en-de"]
        converted_opus_names = [convert_hf_name_to_opus_name(x) for x in hf_names]
        expected_opus_names = [
            "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi",
            "cmn+cn-fi",
            "en-de",  # standard name
            "en-de",
        ]
        self.assertListEqual(expected_opus_names, converted_opus_names)


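# MarianStandaloneDecoderModelTester mirrors MarianModelTester but targets the
# decoder in isolation (is_encoder_decoder=False), so MarianDecoder and
# MarianForCausalLM can be exercised as decoder-only models.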
class MarianStandaloneDecoderModelTester:
    def __init__(
        self,
        parent,
        vocab_size=99,
        batch_size=13,
        d_model=16,
        decoder_seq_length=7,
        is_training=True,
        is_decoder=True,
        use_attention_mask=True,
        use_cache=False,
        use_labels=True,
        decoder_start_token_id=2,
        decoder_ffn_dim=32,
        decoder_layers=2,
        encoder_attention_heads=4,
        decoder_attention_heads=4,
        max_position_embeddings=30,
        is_encoder_decoder=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.decoder_seq_length = decoder_seq_length
        # For common tests
        self.seq_length = self.decoder_seq_length
        self.is_training = is_training
        self.use_attention_mask = use_attention_mask
        self.use_labels = use_labels

        self.vocab_size = vocab_size
        self.d_model = d_model
        self.hidden_size = d_model
        self.num_hidden_layers = decoder_layers
        self.decoder_layers = decoder_layers
        self.decoder_ffn_dim = decoder_ffn_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_attention_heads = decoder_attention_heads
        self.num_attention_heads = decoder_attention_heads
        self.eos_token_id = eos_token_id
        self.bos_token_id = bos_token_id
        self.pad_token_id = pad_token_id
        self.decoder_start_token_id = decoder_start_token_id
        self.use_cache = use_cache
        self.max_position_embeddings = max_position_embeddings
        self.is_encoder_decoder = is_encoder_decoder

        self.scope = None
        self.decoder_key_length = decoder_seq_length
        self.base_model_out_len = 2
        self.decoder_attention_idx = 1

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        attention_mask = None
        if self.use_attention_mask:
            attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)

        lm_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        config = MarianConfig(
            vocab_size=self.vocab_size,
            d_model=self.d_model,
            decoder_layers=self.decoder_layers,
            decoder_ffn_dim=self.decoder_ffn_dim,
            encoder_attention_heads=self.encoder_attention_heads,
            decoder_attention_heads=self.decoder_attention_heads,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            use_cache=self.use_cache,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
            max_position_embeddings=self.max_position_embeddings,
            is_encoder_decoder=self.is_encoder_decoder,
        )

        return (
            config,
            input_ids,
            attention_mask,
            lm_labels,
        )

    def create_and_check_decoder_model_past(
        self,
        config,
        input_ids,
        attention_mask,
        lm_labels,
    ):
        config.use_cache = True
        model = MarianDecoder(config=config).to(torch_device).eval()
        # first forward pass
        outputs = model(input_ids, use_cache=True)
        outputs_use_cache_conf = model(input_ids)
        outputs_no_past = model(input_ids, use_cache=False)

        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)

        past_key_values = outputs["past_key_values"]

        # create a hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # append to next input_ids
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)

        output_from_no_past = model(next_input_ids)["last_hidden_state"]
        output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)

    def create_and_check_decoder_model_attention_mask_past(
        self,
        config,
        input_ids,
        attention_mask,
        lm_labels,
    ):
        model = MarianDecoder(config=config).to(torch_device).eval()

        # create attention mask
        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

        half_seq_length = input_ids.shape[-1] // 2
        attn_mask[:, half_seq_length:] = 0

        # first forward pass
        past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"]

        # create a hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice of input_ids
        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens

        # append to next input_ids and attn_mask
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        attn_mask = torch.cat(
            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
            dim=1,
        )

        # get two different outputs
        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
        output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
            "last_hidden_state"
        ]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            attention_mask,
            lm_labels,
        ) = config_and_inputs

        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict


@require_torch
class MarianStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    all_model_classes = (MarianDecoder, MarianForCausalLM) if is_torch_available() else ()
    all_generative_model_classes = (MarianForCausalLM,) if is_torch_available() else ()
    test_pruning = False
    is_encoder_decoder = False

    def setUp(
        self,
    ):
        self.model_tester = MarianStandaloneDecoderModelTester(self, is_training=False)
        self.config_tester = ConfigTester(self, config_class=MarianConfig)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_decoder_model_past(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)

    def test_decoder_model_attn_mask_past(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)

    def test_retain_grad_hidden_states_attentions(self):
        # decoder cannot keep gradients
        return

    @unittest.skip("The model doesn't support left padding")  # and it's not used enough to be worth fixing :)
    def test_left_padding_compatibility(self):
        pass