transformers

Форк
0
/
test_modeling_tf_marian.py 
318 строк · 11.6 Кб
1
# coding=utf-8
2
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16

17
from __future__ import annotations
18

19
import unittest
20
import warnings
21

22
from transformers import AutoTokenizer, MarianConfig, MarianTokenizer, TranslationPipeline, is_tf_available
23
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
24
from transformers.utils import cached_property
25

26
from ...test_configuration_common import ConfigTester
27
from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
28
from ...test_pipeline_mixin import PipelineTesterMixin
29

30

31
if is_tf_available():
32
    import tensorflow as tf
33

34
    from transformers import TFAutoModelForSeq2SeqLM, TFMarianModel, TFMarianMTModel
35

36

37
@require_tf
class TFMarianModelTester:
    """Builds a tiny MarianConfig plus matching input tensors for fast unit tests.

    The tester is instantiated by ``TFMarianModelTest.setUp`` with the test case
    as ``parent``; assertions are reported through ``self.parent``.
    """

    config_cls = MarianConfig
    config_updates = {}
    hidden_act = "gelu"

    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_labels=False,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=20,
        eos_token_id=2,
        pad_token_id=1,
        bos_token_id=0,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size

        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id

    def prepare_config_and_inputs_for_common(self):
        """Return a small ``(config, inputs_dict)`` pair for the common test mixin."""
        # Encoder inputs: random ids with a guaranteed EOS in the last position.
        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
        input_ids = tf.concat([input_ids, eos_tensor], axis=1)

        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        config = self.config_cls(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            encoder_ffn_dim=self.intermediate_size,
            decoder_ffn_dim=self.intermediate_size,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            # Fix: the original passed the typo'd kwarg `eos_token_ids=[2]`, which
            # MarianConfig does not recognize (it was silently stored as an unused
            # extra attribute). Use the real `eos_token_id` kwarg driven by the
            # tester's own setting (default 2, so behavior is unchanged).
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.pad_token_id,
            **self.config_updates,
        )
        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict

    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
        """Verify the standalone decoder gives the same logits with and without cache.

        Runs one forward pass to obtain ``past_key_values``, then compares the
        incremental (cached) forward for three new tokens against a full
        no-cache forward over the concatenated sequence.
        """
        model = TFMarianModel(config=config).get_decoder()
        input_ids = inputs_dict["input_ids"]

        # Restrict to a single example to keep the cached/uncached comparison cheap.
        input_ids = input_ids[:1, :]
        attention_mask = inputs_dict["attention_mask"][:1, :]
        head_mask = inputs_dict["head_mask"]
        self.batch_size = 1

        # first forward pass
        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

        output, past_key_values = outputs.to_tuple()

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)

        # append to next input_ids and
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)

        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]

        # The cached pass must produce logits only for the new tokens.
        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])

        # select random slice
        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
        output_from_past_slice = output_from_past[:, :, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
142

143

144
def prepare_marian_inputs_dict(
    config,
    input_ids,
    decoder_input_ids,
    attention_mask=None,
    decoder_attention_mask=None,
    head_mask=None,
    decoder_head_mask=None,
    cross_attn_head_mask=None,
):
    """Assemble the full model input dict, deriving any mask not supplied.

    Attention masks default to "everything except padding"; head masks default
    to all-ones (no heads masked out).
    """
    if attention_mask is None:
        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
    if decoder_attention_mask is None:
        # First decoder position is always attended (it holds the start token,
        # which is the pad id for Marian); the rest mask out padding.
        first_position = tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8)
        remaining_positions = tf.cast(
            tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8
        )
        decoder_attention_mask = tf.concat([first_position, remaining_positions], axis=-1)
    if head_mask is None:
        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
    if decoder_head_mask is None:
        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
    if cross_attn_head_mask is None:
        cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
    return {
        "input_ids": input_ids,
        "decoder_input_ids": decoder_input_ids,
        "attention_mask": attention_mask,
        "decoder_attention_mask": decoder_attention_mask,
        "head_mask": head_mask,
        "decoder_head_mask": decoder_head_mask,
        "cross_attn_head_mask": cross_attn_head_mask,
    }
179

180

181
@require_tf
class TFMarianModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """Wires the TF Marian models into the common model/pipeline test mixins."""

    all_model_classes = (TFMarianMTModel, TFMarianModel) if is_tf_available() else ()
    all_generative_model_classes = (TFMarianMTModel,) if is_tf_available() else ()
    pipeline_model_mapping = (
        {
            "conversational": TFMarianMTModel,
            "feature-extraction": TFMarianModel,
            "summarization": TFMarianMTModel,
            "text2text-generation": TFMarianMTModel,
            "translation": TFMarianMTModel,
        }
        if is_tf_available()
        else {}
    )
    is_encoder_decoder = True
    test_pruning = False
    test_onnx = False

    def setUp(self):
        # Shared fixtures consumed by the mixin tests.
        self.model_tester = TFMarianModelTester(self)
        self.config_tester = ConfigTester(self, config_class=MarianConfig)

    def test_config(self):
        """Run the generic MarianConfig sanity checks."""
        self.config_tester.run_common_tests()

    def test_decoder_model_past_large_inputs(self):
        """Check cached decoding matches uncached decoding on the small model."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        self.model_tester.check_decoder_model_past_large_inputs(config, inputs_dict)

    @unittest.skip("Skipping for now, to fix @ArthurZ or @ydshieh")
    def test_pipeline_conversational(self):
        pass
214

215

216
@require_tf
class AbstractMarianIntegrationTest(unittest.TestCase):
    """Shared machinery for Marian translation integration tests.

    Subclasses define the language pair (``src``/``tgt``) together with
    ``src_text`` (inputs) and ``expected_text`` (reference translations).
    """

    maxDiff = 1000  # show more chars for failing integration tests

    @classmethod
    def setUpClass(cls) -> None:
        # Derive the hub checkpoint name from the subclass' language pair.
        # Fix: the original ended with `return cls`, contradicting the `-> None`
        # annotation; unittest ignores setUpClass' return value, so it was dead code.
        cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}"

    @cached_property
    def tokenizer(self) -> MarianTokenizer:
        return AutoTokenizer.from_pretrained(self.model_name)

    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_token_id

    @cached_property
    def model(self):
        """Load the checkpoint once and sanity-check its generation config."""
        # Escalate warnings to errors so deprecations in from_pretrained fail loudly.
        warnings.simplefilter("error")
        model: TFMarianMTModel = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        # Fix: was a bare `assert isinstance(...)`, which is stripped under -O;
        # use the unittest assertion like the checks below.
        self.assertIsInstance(model, TFMarianMTModel)
        c = model.config
        self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]])
        self.assertEqual(c.max_length, 512)
        self.assertEqual(c.decoder_start_token_id, c.pad_token_id)
        return model

    def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
        """Translate ``src_text`` and compare against ``expected_text``."""
        generated_words = self.translate_src_text(**tokenizer_kwargs)
        self.assertListEqual(self.expected_text, generated_words)

    def translate_src_text(self, **tokenizer_kwargs):
        """Tokenize ``src_text``, generate with beam search, and decode to strings."""
        model_inputs = self.tokenizer(self.src_text, **tokenizer_kwargs, padding=True, return_tensors="tf")
        generated_ids = self.model.generate(
            model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128
        )
        generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)
        return generated_words
255

256

257
@require_sentencepiece
@require_tokenizers
@require_tf
class TestMarian_MT_EN(AbstractMarianIntegrationTest):
    """Cover low resource/high perplexity setting. This breaks if pad_token_id logits not set to LARGE_NEGATIVE."""

    # Maltese -> English, a low-resource pair among the Helsinki-NLP OPUS models.
    src = "mt"
    tgt = "en"
    src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."]
    expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."]

    @unittest.skip("Skipping until #12647 is resolved.")
    @slow
    def test_batch_generation_mt_en(self):
        # Delegates to the generate-and-compare helper on the base class.
        self._assert_generated_batch_equal_expected()
272

273

274
@require_sentencepiece
@require_tokenizers
@require_tf
class TestMarian_en_zh(AbstractMarianIntegrationTest):
    """English -> Chinese translation integration test."""

    src = "en"
    tgt = "zh"
    src_text = ["My name is Wolfgang and I live in Berlin"]
    expected_text = ["我叫沃尔夫冈 我住在柏林"]

    @unittest.skip("Skipping until #12647 is resolved.")
    @slow
    def test_batch_generation_en_zh(self):
        # Delegates to the generate-and-compare helper on the base class.
        self._assert_generated_batch_equal_expected()
287

288

289
@require_sentencepiece
@require_tokenizers
@require_tf
class TestMarian_en_ROMANCE(AbstractMarianIntegrationTest):
    """Multilingual on target side: one English->ROMANCE checkpoint serves
    several target languages, selected per sentence by a >>lang<< prefix."""

    src = "en"
    tgt = "ROMANCE"
    src_text = [
        ">>fr<< Don't spend so much time watching TV.",
        ">>pt<< Your message has been sent.",
        ">>es<< He's two years older than me.",
    ]
    expected_text = [
        "Ne passez pas autant de temps à regarder la télé.",
        "A sua mensagem foi enviada.",
        "Es dos años más viejo que yo.",
    ]

    @unittest.skip("Skipping until #12647 is resolved.")
    @slow
    def test_batch_generation_en_ROMANCE_multi(self):
        # Each input routes to its own target language via the prefix token.
        self._assert_generated_batch_equal_expected()

    @unittest.skip("Skipping until #12647 is resolved.")
    @slow
    def test_pipeline(self):
        # Exercise the same checkpoint through the high-level TranslationPipeline
        # and compare its decoded output against the reference translations.
        translator = TranslationPipeline(self.model, self.tokenizer, framework="tf")
        pipeline_outputs = translator(self.src_text)
        translations = [entry["translation_text"] for entry in pipeline_outputs]
        self.assertEqual(self.expected_text, translations)
319

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.