# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shutil
import sys
import tempfile
import unittest
from contextlib import contextmanager
from pathlib import Path


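# `check_copies.py` lives in the repo's `utils` folder, which is not an importable
# package, so locate the repo root relative to this file and add `utils` to the path.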
git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
sys.path.append(os.path.join(git_repo_path, "utils"))

import check_copies  # noqa: E402
from check_copies import convert_to_localized_md, find_code_in_transformers, is_copy_consistent  # noqa: E402


# This is the reference code that will be used in the tests.
# If BertLMPredictionHead is changed in modeling_bert.py, this code needs to be manually updated.
REFERENCE_CODE = """    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states
"""

MOCK_BERT_CODE = """from ...modeling_utils import PreTrainedModel

def bert_function(x):
    return x


class BertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()


class BertModel(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__()
        self.bert = BertEncoder(config)

    @add_docstring(BERT_DOCSTRING)
    def forward(self, x):
        return self.bert(x)
"""
MOCK_BERT_COPY_CODE = """from ...modeling_utils import PreTrainedModel

# Copied from transformers.models.bert.modeling_bert.bert_function
def bert_copy_function(x):
    return x


# Copied from transformers.models.bert.modeling_bert.BertAttention
class BertCopyAttention(nn.Module):
    def __init__(self, config):
        super().__init__()


# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->BertCopy all-casing
class BertCopyModel(BertCopyPreTrainedModel):
    def __init__(self, config):
        super().__init__()
        self.bertcopy = BertCopyEncoder(config)

    @add_docstring(BERTCOPY_DOCSTRING)
    def forward(self, x):
        return self.bertcopy(x)
"""
MOCK_DUMMY_BERT_CODE_MATCH = """
class BertDummyModel:
    attr_1 = 1
    attr_2 = 2

    def __init__(self, a=1, b=2):
        self.a = a
        self.b = b

    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
    def forward(self, c):
        return 1

    def existing_common(self, c):
        return 4

    def existing_diff_to_be_ignored(self, c):
        return 9
"""


MOCK_DUMMY_ROBERTA_CODE_MATCH = """
# Copied from transformers.models.dummy_bert_match.modeling_dummy_bert_match.BertDummyModel with BertDummy->RobertaBertDummy
class RobertaBertDummyModel:

    attr_1 = 1
    attr_2 = 2

    def __init__(self, a=1, b=2):
        self.a = a
        self.b = b

    # Ignore copy
    def only_in_roberta_to_be_ignored(self, c):
        return 3

    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
    def forward(self, c):
        return 1

    def existing_common(self, c):
        return 4

    # Ignore copy
    def existing_diff_to_be_ignored(self, c):
        return 6
"""


MOCK_DUMMY_BERT_CODE_NO_MATCH = """
class BertDummyModel:
    attr_1 = 1
    attr_2 = 2

    def __init__(self, a=1, b=2):
        self.a = a
        self.b = b

    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
    def forward(self, c):
        return 1

    def only_in_bert(self, c):
        return 7

    def existing_common(self, c):
        return 4

    def existing_diff_not_ignored(self, c):
        return 8

    def existing_diff_to_be_ignored(self, c):
        return 9
"""


MOCK_DUMMY_ROBERTA_CODE_NO_MATCH = """
# Copied from transformers.models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel with BertDummy->RobertaBertDummy
class RobertaBertDummyModel:

    attr_1 = 1
    attr_2 = 3

    def __init__(self, a=1, b=2):
        self.a = a
        self.b = b

    # Ignore copy
    def only_in_roberta_to_be_ignored(self, c):
        return 3

    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
    def forward(self, c):
        return 1

    def only_in_roberta_not_ignored(self, c):
        return 2

    def existing_common(self, c):
        return 4

    def existing_diff_not_ignored(self, c):
        return 5

    # Ignore copy
    def existing_diff_to_be_ignored(self, c):
        return 6
"""
EXPECTED_REPLACED_CODE = """
# Copied from transformers.models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel with BertDummy->RobertaBertDummy
class RobertaBertDummyModel:
    attr_1 = 1
    attr_2 = 2

    def __init__(self, a=1, b=2):
        self.a = a
        self.b = b

    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
    def forward(self, c):
        return 1

    def only_in_bert(self, c):
        return 7

    def existing_common(self, c):
        return 4

    def existing_diff_not_ignored(self, c):
        return 8

    # Ignore copy
    def existing_diff_to_be_ignored(self, c):
        return 6

    # Ignore copy
    def only_in_roberta_to_be_ignored(self, c):
        return 3
"""


def replace_in_file(filename, old, new):
    with open(filename, "r", encoding="utf-8") as f:
        content = f.read()

    content = content.replace(old, new)

    with open(filename, "w", encoding="utf-8", newline="\n") as f:
        f.write(content)


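# Writes each mock model as src/transformers/models/<model>/modeling_<model>.py,
# the layout that `find_code_in_transformers` and `is_copy_consistent` expect.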
def create_tmp_repo(tmp_dir):
    """
    Creates a mock repository in a temporary folder for testing.
    """
    tmp_dir = Path(tmp_dir)
    if tmp_dir.exists():
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir(exist_ok=True)

    model_dir = tmp_dir / "src" / "transformers" / "models"
    model_dir.mkdir(parents=True, exist_ok=True)

    models = {
        "bert": MOCK_BERT_CODE,
        "bertcopy": MOCK_BERT_COPY_CODE,
        "dummy_bert_match": MOCK_DUMMY_BERT_CODE_MATCH,
        "dummy_roberta_match": MOCK_DUMMY_ROBERTA_CODE_MATCH,
        "dummy_bert_no_match": MOCK_DUMMY_BERT_CODE_NO_MATCH,
        "dummy_roberta_no_match": MOCK_DUMMY_ROBERTA_CODE_NO_MATCH,
    }
    for model, code in models.items():
        model_subdir = model_dir / model
        model_subdir.mkdir(exist_ok=True)
        with open(model_subdir / f"modeling_{model}.py", "w", encoding="utf-8", newline="\n") as f:
            f.write(code)


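# `check_copies` resolves files through module-level constants, so the tests point
# those constants at the mock repo and restore them afterwards.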
@contextmanager
def patch_transformer_repo_path(new_folder):
    """
    Temporarily patches the variables defined in `check_copies` to use a different location for the repo.
    """
    old_repo_path = check_copies.REPO_PATH
    old_doc_path = check_copies.PATH_TO_DOCS
    old_transformer_path = check_copies.TRANSFORMERS_PATH
    repo_path = Path(new_folder).resolve()
    check_copies.REPO_PATH = str(repo_path)
    check_copies.PATH_TO_DOCS = str(repo_path / "docs" / "source" / "en")
    check_copies.TRANSFORMERS_PATH = str(repo_path / "src" / "transformers")
    try:
        yield
    finally:
        check_copies.REPO_PATH = old_repo_path
        check_copies.PATH_TO_DOCS = old_doc_path
        check_copies.TRANSFORMERS_PATH = old_transformer_path


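# End-to-end tests for `utils/check_copies.py`, run against the mock repo rather
# than the real source tree.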
class CopyCheckTester(unittest.TestCase):
    def test_find_code_in_transformers(self):
        with tempfile.TemporaryDirectory() as tmp_folder:
            create_tmp_repo(tmp_folder)
            with patch_transformer_repo_path(tmp_folder):
                code = find_code_in_transformers("models.bert.modeling_bert.BertAttention")

        reference_code = (
            "class BertAttention(nn.Module):\n    def __init__(self, config):\n        super().__init__()\n"
        )
        self.assertEqual(code, reference_code)

    def test_is_copy_consistent(self):
        path_to_check = ["src", "transformers", "models", "bertcopy", "modeling_bertcopy.py"]
        with tempfile.TemporaryDirectory() as tmp_folder:
            # Base check
            create_tmp_repo(tmp_folder)
            with patch_transformer_repo_path(tmp_folder):
                file_to_check = os.path.join(tmp_folder, *path_to_check)
                diffs = is_copy_consistent(file_to_check)
                self.assertEqual(diffs, [])

            # Base check with an inconsistency
            create_tmp_repo(tmp_folder)
            with patch_transformer_repo_path(tmp_folder):
                file_to_check = os.path.join(tmp_folder, *path_to_check)

                replace_in_file(file_to_check, "self.bertcopy(x)", "self.bert(x)")
                diffs = is_copy_consistent(file_to_check)
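                # Each diff is a pair [qualified name of the source of the copy,
                # line number of the diverging line in the checked file].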
                self.assertEqual(diffs, [["models.bert.modeling_bert.BertModel", 22]])

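                # With `overwrite=True` the checker rewrites the out-of-date copy in
                # place; the file should then match the pristine mock again.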
                _ = is_copy_consistent(file_to_check, overwrite=True)

                with open(file_to_check, "r", encoding="utf-8") as f:
                    self.assertEqual(f.read(), MOCK_BERT_COPY_CODE)

    def test_is_copy_consistent_with_ignored_match(self):
        path_to_check = ["src", "transformers", "models", "dummy_roberta_match", "modeling_dummy_roberta_match.py"]
        with tempfile.TemporaryDirectory() as tmp_folder:
            # Base check
            create_tmp_repo(tmp_folder)
            with patch_transformer_repo_path(tmp_folder):
                file_to_check = os.path.join(tmp_folder, *path_to_check)
                diffs = is_copy_consistent(file_to_check)
                self.assertEqual(diffs, [])

    def test_is_copy_consistent_with_ignored_no_match(self):
        path_to_check = [
            "src",
            "transformers",
            "models",
            "dummy_roberta_no_match",
            "modeling_dummy_roberta_no_match.py",
        ]
        with tempfile.TemporaryDirectory() as tmp_folder:
            # Base check with an inconsistency
            create_tmp_repo(tmp_folder)
            with patch_transformer_repo_path(tmp_folder):
                file_to_check = os.path.join(tmp_folder, *path_to_check)

                diffs = is_copy_consistent(file_to_check)
                # The diff points at line 6 of the checked file, the `attr_2 = 3` line
                # (`MOCK_DUMMY_ROBERTA_CODE_NO_MATCH` has a leading `\n`).
                self.assertEqual(
                    diffs, [["models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel", 6]]
                )

                _ = is_copy_consistent(file_to_check, overwrite=True)

                with open(file_to_check, "r", encoding="utf-8") as f:
                    self.assertEqual(f.read(), EXPECTED_REPLACED_CODE)

    def test_convert_to_localized_md(self):
        localized_readme = check_copies.LOCALIZED_READMES["README_zh-hans.md"]

        md_list = (
            "1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the"
            " Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for"
            " Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong"
            " Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.\n1."
            " **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace),"
            " released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and"
            " lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same"
            " method has been applied to compress GPT2 into"
            " [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into"
            " [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation),"
            " Multilingual BERT into"
            " [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German"
            " version of DistilBERT.\n1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)**"
            " (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders"
            " as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang"
            " Luong, Quoc V. Le, Christopher D. Manning."
        )
        localized_md_list = (
            "1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the"
            " Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of"
            " Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian"
            " Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。\n"
        )
        converted_md_list_sample = (
            "1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the"
            " Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of"
            " Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian"
            " Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。\n1."
            " **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (来自 HuggingFace) 伴随论文"
            " [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and"
            " lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 The same"
            " method has been applied to compress GPT2 into"
            " [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into"
            " [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation),"
            " Multilingual BERT into"
            " [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German"
            " version of DistilBERT.\n1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (来自"
            " Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather"
            " than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le,"
            " Christopher D. Manning 发布。\n"
        )

        num_models_equal, converted_md_list = convert_to_localized_md(
            md_list, localized_md_list, localized_readme["format_model_list"]
        )

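        # The localized list above only contains ALBERT, so the model counts differ
        # and the converted list gains the missing DistilBERT and ELECTRA entries.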
        self.assertFalse(num_models_equal)
        self.assertEqual(converted_md_list, converted_md_list_sample)

        num_models_equal, converted_md_list = convert_to_localized_md(
            md_list, converted_md_list, localized_readme["format_model_list"]
        )

        # After conversion, the localized list should contain as many models as README.md.
        self.assertTrue(num_models_equal)

        link_changed_md_list = (
            "1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the"
            " Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for"
            " Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong"
            " Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut."
        )
        link_unchanged_md_list = (
            "1. **[ALBERT](https://huggingface.co/transformers/main/model_doc/albert.html)** (来自 Google Research and"
            " the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of"
            " Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian"
            " Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。\n"
        )
        converted_md_list_sample = (
            "1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the"
            " Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of"
            " Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian"
            " Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。\n"
        )

        num_models_equal, converted_md_list = convert_to_localized_md(
            link_changed_md_list, link_unchanged_md_list, localized_readme["format_model_list"]
        )

        # The stale link in the localized README should be synchronized with README.md.
        self.assertEqual(converted_md_list, converted_md_list_sample)