# optimum-intel — OpenVINO training examples test module
1# Copyright 2023 The HuggingFace Team. All rights reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15import os
16import subprocess
17import sys
18import tempfile
19import unittest
20from dataclasses import dataclass
21from pathlib import Path
22from typing import List, Union
23
24import torch
25import torch.cuda
26from parameterized import parameterized
27
28from optimum.intel.openvino.utils import OV_XML_FILE_NAME
29
30
# Repository root: two directory levels up from this test file.
PROJECT_ROOT = Path(__file__).parents[2]
# Location of the OpenVINO example scripts exercised by these tests.
OPENVINO_EXAMPLES_PATH = PROJECT_ROOT / "examples" / "openvino"
# Name of the environment variable that controls CUDA device visibility
# for the subprocesses launched below.
CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
34
35
@dataclass
class TrainingExampleDescriptor:
    """Description of one training example script invocation.

    Attributes:
        cwd: Working directory the example script is launched from.
        filename: Name of the python script to execute.
        args: Command-line arguments handed to the script.
        timeout: Seconds to wait for the launched process to finish.
    """

    cwd: Union[Path, str]
    filename: str
    args: List[str]
    timeout: int

    def get_args_with_output_dir(self, output_dir: Union[Path, str]):
        """Return a copy of ``args`` with ``--output_dir`` forced to *output_dir*.

        Any pre-existing ``--output_dir <value>`` pair is removed first so the
        flag appears exactly once, at the end of the argument list.
        """
        flag = "--output_dir"
        new_args = list(self.args)
        if flag in new_args:
            position = new_args.index(flag)
            # Drop the old flag together with the value that follows it.
            del new_args[position : position + 2]
        new_args.extend([flag, str(output_dir)])
        return new_args
50
51
# Registry of training example runs, keyed by "<task>-<compression scheme>".
# Both entries run the same tiny BERT model on GLUE/sst2 for 5 steps; the
# JPQD entry additionally passes a teacher model and an NNCF compression
# config, which is what distinguishes it from the QAT entry.
TRAINING_EXAMPLE_DESCRIPTORS = {
    "text-classification-QAT": TrainingExampleDescriptor(
        cwd=OPENVINO_EXAMPLES_PATH / "text-classification",
        filename="run_glue.py",
        args=[
            "--model_name_or_path",
            "hf-internal-testing/tiny-bert",
            "--task_name",
            "sst2",
            "--do_train",
            "--do_eval",
            "--per_device_train_batch_size",
            "2",
            "--per_device_eval_batch_size",
            "8",
            "--logging_steps",
            "1",
            "--evaluation_strategy",
            "steps",
            "--eval_steps",
            "2",
            "--save_strategy",
            "steps",
            "--save_steps",
            "2",
            "--save_total_limit",
            "1",
            "--max_steps",
            "5",
            # fp16 requires CUDA; these descriptors are only run when GPUs
            # are available (see get_available_cuda_device_ids below).
            "--fp16",
            "--report_to",
            "none",
        ],
        timeout=300,
    ),
    "text-classification-JPQD": TrainingExampleDescriptor(
        cwd=OPENVINO_EXAMPLES_PATH / "text-classification",
        filename="run_glue.py",
        args=[
            "--model_name_or_path",
            "hf-internal-testing/tiny-bert",
            # Distillation teacher; same tiny model to keep the test fast.
            "--teacher_model_name_or_path",
            "hf-internal-testing/tiny-bert",
            # Path is relative to ``cwd`` (the example's own directory).
            "--nncf_compression_config",
            "./configs/bert-base-jpqd.json",
            "--task_name",
            "sst2",
            "--do_train",
            "--do_eval",
            "--per_device_train_batch_size",
            "2",
            "--per_device_eval_batch_size",
            "8",
            "--logging_steps",
            "1",
            "--evaluation_strategy",
            "steps",
            "--eval_steps",
            "2",
            "--save_strategy",
            "steps",
            "--save_steps",
            "2",
            "--save_total_limit",
            "1",
            "--max_steps",
            "5",
            "--fp16",
            "--report_to",
            "none",
        ],
        timeout=300,
    ),
}
126
127
def get_available_cuda_device_ids() -> List[int]:
    """Return the CUDA device ids usable by the subprocess-launched trainings.

    Reconciles ``torch.cuda.device_count()`` with the ``CUDA_VISIBLE_DEVICES``
    environment variable:

    * variable unset/empty: every device torch sees is available;
    * variable lists exactly as many devices as torch sees: those ids are
      returned;
    * otherwise some listed devices are unavailable, torch's logical ids
      cannot be mapped back to physical ones, and an empty list is returned.

    Returns:
        A list of integer device ids, possibly empty.
    """
    torch_device_count = torch.cuda.device_count()
    # Use the module-level constant instead of re-spelling the literal
    # (os.environ values are already str, so no str() wrapper is needed).
    visible_devices_str = os.environ.get(CUDA_VISIBLE_DEVICES, "")
    if not visible_devices_str:
        return list(range(torch_device_count))
    # Tolerate stray whitespace and empty entries such as a trailing comma;
    # int() itself accepts surrounding spaces.
    device_ids = [int(token) for token in visible_devices_str.split(",") if token.strip()]
    if len(device_ids) != torch_device_count:
        # Cannot decide device ids since some devices in env are unavailable.
        return []
    return device_ids
138
139
class OVTrainingExampleTest(unittest.TestCase):
    """Smoke-tests the OpenVINO training example scripts as real subprocesses.

    Each test launches ``run_glue.py`` (see TRAINING_EXAMPLE_DESCRIPTORS) with
    a different parallelism launcher and verifies the run exits cleanly and
    exports an OpenVINO IR (``OV_XML_FILE_NAME``) into the output directory.
    Tests are skipped when not enough CUDA devices are available.
    """

    def setUp(self) -> None:
        self.available_cuda_device_ids = get_available_cuda_device_ids()
        self.env = os.environ.copy()

    def _run_example(self, desc: TrainingExampleDescriptor, launcher: List[str]) -> None:
        """Launch *desc* behind the given *launcher* argv prefix and verify it.

        Runs the script in a temporary output directory, waits up to
        ``desc.timeout`` seconds, then asserts a zero exit code and the
        presence of the exported OpenVINO IR file.

        Raises:
            subprocess.TimeoutExpired: if the run exceeds ``desc.timeout``
                (the child process is killed and reaped first, so no orphaned
                training process is left behind).
        """
        with tempfile.TemporaryDirectory() as output_dir:
            args = [*launcher, desc.filename, *desc.get_args_with_output_dir(output_dir)]
            proc = subprocess.Popen(
                args=args,
                cwd=desc.cwd,
                env=self.env.copy(),
            )
            try:
                return_code = proc.wait(desc.timeout)
            except subprocess.TimeoutExpired:
                # Without this, a hung training run would keep burning the GPU
                # after the test fails.
                proc.kill()
                proc.wait()
                raise
            self.assertEqual(return_code, 0)
            self.assertTrue(Path(output_dir, OV_XML_FILE_NAME).is_file())

    @parameterized.expand(TRAINING_EXAMPLE_DESCRIPTORS.items())
    def test_single_card_training(self, _, desc: TrainingExampleDescriptor):
        """One GPU, launched via torchrun with a single worker."""
        if len(self.available_cuda_device_ids) < 1:
            self.skipTest("No enough cuda devices.")

        self.env[CUDA_VISIBLE_DEVICES] = str(self.available_cuda_device_ids[0])
        self._run_example(desc, ["torchrun", "--nproc_per_node=1"])

    @parameterized.expand(TRAINING_EXAMPLE_DESCRIPTORS.items())
    def test_data_parallel_training(self, _, desc: TrainingExampleDescriptor):
        """Two GPUs via plain ``python`` (DataParallel inside the script)."""
        if len(self.available_cuda_device_ids) < 2:
            self.skipTest("No enough cuda devices.")

        self.env[CUDA_VISIBLE_DEVICES] = ",".join(map(str, self.available_cuda_device_ids[:2]))
        self._run_example(desc, [sys.executable])

    @parameterized.expand(TRAINING_EXAMPLE_DESCRIPTORS.items())
    def test_distributed_data_parallel_training(self, _, desc: TrainingExampleDescriptor):
        """Two GPUs via torchrun with two workers (DDP)."""
        if len(self.available_cuda_device_ids) < 2:
            self.skipTest("No enough cuda devices.")

        self.env[CUDA_VISIBLE_DEVICES] = ",".join(map(str, self.available_cuda_device_ids[:2]))
        self._run_example(
            desc,
            [
                "torchrun",
                "--rdzv_backend=c10d",
                "--rdzv_endpoint=localhost:0",
                "--nnodes=1",
                "--nproc_per_node=2",
            ],
        )
203