# optimum-intel — OpenVINO training examples test module
1# Copyright 2023 The HuggingFace Team. All rights reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15import os
16import subprocess
17import sys
18import tempfile
19import unittest
20from dataclasses import dataclass
21from pathlib import Path
22from typing import List, Union
23
24import torch
25import torch.cuda
26from parameterized import parameterized
27
28from optimum.intel.openvino.utils import OV_XML_FILE_NAME
29
30
# Repository root: two directory levels up from this test file.
PROJECT_ROOT = Path(__file__).parents[2]
# Location of the OpenVINO example scripts exercised by these tests.
OPENVINO_EXAMPLES_PATH = PROJECT_ROOT / "examples" / "openvino"
# Name of the environment variable that controls CUDA device visibility
# for the subprocesses launched below.
CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
34
35
@dataclass
class TrainingExampleDescriptor:
    """Description of one training example script invocation.

    Attributes:
        cwd: Working directory the example script is launched from.
        filename: Name of the python script to execute.
        args: Command-line arguments handed to the script.
        timeout: Seconds to wait for the launched process to finish.
    """

    cwd: Union[Path, str]
    filename: str
    args: List[str]
    timeout: int

    def get_args_with_output_dir(self, output_dir: Union[Path, str]):
        """Return a copy of ``args`` with ``--output_dir`` forced to *output_dir*.

        Any pre-existing ``--output_dir <value>`` pair is removed first so the
        flag appears exactly once, at the end of the argument list.
        """
        flag = "--output_dir"
        new_args = list(self.args)
        if flag in new_args:
            position = new_args.index(flag)
            # Drop the old flag together with the value that follows it.
            del new_args[position : position + 2]
        new_args.extend([flag, str(output_dir)])
        return new_args
50
51
# Registry of training example runs, keyed by "<task>-<compression scheme>".
# Both entries run the same tiny BERT model on GLUE/sst2 for 5 steps; the
# JPQD entry additionally passes a teacher model and an NNCF compression
# config, which is what distinguishes it from the QAT entry.
TRAINING_EXAMPLE_DESCRIPTORS = {
    "text-classification-QAT": TrainingExampleDescriptor(
        cwd=OPENVINO_EXAMPLES_PATH / "text-classification",
        filename="run_glue.py",
        args=[
            "--model_name_or_path",
            "hf-internal-testing/tiny-bert",
            "--task_name",
            "sst2",
            "--do_train",
            "--do_eval",
            "--per_device_train_batch_size",
            "2",
            "--per_device_eval_batch_size",
            "8",
            "--logging_steps",
            "1",
            "--evaluation_strategy",
            "steps",
            "--eval_steps",
            "2",
            "--save_strategy",
            "steps",
            "--save_steps",
            "2",
            "--save_total_limit",
            "1",
            "--max_steps",
            "5",
            # fp16 requires CUDA; these descriptors are only run when GPUs
            # are available (see get_available_cuda_device_ids below).
            "--fp16",
            "--report_to",
            "none",
        ],
        timeout=300,
    ),
    "text-classification-JPQD": TrainingExampleDescriptor(
        cwd=OPENVINO_EXAMPLES_PATH / "text-classification",
        filename="run_glue.py",
        args=[
            "--model_name_or_path",
            "hf-internal-testing/tiny-bert",
            # Distillation teacher; same tiny model to keep the test fast.
            "--teacher_model_name_or_path",
            "hf-internal-testing/tiny-bert",
            # Path is relative to ``cwd`` (the example's own directory).
            "--nncf_compression_config",
            "./configs/bert-base-jpqd.json",
            "--task_name",
            "sst2",
            "--do_train",
            "--do_eval",
            "--per_device_train_batch_size",
            "2",
            "--per_device_eval_batch_size",
            "8",
            "--logging_steps",
            "1",
            "--evaluation_strategy",
            "steps",
            "--eval_steps",
            "2",
            "--save_strategy",
            "steps",
            "--save_steps",
            "2",
            "--save_total_limit",
            "1",
            "--max_steps",
            "5",
            "--fp16",
            "--report_to",
            "none",
        ],
        timeout=300,
    ),
}
126
127
def get_available_cuda_device_ids() -> List[int]:
    """Return the CUDA device ids usable by the subprocess-launched trainings.

    Reconciles ``torch.cuda.device_count()`` with the ``CUDA_VISIBLE_DEVICES``
    environment variable:

    * variable unset/empty: every device torch sees is available;
    * variable lists exactly as many devices as torch sees: those ids are
      returned;
    * otherwise some listed devices are unavailable, torch's logical ids
      cannot be mapped back to physical ones, and an empty list is returned.

    Returns:
        A list of integer device ids, possibly empty.
    """
    torch_device_count = torch.cuda.device_count()
    # Use the module-level constant instead of re-spelling the literal
    # (os.environ values are already str, so no str() wrapper is needed).
    visible_devices_str = os.environ.get(CUDA_VISIBLE_DEVICES, "")
    if not visible_devices_str:
        return list(range(torch_device_count))
    # Tolerate stray whitespace and empty entries such as a trailing comma;
    # int() itself accepts surrounding spaces.
    device_ids = [int(token) for token in visible_devices_str.split(",") if token.strip()]
    if len(device_ids) != torch_device_count:
        # Cannot decide device ids since some devices in env are unavailable.
        return []
    return device_ids
138
139
class OVTrainingExampleTest(unittest.TestCase):
    """Smoke-tests the OpenVINO training example scripts as real subprocesses.

    Each test launches ``run_glue.py`` (see TRAINING_EXAMPLE_DESCRIPTORS) with
    a different parallelism launcher and verifies the run exits cleanly and
    exports an OpenVINO IR (``OV_XML_FILE_NAME``) into the output directory.
    Tests are skipped when not enough CUDA devices are available.
    """

    def setUp(self) -> None:
        self.available_cuda_device_ids = get_available_cuda_device_ids()
        self.env = os.environ.copy()

    def _run_example(self, desc: TrainingExampleDescriptor, launcher: List[str]) -> None:
        """Launch *desc* behind the given *launcher* argv prefix and verify it.

        Runs the script in a temporary output directory, waits up to
        ``desc.timeout`` seconds, then asserts a zero exit code and the
        presence of the exported OpenVINO IR file.

        Raises:
            subprocess.TimeoutExpired: if the run exceeds ``desc.timeout``
                (the child process is killed and reaped first, so no orphaned
                training process is left behind).
        """
        with tempfile.TemporaryDirectory() as output_dir:
            args = [*launcher, desc.filename, *desc.get_args_with_output_dir(output_dir)]
            proc = subprocess.Popen(
                args=args,
                cwd=desc.cwd,
                env=self.env.copy(),
            )
            try:
                return_code = proc.wait(desc.timeout)
            except subprocess.TimeoutExpired:
                # Without this, a hung training run would keep burning the GPU
                # after the test fails.
                proc.kill()
                proc.wait()
                raise
            self.assertEqual(return_code, 0)
            self.assertTrue(Path(output_dir, OV_XML_FILE_NAME).is_file())

    @parameterized.expand(TRAINING_EXAMPLE_DESCRIPTORS.items())
    def test_single_card_training(self, _, desc: TrainingExampleDescriptor):
        """One GPU, launched via torchrun with a single worker."""
        if len(self.available_cuda_device_ids) < 1:
            self.skipTest("No enough cuda devices.")

        self.env[CUDA_VISIBLE_DEVICES] = str(self.available_cuda_device_ids[0])
        self._run_example(desc, ["torchrun", "--nproc_per_node=1"])

    @parameterized.expand(TRAINING_EXAMPLE_DESCRIPTORS.items())
    def test_data_parallel_training(self, _, desc: TrainingExampleDescriptor):
        """Two GPUs via plain ``python`` (DataParallel inside the script)."""
        if len(self.available_cuda_device_ids) < 2:
            self.skipTest("No enough cuda devices.")

        self.env[CUDA_VISIBLE_DEVICES] = ",".join(map(str, self.available_cuda_device_ids[:2]))
        self._run_example(desc, [sys.executable])

    @parameterized.expand(TRAINING_EXAMPLE_DESCRIPTORS.items())
    def test_distributed_data_parallel_training(self, _, desc: TrainingExampleDescriptor):
        """Two GPUs via torchrun with two workers (DDP)."""
        if len(self.available_cuda_device_ids) < 2:
            self.skipTest("No enough cuda devices.")

        self.env[CUDA_VISIBLE_DEVICES] = ",".join(map(str, self.available_cuda_device_ids[:2]))
        self._run_example(
            desc,
            [
                "torchrun",
                "--rdzv_backend=c10d",
                "--rdzv_endpoint=localhost:0",
                "--nnodes=1",
                "--nproc_per_node=2",
            ],
        )
203