transformers
122 строки · 4.4 Кб
1import json2import os3import subprocess4import unittest5from ast import literal_eval6
7import pytest8from parameterized import parameterized, parameterized_class9
10from . import is_sagemaker_available11
12
13if is_sagemaker_available():14from sagemaker import Session, TrainingJobAnalytics15from sagemaker.huggingface import HuggingFace16
17
@pytest.mark.skipif(
    literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
    reason="Skipping test because should only be run when releasing minor transformers version",
)
@pytest.mark.usefixtures("sm_env")
@parameterized_class(
    [
        {
            "framework": "pytorch",
            "script": "run_glue_model_parallelism.py",
            "model_name_or_path": "FacebookAI/roberta-large",
            "instance_type": "ml.p3dn.24xlarge",
            "results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2},
        },
        {
            "framework": "pytorch",
            "script": "run_glue.py",
            "model_name_or_path": "FacebookAI/roberta-large",
            "instance_type": "ml.p3dn.24xlarge",
            "results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2},
        },
    ]
)
class MultiNodeTest(unittest.TestCase):
    """SageMaker training test using smdistributed model parallelism.

    The class is instantiated once per parameter dict above; each dict supplies
    ``framework``, ``script``, ``model_name_or_path``, ``instance_type`` and the
    KPI thresholds in ``results`` as class attributes.

    The whole class is skipped unless the ``TEST_SAGEMAKER`` environment
    variable evaluates (via ``literal_eval``) to ``True``.

    NOTE(review): ``self.env`` is presumably injected by the ``sm_env`` fixture;
    it is not defined in this file - confirm against the fixture definition.
    """

    def setUp(self):
        """Check the test environment and stage ``run_glue.py`` for PyTorch runs.

        Raises:
            AssertionError: if ``self.env`` has not been attached to the test case.
            subprocess.CalledProcessError: if copying the example script fails.
        """
        # Verify the environment *before* the first access to ``self.env`` so a
        # missing fixture is reported as a clear assertion failure rather than
        # an AttributeError raised from inside the copy command below.
        assert hasattr(self, "env"), "test case has no `env` attribute"

        if self.framework == "pytorch":
            # Pass the command as an explicit argument list: splitting a
            # formatted string on whitespace would break as soon as
            # ``test_path`` contains a space.
            subprocess.run(
                [
                    "cp",
                    "./examples/pytorch/text-classification/run_glue.py",
                    f"{self.env.test_path}/run_glue.py",
                ],
                encoding="utf-8",
                check=True,
            )

    def create_estimator(self, instance_count):
        """Build the ``HuggingFace`` estimator for a model-parallel training job.

        Args:
            instance_count: number of instances, forwarded to the estimator and
                embedded in the job's base name.

        Returns:
            A ``HuggingFace`` estimator configured with the smdistributed
            model-parallel and MPI distribution options defined below.
        """
        # configuration for running training on smdistributed Model Parallel
        mpi_options = {
            "enabled": True,
            "processes_per_host": 8,
        }
        smp_options = {
            "enabled": True,
            "parameters": {
                "microbatches": 4,
                "placement_strategy": "spread",
                "pipeline": "interleaved",
                "optimize": "speed",
                "partitions": 4,
                "ddp": True,
            },
        }

        distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options}

        # The job name suffix distinguishes the plain ``run_glue.py`` script
        # from any other (model-parallel) entry point.
        name_extension = "trainer" if self.script == "run_glue.py" else "smtrainer"
        # creates estimator
        return HuggingFace(
            entry_point=self.script,
            source_dir=self.env.test_path,
            role=self.env.role,
            image_uri=self.env.image_uri,
            base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}",
            instance_count=instance_count,
            instance_type=self.instance_type,
            debugger_hook_config=False,
            hyperparameters={
                **self.env.hyperparameters,
                "model_name_or_path": self.model_name_or_path,
                "max_steps": 500,
            },
            metric_definitions=self.env.metric_definitions,
            distribution=distribution,
            py_version="py36",
        )

    def save_results_as_csv(self, job_name):
        """Export the metrics of ``job_name`` to ``<test_path>/<job_name>_metrics.csv``."""
        TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")

    # @parameterized.expand([(2,), (4,),])
    @parameterized.expand([(1,)])
    def test_scripz(self, instance_count):
        """Run a training job and check its KPIs against ``self.results``.

        Args:
            instance_count: number of instances to train on.

        Raises:
            AssertionError: if the training time exceeds the configured limit,
                any reported ``eval_accuracy`` is below its threshold, or any
                reported ``eval_loss`` is above its threshold.
        """
        # create estimator
        estimator = self.create_estimator(instance_count)

        # run training
        estimator.fit()

        # result dataframe
        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

        # extract kpis
        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
        # get train time from SageMaker job, this includes starting, preprocessing, stopping.
        # The 999999 fallback exceeds every configured limit, so a job description
        # without "TrainingTimeInSeconds" fails the runtime assertion below.
        train_runtime = (
            Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
        )

        # assert kpis
        assert train_runtime <= self.results["train_runtime"]
        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
        assert all(t <= self.results["eval_loss"] for t in eval_loss)

        # dump tests result into json file to share in PR
        # (explicit encoding keeps the file independent of the host locale)
        with open(f"{estimator.latest_training_job.name}.json", "w", encoding="utf-8") as outfile:
            json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)