# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
from __future__ import annotations
21
from parameterized import parameterized_class
23
from paddlenlp.utils.downloader import get_path_from_url
24
from tests.testing_utils import argv_context_guard, load_test_config
26
from .testing_utils import LLMTest
37
class PretrainTest(LLMTest, unittest.TestCase):
    """Smoke test that runs the pretraining script and then the predictor.

    The test downloads a small pre-tokenized dataset into a temporary
    directory, points the pretrain config at it, runs ``run_pretrain.main``
    and finally calls ``run_predictor`` on the result.

    ``self.model_dir``, ``self.root_path``, ``self.output_dir`` and
    ``self.run_predictor`` are read here but not defined in this class;
    presumably they are supplied by ``LLMTest`` and/or a class-level
    parameterization -- verify against those definitions.
    """

    # Path of the YAML fixture handed to ``load_test_config``.
    config_path: str = "./tests/fixtures/llm/pretrain.yaml"

    def setUp(self) -> None:
        """Run base-class setup, then create the temporary dataset directory."""
        # ``tearDown`` delegates to ``LLMTest.tearDown``, so mirror that here.
        # ``super()`` follows the MRO (LLMTest, then unittest.TestCase), which
        # keeps this safe even if ``LLMTest`` does not define ``setUp``.
        super().setUp()

        self.dataset_dir = tempfile.mkdtemp()
        self.model_codes_dir = self.root_path

    def tearDown(self) -> None:
        """Run base-class teardown and always remove the dataset directory."""
        try:
            LLMTest.tearDown(self)
        finally:
            # Remove the temp directory even when the base teardown raises,
            # otherwise the downloaded dataset files are leaked on disk.
            shutil.rmtree(self.dataset_dir)

    def test_pretrain(self):
        """Pretrain on the downloaded dataset, then exercise the predictor."""
        # Drop any cached ``run_pretrain`` module to force a fresh import
        # below. ``pop`` with a default replaces a loop that deleted from
        # ``sys.modules`` while iterating over it (RuntimeError) and that
        # could hit a KeyError when only a similarly named module was loaded.
        sys.modules.pop("run_pretrain", None)

        # Fetch the token-id array and its index file into the temp directory.
        dataset_urls = (
            "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy",
            "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz",
        )
        for url in dataset_urls:
            get_path_from_url(url, root_dir=self.dataset_dir)

        pretrain_config = load_test_config(self.config_path, "pretrain", self.model_dir)
        pretrain_config["input_dir"] = self.dataset_dir
        pretrain_config["output_dir"] = self.output_dir

        with argv_context_guard(pretrain_config):
            from run_pretrain import main

            # NOTE(review): the entry point has to be invoked inside the guard;
            # this assumes importing ``run_pretrain`` alone does not start
            # training -- confirm against the script.
            main()

        # The inference-model predictor path currently only works for llama,
        # not for gpt or qwen.
        if self.model_dir == "llama":
            self.run_predictor({"inference_model": True})

        self.run_predictor({"inference_model": False})