from __future__ import annotations

import os
import unittest

import paddle
import pytest
from parameterized import parameterized_class

from paddlenlp.experimental.transformers import QWenForQWenVLInferenceModel
from paddlenlp.transformers import (
    AutoConfig,
    AutoTokenizer,
    BloomForCausalLM,
    ChatGLMForCausalLM,
    ChatGLMv2ForCausalLM,
    LlamaForCausalLM,
    QWenForCausalLM,
)
from paddlenlp.utils.downloader import (
    COMMUNITY_MODEL_PREFIX,
    get_path_from_url_with_filelock,
    url_file_exists,
)
from tests.testing_utils import GPUsTesting, require_gpu

from .testing_utils import LLMTest, argv_context_guard, load_test_config
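
# These tests exercise the predictor / export entry points end-to-end on tiny internal checkpoints,
# comparing fused-inference, quantized, and multi-GPU runs against plain dygraph baselines.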


@parameterized_class(
    ["model_name_or_path", "model_class"],
    [
        ["__internal_testing__/tiny-random-llama", LlamaForCausalLM],
        ["__internal_testing__/tiny-fused-bloom", BloomForCausalLM],
        ["__internal_testing__/tiny-fused-chatglm", ChatGLMForCausalLM],
        ["__internal_testing__/tiny-fused-chatglm2", ChatGLMv2ForCausalLM],
        ["__internal_testing__/tiny-fused-qwen-inference5.2", QWenForCausalLM],
    ],
)
class PredictorTest(LLMTest, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None

    def setUp(self) -> None:
        super().setUp()
        paddle.set_default_dtype("float32")
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)
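
    # Each test runs the predict entry point twice and fuzzily compares the generated texts prefix-by-prefix.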
    def test_predictor(self):
        self.run_predictor({"inference_model": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)

        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])
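
        # Thresholds are deliberately loose: decoding on these tiny checkpoints is fuzzy, and chatglm is the noisiest.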
        self.assertGreaterEqual(full_match / len(result_0), 0.25)

        if self.model_name_or_path == "__internal_testing__/tiny-fused-chatglm":
            self.assertGreaterEqual(count / len(result_0), 0.3)
        else:
            self.assertGreaterEqual(count / len(result_0), 0.4)

    def test_flash_attention(self):
        self.run_predictor({"inference_model": False, "use_flash_attention": False})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        self.run_predictor({"inference_model": False, "use_flash_attention": True})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)

        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            if self.model_name_or_path == "__internal_testing__/tiny-random-llama":
                # placeholder diagnostic: flash attention can perturb the tiny llama outputs, so surface the pair
                print(inference_item, no_inference_item)
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        if self.model_name_or_path == "__internal_testing__/tiny-random-llama":
            self.assertGreaterEqual(count / len(result_0), 0.2)
        else:
            self.assertEqual(full_match / len(result_0), 1.0)

    def test_wint8(self):
        self.run_predictor({"inference_model": True, "quant_type": "weight_only_int8"})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)
        count, full_match = 0, 0

        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.1)

        if self.model_name_or_path == "__internal_testing__/tiny-fused-chatglm":
            self.assertGreaterEqual(count / len(result_0), 0.3)
        else:
            self.assertGreaterEqual(count / len(result_0), 0.4)


@parameterized_class(
    ["model_name_or_path", "model_class"],
    [["__internal_testing__/tiny-random-llama", LlamaForCausalLM]],
)
class PredictorPrecacheTest(LLMTest, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None

    def setUp(self) -> None:
        super().setUp()
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)
        self.download_precache_files()
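
    # Fetch the prefix-tuning config/weights (and base weights) for the tiny checkpoint from the community bucket.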
    def download_precache_files(self):
        files = [
            "prefix_config.json",
            "model_state.pdparams",
            "prefix_model_state.pdparams",
        ]
        for file in files:
            file_url = os.path.join(COMMUNITY_MODEL_PREFIX, self.model_name_or_path, file)
            if not url_file_exists(file_url):
                continue
            get_path_from_url_with_filelock(file_url, root_dir=self.output_dir)

    def test_predictor(self):
        self.run_predictor({"inference_model": True, "export_precache": True, "prefix_path": self.output_dir})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False, "export_precache": True, "prefix_path": self.output_dir})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)
        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.6)
        self.assertGreaterEqual(count / len(result_0), 0.8)
class PredictorBaseTest(LLMTest, unittest.TestCase):
    def load_test_config(self):
        config = load_test_config("./tests/fixtures/llm/predictor.yaml", "inference-predict")
        config["model_name_or_path"] = "__internal_testing__/micro-random-llama"
        return config

    def test_create_predictor_with_unexpected_length(self):
        from predictor import predict

        config = self.load_test_config()
        config.pop("src_length", None)
        config.pop("max_length", None)

        # src_length alone exceeds the limit
        with pytest.raises(ValueError, match="--src_length<2048> param should be smaller "):
            config["src_length"] = 2048

            with argv_context_guard(config):
                predict()

        # max_length alone exceeds the limit
        with pytest.raises(ValueError, match="--max_length<2048> param should be smaller "):
            config.pop("src_length", None)
            config["max_length"] = 2048

            with argv_context_guard(config):
                predict()

        # src_length + max_length together exceed the limit
        with pytest.raises(ValueError, match="The sum of src_length<1025> and"):
            config["max_length"] = 1024
            config["src_length"] = 1025

            with argv_context_guard(config):
                predict()


@parameterized_class(
    ["model_name_or_path", "model_class"],
    [
        ["__internal_testing__/tiny-fused-llama-inference5.2", LlamaForCausalLM],
        ["__internal_testing__/tiny-fused-bloom", BloomForCausalLM],
    ],
)
class BlockAttnPredictorTest(LLMTest, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None

    def setUp(self) -> None:
        super().setUp()
        paddle.set_default_dtype("float32")
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)
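
    # Each test below runs the block-attention path with a different option set and fuzzily compares it
    # against a reference run.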

    def test_blha(self):
        self.run_predictor({"inference_model": True, "block_attn": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)

        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.3)

        if self.model_name_or_path == "__internal_testing__/tiny-fused-chatglm":
            self.assertGreaterEqual(count / len(result_0), 0.3)
        else:
            self.assertGreaterEqual(count / len(result_0), 0.4)

    def test_wint8(self):
        self.run_predictor({"inference_model": True, "quant_type": "weight_only_int8", "block_attn": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": True, "quant_type": "weight_only_int8"})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)
        count, full_match = 0, 0

        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.75)

        if self.model_name_or_path == "__internal_testing__/tiny-fused-chatglm":
            self.assertGreaterEqual(count / len(result_0), 0.3)
        else:
            self.assertGreaterEqual(count / len(result_0), 0.4)

    def test_cachekv_int8(self):
        self.run_predictor({"inference_model": True, "block_attn": True, "cachekv_int8": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": True, "block_attn": True})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        print(f"result_0 {result_0}, result_1 {result_1}")

        assert len(result_0) == len(result_1)
        count, full_match = 0, 0

        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(count / len(result_0), 0.2)


@parameterized_class(
    ["model_name_or_path", "model_class"],
    [
        ["__internal_testing__/tiny-random-llama", LlamaForCausalLM],
    ],
)
class GPUsPredictorTest(LLMTest, GPUsTesting, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None

    def setUp(self) -> None:
        super().setUp()
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)

    @require_gpu(2)
    def test_predictor(self):
        self.run_predictor({"inference_model": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)

        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.25)
        self.assertGreaterEqual(count / len(result_0), 0.4)
class QWenVLTest(LLMTest, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = "__internal_testing__/tiny-fused-qwen"
    model_class = QWenForCausalLM

    def setUp(self) -> None:
        super().setUp()
        paddle.set_default_dtype("float32")
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)
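
    # test_forward feeds random token ids and fake image features through the fused QWen-VL inference model.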
    def test_forward(self):
        self.disable_static()
        config = AutoConfig.from_pretrained(self.output_dir)
        config.quant_type = None
        config.weight_only_quant_bits = None

        paddle.set_default_dtype("float16")
        model = QWenForQWenVLInferenceModel.from_pretrained(self.output_dir, config=config, dtype="float16")

        # toy shapes for the random inputs below; the concrete values are illustrative
        batch = 1
        seq = 31
        max_len = 50
        dtype = "float16"
        input_ids = paddle.randint(0, 100, [batch, seq], dtype="int64")
        image_features = paddle.randn([batch, 16, config.hidden_size], dtype="float16")
        tgt_generation_mask = paddle.full([batch, 1, 1, max_len], 1, dtype=dtype)
        img_pos = paddle.to_tensor([[0, 4, 21]], dtype="int64")
        attention_mask = paddle.full([batch, 1, max_len, max_len], 0, dtype=dtype)
        attention_mask[:, 0, :seq, :seq] = paddle.tril(paddle.ones(shape=(seq, seq), dtype=dtype))
        position_ids = paddle.full([batch, seq], 0, dtype="int64")
        for i in range(batch):
            position_ids[i, :] = paddle.to_tensor([i for i in range(seq)], dtype="int64")
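
        # Flattened argument list in the order generate_text_with_image_features expects.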
        inputs = [
            input_ids,
            image_features,
            img_pos,
            attention_mask,
            position_ids,
            paddle.full([batch, 1], 1.0, dtype="float32"),  # penalty_score
            paddle.full([batch, 1], 0.0, dtype="float32"),  # frequency_score
            paddle.full([batch, 1], 0.0, dtype="float32"),  # presence_score
            paddle.full([batch, 1], 1, dtype="int64"),  # min_length
            paddle.full([batch, 1], max_len - seq, dtype="int64"),  # max_length
            paddle.full([batch, 1], 1.0, dtype="float32"),  # temperature
            paddle.full([batch, 1], 0.0, dtype="float32"),  # top_p
            paddle.full([1], 151643, dtype="int64"),  # eos_token_id
            paddle.full([batch, 1], seq, dtype="int32"),  # seq_len_encoder
            paddle.full([batch, 1], seq, dtype="int32"),  # seq_len_decoder
            paddle.full([batch, 1], 0, dtype="int64"),  # step_idx
            paddle.full([batch, 1], False, dtype="bool"),  # stop_flags
            paddle.full([batch, 1], -123, dtype="int64"),  # tgt_ids
            paddle.full([batch, 1], seq - 1, dtype="int64"),  # tgt_pos
            tgt_generation_mask,
            paddle.full([batch, max_len], -100, dtype="int64"),  # pre_ids
            paddle.full([1], batch, dtype="int64"),  # stop_nums
        ]

        # per-layer KV cache placeholders, consumed as cache_kvs=inputs[22:]
        for i in range(config.num_hidden_layers):
            tmp = paddle.rand(shape=[2, batch, 1, max_len, 64], dtype=dtype)
            inputs.append(tmp)

        model.generate_text_with_image_features(
            input_ids=inputs[0],
            image_features=inputs[1],
            img_pos=inputs[2],
            attention_mask=inputs[3],
            position_ids=inputs[4],
            penalty_score=inputs[5],
            frequency_score=inputs[6],
            presence_score=inputs[7],
            min_length=inputs[8],
            max_length=inputs[9],
            temperature=inputs[10],
            top_p=inputs[11],
            eos_token_id=inputs[12],
            seq_len_encoder=inputs[13],
            seq_len_decoder=inputs[14],
            step_idx=inputs[15],
            stop_flags=inputs[16],
            tgt_ids=inputs[17],
            tgt_pos=inputs[18],
            tgt_generation_mask=inputs[19],
            pre_ids=inputs[20],
            stop_nums=inputs[21],
            cache_kvs=inputs[22:],
        )

    def test_export(self):
        self.disable_static()
        config = load_test_config(self.config_path, "inference-to-static")
        config["model_name_or_path"] = self.model_name_or_path
        config["output_path"] = self.output_dir
        config["dtype"] = "float16"
        config["inference_model"] = True
        config["model_prefix"] = "qwen"
        config["model_type"] = "qwen-img2txt"

        with argv_context_guard(config):
            from export_model import main

            main()