# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import os
import unittest

import paddle
import pytest
from parameterized import parameterized_class

from paddlenlp.experimental.transformers import QWenForQWenVLInferenceModel
from paddlenlp.transformers import (
    AutoConfig,
    AutoTokenizer,
    BloomForCausalLM,
    ChatGLMForCausalLM,
    ChatGLMv2ForCausalLM,
    LlamaForCausalLM,
    QWenForCausalLM,
)
from paddlenlp.utils.downloader import (
    COMMUNITY_MODEL_PREFIX,
    get_path_from_url_with_filelock,
    url_file_exists,
)
from tests.testing_utils import GPUsTesting, require_gpu

from .testing_utils import LLMTest, argv_context_guard, load_test_config


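# parameterized_class instantiates one copy of each decorated test class per
# (model_name_or_path, model_class) pair, so every test below runs once per listed model.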
@parameterized_class(
    ["model_name_or_path", "model_class"],
    [
        ["__internal_testing__/tiny-random-llama", LlamaForCausalLM],
        ["__internal_testing__/tiny-fused-bloom", BloomForCausalLM],
        ["__internal_testing__/tiny-fused-chatglm", ChatGLMForCausalLM],
        ["__internal_testing__/tiny-fused-chatglm2", ChatGLMv2ForCausalLM],
        ["__internal_testing__/tiny-fused-qwen-inference5.2", QWenForCausalLM],
    ],
)
class PredictorTest(LLMTest, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None
    model_class = None

    def setUp(self) -> None:
        super().setUp()
        paddle.set_default_dtype("float32")
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)

    def test_predictor(self):
        self.run_predictor({"inference_model": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        # compare the generation result of inference & dygraph model
        assert len(result_0) == len(result_1)

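        # Prefix-match heuristic: count tallies samples whose first half of the common
        # prefix matches across the two runs, full_match tallies samples whose entire
        # common-length prefix matches; presumably this tolerates small numerical drift
        # between the fused inference kernels and the dygraph model.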
        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.25)

        if self.model_name_or_path == "__internal_testing__/tiny-fused-chatglm":
            self.assertGreaterEqual(count / len(result_0), 0.3)
        else:
            self.assertGreaterEqual(count / len(result_0), 0.4)

    def test_flash_attention(self):
        self.run_predictor({"inference_model": False, "use_flash_attention": False})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        self.run_predictor({"inference_model": False, "use_flash_attention": True})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        # compare the generation result of dygraph & flash attention model
        assert len(result_0) == len(result_1)

        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            if self.model_name_or_path == "__internal_testing__/tiny-random-llama":
                min_length = 5
            else:
                min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        if self.model_name_or_path == "__internal_testing__/tiny-random-llama":
            self.assertGreaterEqual(count / len(result_0), 0.2)
        else:
            self.assertEqual(full_match / len(result_0), 1.0)

    def test_wint8(self):
        self.run_predictor({"inference_model": True, "quant_type": "weight_only_int8"})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)
        count, full_match = 0, 0

        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.1)

        if self.model_name_or_path == "__internal_testing__/tiny-fused-chatglm":
            self.assertGreaterEqual(count / len(result_0), 0.3)
        else:
            self.assertGreaterEqual(count / len(result_0), 0.4)


@parameterized_class(
    ["model_name_or_path", "model_class"],
    [["__internal_testing__/tiny-random-llama", LlamaForCausalLM]],
)
class PredictorPrecacheTest(LLMTest, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None
    model_class = None

    def setUp(self) -> None:
        super().setUp()

        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)
        self.download_precache_files()

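    # Fetch the optional prefix-tuning / pre-cache artifacts from the community model
    # storage; any file that is not available remotely is simply skipped.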
    def download_precache_files(self):
        files = [
            "prefix_config.json",
            "config.json",
            "model_state.pdparams",
            "pre_caches.npy",
            "prefix_model_state.pdparams",
        ]
        for file in files:
            file_url = os.path.join(COMMUNITY_MODEL_PREFIX, self.model_name_or_path, file)
            if not url_file_exists(file_url):
                continue
            get_path_from_url_with_filelock(file_url, root_dir=self.output_dir)

    def test_predictor(self):
        self.run_predictor({"inference_model": True, "export_precache": True, "prefix_path": self.output_dir})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False, "export_precache": True, "prefix_path": self.output_dir})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        # compare the generation result of inference & dygraph model
        assert len(result_0) == len(result_1)
        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.6)
        self.assertGreaterEqual(count / len(result_0), 0.8)


class PredictorBaseTest(LLMTest, unittest.TestCase):
    def load_test_config(self):
        config = load_test_config("./tests/fixtures/llm/predictor.yaml", "inference-predict")
        config["model_name_or_path"] = "__internal_testing__/micro-random-llama"

        return config

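    # The predictor is expected to raise ValueError for over-large src_length / max_length
    # values, and for a src_length + max_length combination whose sum is too large.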
    def test_create_predictor_with_unexpected_length(self):
        from predictor import predict

        config = self.load_test_config()
        config.pop("src_length", None)
        config.pop("max_length", None)

        with pytest.raises(ValueError, match="--src_length<2048> param should be smaller "):
            config["src_length"] = 2048

            with argv_context_guard(config):
                predict()

        with pytest.raises(ValueError, match="--max_length<2048> param should be smaller "):
            config.pop("src_length", None)
            config["max_length"] = 2048

            with argv_context_guard(config):
                predict()

        with pytest.raises(ValueError, match="The sum of src_length<1025> and"):
            config["max_length"] = 1024
            config["src_length"] = 1025

            with argv_context_guard(config):
                predict()


@parameterized_class(
    ["model_name_or_path", "model_class"],
    [
        ["__internal_testing__/tiny-fused-llama-inference5.2", LlamaForCausalLM],
        ["__internal_testing__/tiny-fused-bloom", BloomForCausalLM],
    ],
)
class BlockAttnPredictorTest(LLMTest, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None
    model_class = None

    def setUp(self) -> None:
        super().setUp()
        paddle.set_default_dtype("float32")
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)

    def test_blha(self):
        self.run_predictor({"inference_model": True, "block_attn": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        # compare the generation result of inference & dygraph model
        assert len(result_0) == len(result_1)

        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.3)

        if self.model_name_or_path == "__internal_testing__/tiny-fused-chatglm":
            self.assertGreaterEqual(count / len(result_0), 0.3)
        else:
            self.assertGreaterEqual(count / len(result_0), 0.4)

    def test_wint8(self):
        self.run_predictor({"inference_model": True, "quant_type": "weight_only_int8", "block_attn": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": True, "quant_type": "weight_only_int8"})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        assert len(result_0) == len(result_1)
        count, full_match = 0, 0

        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.75)

        if self.model_name_or_path == "__internal_testing__/tiny-fused-chatglm":
            self.assertGreaterEqual(count / len(result_0), 0.3)
        else:
            self.assertGreaterEqual(count / len(result_0), 0.4)

    def test_cachekv_int8(self):
        self.run_predictor({"inference_model": True, "block_attn": True, "cachekv_int8": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": True, "block_attn": True})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        print(f"result_0 {result_0}, result_1 {result_1}")

        assert len(result_0) == len(result_1)
        count, full_match = 0, 0

        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(count / len(result_0), 0.2)


@parameterized_class(
    ["model_name_or_path", "model_class"],
    [
        ["__internal_testing__/tiny-random-llama", LlamaForCausalLM],
    ],
)
class GPUsPredictorTest(LLMTest, GPUsTesting, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None
    model_class = None

    def setUp(self) -> None:
        super().setUp()
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)

    @require_gpu(2)
    def test_predictor(self):
        self.init_dist_env()

        self.run_predictor({"inference_model": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        # compare the generation result of inference & dygraph model
        assert len(result_0) == len(result_1)

        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.25)
        self.assertGreaterEqual(count / len(result_0), 0.4)


class QWenVLTest(LLMTest, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = "__internal_testing__/tiny-fused-qwen"
    model_class = QWenForCausalLM

    def setUp(self) -> None:
        super().setUp()
        paddle.set_default_dtype("float32")
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)

    def test_forward(self):
        self.disable_static()
        config = AutoConfig.from_pretrained(self.output_dir)
        config.quant_type = None
        config.weight_only_quant_bits = None

        paddle.set_default_dtype("float16")
        model = QWenForQWenVLInferenceModel.from_pretrained(self.output_dir, config=config, dtype="float16")

        batch = 1
        seq = 31
        max_len = 50
        dtype = "float16"
        input_ids = paddle.randint(0, 100, [batch, seq], dtype="int64")
        image_features = paddle.randn([batch, 16, config.hidden_size], dtype="float16")
        tgt_generation_mask = paddle.full([batch, 1, 1, max_len], 1, dtype=dtype)
        img_pos = paddle.to_tensor([[0, 4, 21]], dtype="int64")
        attention_mask = paddle.full([batch, 1, max_len, max_len], 0, dtype=dtype)
        attention_mask[:, 0, :seq, :seq] = paddle.tril(paddle.ones(shape=(seq, seq), dtype=dtype))
        position_ids = paddle.full([batch, seq], 0, dtype="int64")
        for i in range(batch):
            position_ids[i, :] = paddle.to_tensor([i for i in range(seq)], dtype="int64")

        inputs = [
            input_ids,  # input_ids
            image_features,  # image_features
            img_pos,  # img_pos
            attention_mask,  # attention_mask
            position_ids,  # position_ids
            paddle.full([batch, 1], 1.0, dtype="float32"),  # penalty_score
            paddle.full([batch, 1], 0.0, dtype="float32"),  # frequency_score
            paddle.full([batch, 1], 0.0, dtype="float32"),  # presence_score
            paddle.full([batch, 1], 1, dtype="int64"),  # min_length
            paddle.full([batch, 1], max_len - seq, dtype="int64"),  # max_length
            paddle.full([batch, 1], 1.0, dtype="float32"),  # temperature
            paddle.full([batch, 1], 0.0, dtype="float32"),  # top_p
            paddle.full([1], 151643, dtype="int64"),  # eos_token_id
            paddle.full([batch, 1], seq, dtype="int32"),  # seq_len_encoder
            paddle.full([batch, 1], seq, dtype="int32"),  # seq_len_decoder
            paddle.full([batch, 1], 0, dtype="int64"),  # step_idx
            paddle.full([batch, 1], False, dtype="bool"),  # stop_flags
            paddle.full([batch, 1], -123, dtype="int64"),  # tgt_ids, can be initialized arbitrarily
            paddle.full([batch, 1], seq - 1, dtype="int64"),  # tgt_pos
            tgt_generation_mask,  # tgt_generation_mask
            paddle.full([batch, max_len], -100, dtype="int64"),  # pre_ids, can be initialized arbitrarily
            paddle.full([1], batch, dtype="int64"),  # stop_nums, should equal batch
        ]
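        # One randomly initialized KV-cache tensor per transformer layer; the layout is
        # presumably [2 (key/value), batch, num_kv_heads, max_len, head_dim] for this tiny config.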
        for i in range(config.num_hidden_layers):
            tmp = paddle.rand(shape=[2, batch, 1, max_len, 64], dtype=dtype)
            inputs.append(tmp)

        model.eval()
        model.generate_text_with_image_features(
            input_ids=inputs[0],
            image_features=inputs[1],
            img_pos=inputs[2],
            attention_mask=inputs[3],
            position_ids=inputs[4],
            penalty_score=inputs[5],
            frequency_score=inputs[6],
            presence_score=inputs[7],
            min_length=inputs[8],
            max_length=inputs[9],
            temperature=inputs[10],
            top_p=inputs[11],
            eos_token_id=inputs[12],
            seq_len_encoder=inputs[13],
            seq_len_decoder=inputs[14],
            step_idx=inputs[15],
            stop_flags=inputs[16],
            tgt_ids=inputs[17],
            tgt_pos=inputs[18],
            tgt_generation_mask=inputs[19],
            pre_ids=inputs[20],
            stop_nums=inputs[21],
            cache_kvs=inputs[22:],
        )

    def test_export(self):
        self.disable_static()
        config = load_test_config(self.config_path, "inference-to-static")
        config["model_name_or_path"] = self.model_name_or_path
        config["output_path"] = self.output_dir
        config["dtype"] = "float16"
        config["inference_model"] = True
        config["model_prefix"] = "qwen"
        config["model_type"] = "qwen-img2txt"

        with argv_context_guard(config):
            from export_model import main

            main()