# PaddleNLP OPT text-generation benchmark (FastGeneration vs. dygraph vs. HuggingFace).
1# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16import argparse
17
# Prepend the project root to sys.path so the script runs against the latest local code.
19import sys
20import time
21from pprint import pprint
22
23import numpy as np
24import paddle
25import torch
26from transformers.models.opt.modeling_opt import OPTForCausalLM as hf_opt_model
27
28from paddlenlp.transformers import GPTTokenizer, OPTForCausalLM
29
30sys.path.insert(0, "../../")
31
32
def parse_args():
    """Parse command-line arguments for the OPT generation benchmark.

    Returns:
        argparse.Namespace: parsed options — model name, decoding strategy,
        top_k/top_p, batch size, max output length, and the fp16 flag.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        default="facebook/opt-125m",
        type=str,
        choices=["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b"],
        # NOTE: help text previously said "bart"; this script benchmarks OPT.
        help="The model name to specify the OPT model to use. Can be one of ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b']. ",
    )
    parser.add_argument(
        "--decode_strategy",
        default="greedy_search",
        type=str,
        choices=["greedy_search", "sampling"],
        help="The decoding strategy. Can be one of ['greedy_search', 'sampling']",
    )
    parser.add_argument("--top_k", default=4, type=int, help="The number of candidates to produce in top-k sampling. ")
    parser.add_argument("--batch_size", default=4, type=int, help="The size of input batch. ")
    parser.add_argument(
        "--top_p", default=1.0, type=float, help="The probability threshold to produce top-p sampling. "
    )
    parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ")
    parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ")
    args = parser.parse_args()
    return args
58
59
def _avg_time_ms(run, synchronize, num_loop=100, num_warmup=50):
    """Call ``run`` ``num_loop`` times and return the average latency in ms.

    The first ``num_warmup`` iterations are discarded as warmup.
    ``synchronize`` is invoked right before the timer starts and right after
    the loop ends so that asynchronous device work is fully accounted for.
    """
    for i in range(num_loop):
        if i == num_warmup:
            synchronize()
            start = time.perf_counter()
        run()
    synchronize()
    return (time.perf_counter() - start) / (num_loop - num_warmup) * 1000


def do_predict(args):
    """Benchmark OPT generation three ways and print average latencies:

    1. PaddleNLP FastGeneration (``use_fast=True``, optionally fp16),
    2. plain PaddlePaddle dygraph generation,
    3. HuggingFace ``transformers`` generation on the same GPU.

    With ``--use_fp16_decoding`` only the fast fp16 number is reported.
    Requires a CUDA device.
    """
    place = paddle.set_device("gpu")

    tokenizer = GPTTokenizer.from_pretrained(args.model_name_or_path)
    model = OPTForCausalLM.from_pretrained(args.model_name_or_path)
    # Set evaluate mode
    model.eval()
    # OPT's GPT tokenizer uses the same special token for BOS and EOS.
    bos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
    eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")

    # One BOS token per batch row: shape [batch_size, 1], int64.
    input_ids_np = np.array([[bos_id] for i in range(args.batch_size)]).astype("int64").reshape([args.batch_size, 1])
    input_ids = paddle.to_tensor(input_ids_np)

    def paddle_sync():
        # PaddlePaddle >= 2.2
        paddle.device.cuda.synchronize(place)

    with paddle.no_grad():
        fast_cost = _avg_time_ms(
            lambda: model.generate(
                input_ids=input_ids,
                max_length=args.max_length,
                decode_strategy=args.decode_strategy,
                top_k=args.top_k,
                top_p=args.top_p,
                bos_token_id=bos_id,
                eos_token_id=eos_id,
                use_fast=True,
                use_fp16_decoding=args.use_fp16_decoding,
            ),
            paddle_sync,
        )

    if args.use_fp16_decoding:
        pprint(args)
        print("Fast FP16 cost:", fast_cost)
        return

    with paddle.no_grad():
        pd_cost = _avg_time_ms(
            lambda: model.generate(
                input_ids=input_ids,
                max_length=args.max_length,
                decode_strategy=args.decode_strategy,
                top_k=args.top_k,
                top_p=args.top_p,
                bos_token_id=bos_id,
                eos_token_id=eos_id,
            ),
            paddle_sync,
        )

    device = torch.device("cuda:0")
    hf_model = hf_opt_model.from_pretrained(args.model_name_or_path)
    hf_model.to(device)
    hf_model.eval()

    hf_input_ids = torch.tensor(input_ids_np)
    hf_input_ids = hf_input_ids.to(device)

    do_sample = args.decode_strategy == "sampling"
    with torch.no_grad():
        hf_cost = _avg_time_ms(
            lambda: hf_model.generate(
                hf_input_ids,
                do_sample=do_sample,
                # HF counts the prompt token inside max_length; +1 keeps the
                # number of generated tokens comparable with Paddle's.
                max_length=args.max_length + 1,
                bos_token_id=bos_id,
                eos_token_id=eos_id,
                pad_token_id=0,
                top_k=args.top_k,
                top_p=args.top_p,
            ),
            torch.cuda.synchronize,
        )

    pprint(args)
    print("Fast FP32 cost:", fast_cost)
    print("PD cost:", pd_cost)
    print("HF cost:", hf_cost)
    print("Speed up Fast FP32/PD:", pd_cost / fast_cost)
    print("Speed up Fast FP32/HF:", hf_cost / fast_cost)
157
158
if __name__ == "__main__":
    cli_args = parse_args()
    # Echo the chosen checkpoint before any model loading starts.
    print(cli_args.model_name_or_path)
    do_predict(cli_args)
163