1"""
2调用示例: python llm_api_stale.py --model-path-address THUDM/chatglm2-6b@localhost@7650 THUDM/chatglm2-6b-32k@localhost@7651
3其他fastchat.server.controller/worker/openai_api_server参数可按照fastchat文档调用
4但少数非关键参数如--worker-address,--allowed-origins,--allowed-methods,--allowed-headers不支持
5
6"""
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(__file__)))

import subprocess
import re
import logging
import argparse

LOG_PATH = "./logs/"
LOG_FORMAT = "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(format=LOG_FORMAT)

parser = argparse.ArgumentParser()
# ------multi worker-----------------
parser.add_argument('--model-path-address',
                    default="THUDM/chatglm2-6b@localhost@20002",
                    nargs="+",
                    type=str,
                    help="model path, host, and port, formatted as model-path@host@port")
# ---------------controller-------------------------

parser.add_argument("--controller-host", type=str, default="localhost")
parser.add_argument("--controller-port", type=int, default=21001)
parser.add_argument(
    "--dispatch-method",
    type=str,
    choices=["lottery", "shortest_queue"],
    default="shortest_queue",
)
controller_args = ["controller-host", "controller-port", "dispatch-method"]

# ----------------------worker------------------------------------------

parser.add_argument("--worker-host", type=str, default="localhost")
parser.add_argument("--worker-port", type=int, default=21002)
# parser.add_argument("--worker-address", type=str, default="http://localhost:21002")
# parser.add_argument(
#     "--controller-address", type=str, default="http://localhost:21001"
# )
parser.add_argument(
    "--model-path",
    type=str,
    default="lmsys/vicuna-7b-v1.3",
    help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
)
parser.add_argument(
    "--revision",
    type=str,
    default="main",
    help="Hugging Face Hub model revision identifier",
)
parser.add_argument(
    "--device",
    type=str,
    choices=["cpu", "cuda", "mps", "xpu"],
    default="cuda",
    help="The device type",
)
parser.add_argument(
    "--gpus",
    type=str,
    default="0",
    help="A single GPU like 1 or multiple GPUs like 0,2",
)
parser.add_argument("--num-gpus", type=int, default=1)
parser.add_argument(
    "--max-gpu-memory",
    type=str,
    default="20GiB",
    help="The maximum memory per GPU. Use a string like '13GiB'",
)
parser.add_argument(
    "--load-8bit", action="store_true", help="Use 8-bit quantization"
)
parser.add_argument(
    "--cpu-offloading",
    action="store_true",
    help="Only when using 8-bit quantization: offload excess weights that do not fit on the GPU to the CPU",
)
parser.add_argument(
    "--gptq-ckpt",
    type=str,
    default=None,
    help="Load a quantized model. The path to the local GPTQ checkpoint.",
)
parser.add_argument(
    "--gptq-wbits",
    type=int,
    default=16,
    choices=[2, 3, 4, 8, 16],
    help="Number of bits to use for quantization",
)
parser.add_argument(
    "--gptq-groupsize",
    type=int,
    default=-1,
    help="Group size to use for quantization; default uses the full row.",
)
parser.add_argument(
    "--gptq-act-order",
    action="store_true",
    help="Whether to apply the activation order GPTQ heuristic",
)
parser.add_argument(
    "--model-names",
    type=lambda s: s.split(","),
    help="Optional comma-separated display names for the model",
)
parser.add_argument(
    "--limit-worker-concurrency",
    type=int,
    default=5,
    help="Limit the model concurrency to prevent OOM.",
)
parser.add_argument("--stream-interval", type=int, default=2)
parser.add_argument("--no-register", action="store_true")

worker_args = [
    "worker-host", "worker-port",
    "model-path", "revision", "device", "gpus", "num-gpus",
    "max-gpu-memory", "load-8bit", "cpu-offloading",
    "gptq-ckpt", "gptq-wbits", "gptq-groupsize",
    "gptq-act-order", "model-names", "limit-worker-concurrency",
    "stream-interval", "no-register",
    "controller-address", "worker-address",
]
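# Note: controller-address and worker-address are listed above but never parsed
# from the command line; the __main__ block injects controller-address into the
# namespace and launch_worker() sets worker_address per model, so both still
# reach string_args().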
# -----------------openai server---------------------------

parser.add_argument("--server-host", type=str, default="localhost", help="host name")
parser.add_argument("--server-port", type=int, default=8888, help="port number")
parser.add_argument(
    "--allow-credentials", action="store_true", help="allow credentials"
)
# parser.add_argument(
#     "--allowed-origins", type=json.loads, default=["*"], help="allowed origins"
# )
# parser.add_argument(
#     "--allowed-methods", type=json.loads, default=["*"], help="allowed methods"
# )
# parser.add_argument(
#     "--allowed-headers", type=json.loads, default=["*"], help="allowed headers"
# )
parser.add_argument(
    "--api-keys",
    type=lambda s: s.split(","),
    help="Optional list of comma-separated API keys",
)
server_args = ["server-host", "server-port", "allow-credentials", "api-keys",
               "controller-address",
               ]

# {0}: controller, model_worker, or openai_api_server
# {1}: command-line options
# {2}: LOG_PATH
# {3}: log file name
base_launch_sh = "nohup python3 -m fastchat.serve.{0} {1} >{2}/{3}.log 2>&1 &"

# {0}: LOG_PATH
# !{1}: log file name; must match the one used in base_launch_sh
# {2}: controller, worker, or openai_api_server
base_check_sh = """while [ `grep -c "Uvicorn running on" {0}/{1}.log` -eq '0' ];do
                    sleep 5s;
                    echo "wait {2} running"
                done
                echo '{2} running' """
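
# For illustration only (this block is never executed): with the defaults above,
# the rendered controller launch command would look roughly like
#   nohup python3 -m fastchat.serve.controller  --host localhost  --port 21001 \
#       --dispatch-method shortest_queue  >./logs/controller.log 2>&1 &
# and base_check_sh then polls ./logs/controller.log until the line
# "Uvicorn running on" appears.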


def string_args(args, args_list):
    """Serialize the keys of args that appear in args_list into a CLI string."""
    args_str = ""
    for key, value in args._get_kwargs():
        # Keys from args._get_kwargs() use "_" as the separator; normalize to
        # "-" first, then check membership in the given args list.
        key = key.replace("_", "-")
        if key not in args_list:
            continue
        # fastchat expects port/host without a prefix, so strip it.
        key = key.split("-")[-1] if re.search("port|host", key) else key
        if not value:
            # Skip None, False, and empty values entirely.
            pass
        elif isinstance(value, bool) and value is True:
            args_str += f" --{key} "
        elif isinstance(value, (list, tuple, set)):
            value = " ".join(value)
            args_str += f" --{key} {value} "
        else:
            args_str += f" --{key} {value} "

    return args_str
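
# A hedged example of what string_args produces (illustrative, not executed):
#   >>> ns = argparse.Namespace(controller_host="localhost",
#   ...                         controller_port=21001,
#   ...                         dispatch_method="shortest_queue")
#   >>> string_args(ns, controller_args)
#   ' --host localhost  --port 21001  --dispatch-method shortest_queue '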


def launch_worker(item, args, worker_args=worker_args):
    log_name = item.split("/")[-1].split("\\")[-1].replace("-", "_").replace("@", "_").replace(".", "_")
    # Split model-path-address first, then hand the updated args to string_args.
    args.model_path, args.worker_host, args.worker_port = item.split("@")
    args.worker_address = f"http://{args.worker_host}:{args.worker_port}"
    print("*" * 80)
    print(f"If the worker does not come up for a long time, check the log at {LOG_PATH}{log_name}.log")
    worker_str_args = string_args(args, worker_args)
    print(worker_str_args)
    worker_sh = base_launch_sh.format("model_worker", worker_str_args, LOG_PATH, f"worker_{log_name}")
    worker_check_sh = base_check_sh.format(LOG_PATH, f"worker_{log_name}", "model_worker")
    subprocess.run(worker_sh, shell=True, check=True)
    subprocess.run(worker_check_sh, shell=True, check=True)
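
# Illustrative walk-through (not executed): for the item
# "THUDM/chatglm2-6b@localhost@7650", model_path becomes "THUDM/chatglm2-6b",
# worker_address becomes "http://localhost:7650", and log_name becomes
# "chatglm2_6b_localhost_7650", so the worker logs to
# ./logs/worker_chatglm2_6b_localhost_7650.log.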


def launch_all(args,
               controller_args=controller_args,
               worker_args=worker_args,
               server_args=server_args
               ):
    print(f"Launching the LLM service; logs are located in {LOG_PATH}...")
    controller_str_args = string_args(args, controller_args)
    controller_sh = base_launch_sh.format("controller", controller_str_args, LOG_PATH, "controller")
    controller_check_sh = base_check_sh.format(LOG_PATH, "controller", "controller")
    subprocess.run(controller_sh, shell=True, check=True)
    subprocess.run(controller_check_sh, shell=True, check=True)
    print("Worker startup time varies by device and usually takes 3-10 minutes; please be patient...")
    if isinstance(args.model_path_address, str):
        launch_worker(args.model_path_address, args=args, worker_args=worker_args)
    else:
        for idx, item in enumerate(args.model_path_address):
            print(f"Loading model {idx}: {item}")
            launch_worker(item, args=args, worker_args=worker_args)

    server_str_args = string_args(args, server_args)
    server_sh = base_launch_sh.format("openai_api_server", server_str_args, LOG_PATH, "openai_api_server")
    server_check_sh = base_check_sh.format(LOG_PATH, "openai_api_server", "openai_api_server")
    subprocess.run(server_sh, shell=True, check=True)
    subprocess.run(server_check_sh, shell=True, check=True)
    print("Launching the LLM service is done!")


if __name__ == "__main__":
    args = parser.parse_args()
    # The http:// prefix is required; without it, requests raises
    # "InvalidSchema: No connection adapters were found".
    args = argparse.Namespace(**vars(args),
                              **{"controller-address": f"http://{args.controller_host}:{str(args.controller_port)}"})

    if args.gpus:
        if len(args.gpus.split(",")) < args.num_gpus:
            raise ValueError(
                f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
            )
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    launch_all(args=args)
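
# A hedged usage sketch (assuming the defaults above): after launch_all
# finishes, the OpenAI-compatible server should answer on http://localhost:8888,
# e.g.
#   python llm_api_stale.py --model-path-address THUDM/chatglm2-6b@localhost@7650
#   curl http://localhost:8888/v1/models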