# ray-llm
1import json2import os3
4from ray import serve5
6from rayllm.backend.logger import get_logger7from rayllm.backend.server.routers.router_app import Router, router_app8
# Logger bound to this module's import path.
logger = get_logger(__name__)


# Autoscaling knobs for the router replicas; every value can be overridden
# through an AVIARY_ROUTER_* environment variable.
# TODO make this configurable in aviary run
_autoscaling_config = {
    "min_replicas": int(os.environ.get("AVIARY_ROUTER_MIN_REPLICAS", 2)),
    "initial_replicas": int(os.environ.get("AVIARY_ROUTER_INITIAL_REPLICAS", 2)),
    "max_replicas": int(os.environ.get("AVIARY_ROUTER_MAX_REPLICAS", 16)),
    "target_num_ongoing_requests_per_replica": int(
        os.environ.get("AVIARY_ROUTER_TARGET_NUM_ONGOING_REQUESTS_PER_REPLICA", 200)
    ),
}

# Extra Ray actor options supplied as a JSON object in the environment
# (empty dict when the variable is unset).
_ray_actor_options = json.loads(
    os.environ.get("AVIARY_ROUTER_RAY_ACTOR_OPTIONS", "{}")
)

# Serve deployment that mounts the FastAPI router app onto the Router class
# at the root route prefix.
RouterDeployment = serve.deployment(
    route_prefix="/",
    autoscaling_config=_autoscaling_config,
    ray_actor_options=_ray_actor_options,
    max_concurrent_queries=1000,  # Maximum backlog for a single replica
)(serve.ingress(router_app)(Router))