# ray-llm
# Model configuration with three top-level sections: deployment_config,
# engine_config and scaling_config.
# NOTE(review): the nesting below was restored from a flattened copy of this
# file, following the ray-llm model config layout -- confirm against the
# consumer's schema before relying on it.

deployment_config:
  # Replica bounds and autoscaling timing. The `_s` suffixes are presumably
  # seconds -- TODO confirm.
  autoscaling_config:
    # initial_replicas equals min_replicas, so the deployment starts at its
    # lower bound.
    min_replicas: 4
    initial_replicas: 4
    max_replicas: 10
    target_num_ongoing_requests_per_replica: 20
    metrics_interval_s: 10.0
    look_back_period_s: 30.0
    smoothing_factor: 0.5
    # The downscale delay (300.0) is much longer than the upscale delay (15.0).
    downscale_delay_s: 300.0
    upscale_delay_s: 15.0
  max_concurrent_queries: 48
  ray_actor_options:
    resources:
      # A quantity of 0 is requested for this custom resource.
      mock_resource: 0

engine_config:
  # NOTE(review): the "Fake" model id together with zero GPUs below suggests
  # a test/mock fixture -- confirm.
  model_id: VLLMFakeModel
  type: VLLMEngine
  max_total_tokens: 4096
  generation:
    # Templates use {instruction} and {system} placeholders. Double quotes
    # are required where \n escapes must become real newlines.
    prompt_format:
      system: "<<SYS>>\n{instruction}\n<</SYS>>\n\n"
      assistant: " {instruction} </s><s> "
      trailing_assistant: " "
      user: "[INST] {system}{instruction} [/INST]"
      # The user template above contains the {system} placeholder.
      system_in_user: true
      default_system_message: ""
    stopping_sequences: ["<unk>"]

scaling_config:
  # One worker with 1 CPU and 0 GPUs.
  num_workers: 1
  num_gpus_per_worker: 0
  num_cpus_per_worker: 1
  placement_strategy: "STRICT_PACK"
  resources_per_worker:
    # Same custom resource name and quantity as under ray_actor_options.
    mock_resource: 0