wandb
156 строк · 4.6 Кб
1from typing import Optional
2
3import pytest
4import yaml
5from utils import (
6cleanup_deployment,
7get_sweep_id_from_proc,
8init_agent_in_launch_cluster,
9run_cmd,
10run_cmd_async,
11wait_for_k8s_job_completion,
12wait_for_queued_image_job_completion,
13wait_for_run_completion,
14)
15from wandb.apis.public import Api, Sweep
16from wandb.sdk.launch._launch_add import launch_add
17
# Kubernetes namespace used for the launched jobs and the in-cluster agent.
NAMESPACE = "wandb-release-testing"
# W&B entity and project under which the job, runs and sweeps are addressed.
ENTITY = "launch-release-testing"
PROJECT = "release-testing"
# Launch queue given to the agent (-q) and used when submitting jobs/sweeps.
QUEUE = "kubernetes-queue"
# Simple job that counts to 50.
JOB_NAME = "sample_job:v0"

# Launch config passed to launch_add: pins the Kubernetes namespace.
LAUNCH_JOB_CONFIG = {"resource_args": {"kubernetes": {"namespace": NAMESPACE}}}
27
28
@pytest.mark.timeout(180)
def test_kubernetes_agent_on_local_process():
    """Run the sample job through a launch agent started from this process.

    Starts ``wandb launch-agent`` via ``run_cmd_async``, enqueues
    ``JOB_NAME`` on ``QUEUE`` with ``launch_add``, waits for the Kubernetes
    job and the queued run to complete, then checks the job status, the run
    summary and the logged history (50 rows, last ``steps`` value 49).
    """
    # Start the agent *before* the try block: if startup itself fails, the
    # cleanup below must not reference an unbound ``agent_process`` and mask
    # the real error with an UnboundLocalError.
    agent_process = run_cmd_async(
        f"wandb launch-agent -q {QUEUE} -e {ENTITY} -c tests/release_tests/test_launch/local-agent-config.yml"
    )
    try:
        # Start run
        queued_run = launch_add(
            job=f"{ENTITY}/{PROJECT}/{JOB_NAME}",
            queue_name=QUEUE,
            entity=ENTITY,
            config=LAUNCH_JOB_CONFIG,
        )

        # NOTE(review): the trailing ``1`` is presumably the expected number
        # of jobs to wait for — confirm against utils.
        status = wait_for_k8s_job_completion(NAMESPACE, ENTITY, PROJECT, 1)
        completed_run = wait_for_queued_image_job_completion(
            ENTITY, PROJECT, queued_run
        )

        summary = completed_run.summary
        history = completed_run.history(pandas=False)

        assert (
            status == "Succeeded"
        ), "Kubernetes job didn't succeed. Check Kubernetes pods and Docker container output."
        assert summary["time_elapsed"]
        assert summary["avg"]
        assert len(history) == 50
        assert history[-1]["steps"] == 49
    finally:
        # Always stop the agent and remove the local artifacts directory.
        agent_process.kill()
        run_cmd("rm -r artifacts")
63
64
@pytest.mark.timeout(180)
def test_kubernetes_agent_in_cluster(api_key: str, agent_image: Optional[str]):
    """Run the sample job through an agent initialised inside the cluster.

    The agent is set up with ``init_agent_in_launch_cluster`` using the given
    API key and agent image; the deployment is removed with
    ``cleanup_deployment`` whether or not the checks pass.

    Args:
        api_key: API key forwarded to ``init_agent_in_launch_cluster``.
        agent_image: Agent image forwarded to the same helper; may be None.
    """
    init_agent_in_launch_cluster(NAMESPACE, api_key, agent_image)
    try:
        # Enqueue the sample job on the Kubernetes queue.
        job_spec = f"{ENTITY}/{PROJECT}/{JOB_NAME}"
        queued = launch_add(
            job=job_spec,
            queue_name=QUEUE,
            entity=ENTITY,
            config=LAUNCH_JOB_CONFIG,
        )

        # Wait for the Kubernetes job first, then for the queued run itself.
        job_status = wait_for_k8s_job_completion(NAMESPACE, ENTITY, PROJECT, 1)
        finished_run = wait_for_queued_image_job_completion(ENTITY, PROJECT, queued)

        run_summary = finished_run.summary
        rows = finished_run.history(pandas=False)

        assert job_status == "Succeeded", (
            "Kubernetes job didn't succeed. Check Kubernetes pods and Docker container output."
        )
        assert run_summary["time_elapsed"]
        assert run_summary["avg"]
        assert len(rows) == 50
        last_row = rows[-1]
        assert last_row["steps"] == 49
    finally:
        # Tear down the in-cluster agent deployment.
        cleanup_deployment(NAMESPACE)
96
97
@pytest.mark.timeout(360)
def test_kubernetes_agent_on_local_process_sweep():
    """Run a launch sweep through a launch agent started from this process.

    Writes a bayes sweep config (``run_cap`` of 4, ``param1`` between 0 and
    8) to a YAML file, starts ``wandb launch-agent`` and ``wandb
    launch-sweep`` via ``run_cmd_async``, waits for the sweep scheduler run,
    then checks that the sweep produced ``run_cap`` finished runs, each with
    the expected summary keys and a 50-row history ending at ``steps`` 49.
    """
    run_cap = 4
    sweep_config_path = "tests/release_tests/test_launch/c.yaml"
    sweep_config = {
        "job": f"{ENTITY}/{PROJECT}/{JOB_NAME}",
        "project": PROJECT,
        "entity": ENTITY,
        "run_cap": run_cap,
        "method": "bayes",
        "metric": {
            "name": "avg",
            "goal": "maximize",
        },
        "parameters": {
            "param1": {
                "min": 0,
                "max": 8,
            }
        },
    }

    # Pre-bind both handles so the cleanup never hits an unbound name (and
    # masks the real failure) when an early step raises.
    agent_process = None
    sweep_proc = None
    try:
        agent_process = run_cmd_async(
            f"wandb launch-agent -q {QUEUE} -e {ENTITY} -c tests/release_tests/test_launch/local-agent-config.yml"
        )

        # Close the file explicitly so the config is fully flushed to disk
        # before the launch-sweep command reads it.
        with open(sweep_config_path, "w", encoding="utf-8") as config_file:
            yaml.dump(sweep_config, config_file)

        sweep_proc = run_cmd_async(
            f"wandb launch-sweep {sweep_config_path} -e {ENTITY} -p {PROJECT} -q {QUEUE}"
        )

        # Poll process.stdout to show stdout live
        sweep_id = get_sweep_id_from_proc(sweep_proc)
        assert sweep_id

        # poll on sweep scheduler run
        scheduler_run = wait_for_run_completion(f"{ENTITY}/{PROJECT}/{sweep_id}")
        assert scheduler_run

        api = Api()
        sweep: Sweep = api.sweep(f"{ENTITY}/{PROJECT}/{sweep_id}")
        sweep.load(force=True)

        assert len(sweep.runs) == run_cap
        for sweep_run in sweep.runs:
            assert sweep_run.config["param1"] in range(0, 8 + 1)
            assert sweep_run.state == "finished"

            summary = sweep_run.summary
            history = sweep_run.history(pandas=False)

            assert summary["time_elapsed"]
            assert summary["avg"]
            assert len(history) == 50
            assert history[-1]["steps"] == 49
    finally:
        # Kill whichever processes were actually started, agent first.
        for started in (agent_process, sweep_proc):
            if started is not None:
                started.kill()
157