wandb

Форк
0
/
test_kubernetes.py 
156 строк · 4.6 Кб
1
from typing import Optional
2

3
import pytest
4
import yaml
5
from utils import (
6
    cleanup_deployment,
7
    get_sweep_id_from_proc,
8
    init_agent_in_launch_cluster,
9
    run_cmd,
10
    run_cmd_async,
11
    wait_for_k8s_job_completion,
12
    wait_for_queued_image_job_completion,
13
    wait_for_run_completion,
14
)
15
from wandb.apis.public import Api, Sweep
16
from wandb.sdk.launch._launch_add import launch_add
17

18
# Kubernetes namespace the launch jobs are submitted to (also sent as the
# job's ``resource_args`` namespace below).
NAMESPACE = "wandb-release-testing"
# W&B entity and project that own the job, the queue and the resulting runs.
ENTITY = "launch-release-testing"
PROJECT = "release-testing"
# Launch queue the agent polls and the jobs are pushed to.
QUEUE = "kubernetes-queue"
JOB_NAME = "sample_job:v0"  # simple job that counts to 50

# Launch config attached to every queued job in this module.
LAUNCH_JOB_CONFIG = {
    "resource_args": {"kubernetes": {"namespace": NAMESPACE}},
}
27

28

29
@pytest.mark.timeout(180)
def test_kubernetes_agent_on_local_process():
    """Run the sample job through a launch agent started as a local process.

    Starts ``wandb launch-agent`` asynchronously, queues ``JOB_NAME`` on
    ``QUEUE``, waits for the Kubernetes job and the queued run to complete,
    then checks the run's summary and history (50 rows, last ``steps`` == 49).
    The agent process is killed and the ``artifacts`` directory removed
    afterwards, whether or not the assertions pass.
    """
    # Start launch agent. This happens before the ``try`` so that a failure
    # to spawn it is reported as-is; inside the ``try`` the ``finally`` block
    # would fail on the unbound ``agent_process`` instead.
    agent_process = run_cmd_async(
        f"wandb launch-agent -q {QUEUE} -e {ENTITY} -c tests/release_tests/test_launch/local-agent-config.yml"
    )
    try:
        # Start run
        queued_run = launch_add(
            job=f"{ENTITY}/{PROJECT}/{JOB_NAME}",
            queue_name=QUEUE,
            entity=ENTITY,
            config=LAUNCH_JOB_CONFIG,
        )

        status = wait_for_k8s_job_completion(NAMESPACE, ENTITY, PROJECT, 1)
        completed_run = wait_for_queued_image_job_completion(
            ENTITY, PROJECT, queued_run
        )

        summary = completed_run.summary
        history = completed_run.history(pandas=False)

        assert (
            status == "Succeeded"
        ), "Kubernetes job didn't succeed. Check Kubernetes pods and Docker container output."
        assert summary["time_elapsed"]
        assert summary["avg"]
        assert len(history) == 50
        assert history[-1]["steps"] == 49
    finally:
        agent_process.kill()
        run_cmd("rm -r artifacts")
63

64

65
@pytest.mark.timeout(180)
def test_kubernetes_agent_in_cluster(api_key: str, agent_image: Optional[str]):
    """Run the sample job through a launch agent deployed inside the cluster.

    Args:
        api_key: Forwarded to ``init_agent_in_launch_cluster``.
            NOTE(review): presumably supplied by a pytest fixture - confirm.
        agent_image: Forwarded to ``init_agent_in_launch_cluster``; may be
            ``None``.

    The deployment in ``NAMESPACE`` is cleaned up once the agent has been
    initialised, regardless of the test outcome.
    """
    init_agent_in_launch_cluster(NAMESPACE, api_key, agent_image)
    try:
        # Queue the sample job for the in-cluster agent to pick up.
        job_ref = f"{ENTITY}/{PROJECT}/{JOB_NAME}"
        enqueued = launch_add(
            job=job_ref,
            queue_name=QUEUE,
            entity=ENTITY,
            config=LAUNCH_JOB_CONFIG,
        )

        # Wait on the Kubernetes job first, then on the W&B run it produced.
        k8s_status = wait_for_k8s_job_completion(NAMESPACE, ENTITY, PROJECT, 1)
        finished_run = wait_for_queued_image_job_completion(ENTITY, PROJECT, enqueued)

        run_summary = finished_run.summary
        logged_rows = finished_run.history(pandas=False)

        failure_hint = "Kubernetes job didn't succeed. Check Kubernetes pods and Docker container output."
        assert k8s_status == "Succeeded", failure_hint
        assert run_summary["time_elapsed"]
        assert run_summary["avg"]
        assert len(logged_rows) == 50
        final_row = logged_rows[-1]
        assert final_row["steps"] == 49
    finally:
        # Cleanup
        cleanup_deployment(NAMESPACE)
96

97

98
@pytest.mark.timeout(360)
def test_kubernetes_agent_on_local_process_sweep():
    """Run a launch sweep of the sample job through a locally started agent.

    Starts ``wandb launch-agent`` asynchronously, writes a bayes sweep config
    (``run_cap`` runs, ``param1`` between 0 and 8) to a YAML file, starts
    ``wandb launch-sweep`` on it, waits for the sweep scheduler run, and then
    checks every run of the sweep: parameter range, ``finished`` state,
    summary keys and history (50 rows, last ``steps`` == 49).
    Both spawned processes are killed afterwards.
    """
    run_cap = 4
    sweep_config_path = "tests/release_tests/test_launch/c.yaml"

    # Spawned before the ``try`` so a failure here is reported as-is rather
    # than being replaced by an unbound-name error raised from ``finally``.
    agent_process = run_cmd_async(
        f"wandb launch-agent -q {QUEUE} -e {ENTITY} -c tests/release_tests/test_launch/local-agent-config.yml"
    )
    # Stays None until the sweep process has actually been started, so the
    # cleanup can tell whether there is anything to kill.
    sweep_process = None
    try:
        sweep_config = {
            "job": f"{ENTITY}/{PROJECT}/{JOB_NAME}",
            "project": PROJECT,
            "entity": ENTITY,
            "run_cap": run_cap,
            "method": "bayes",
            "metric": {
                "name": "avg",
                "goal": "maximize",
            },
            "parameters": {
                "param1": {
                    "min": 0,
                    "max": 8,
                }
            },
        }

        # Close (and therefore flush) the file before the CLI reads it.
        with open(sweep_config_path, "w") as config_file:
            yaml.dump(sweep_config, config_file)

        sweep_process = run_cmd_async(
            f"wandb launch-sweep {sweep_config_path} -e {ENTITY} -p {PROJECT} -q {QUEUE}"
        )

        # Poll process.stdout to show stdout live
        sweep_id = get_sweep_id_from_proc(sweep_process)
        assert sweep_id

        # poll on sweep scheduler run
        scheduler_run = wait_for_run_completion(f"{ENTITY}/{PROJECT}/{sweep_id}")
        assert scheduler_run

        api = Api()
        sweep: Sweep = api.sweep(f"{ENTITY}/{PROJECT}/{sweep_id}")
        sweep.load(force=True)

        assert len(sweep.runs) == run_cap
        for sweep_run in sweep.runs:
            assert sweep_run.config["param1"] in range(0, 8 + 1)
            assert sweep_run.state == "finished"

            summary = sweep_run.summary
            history = sweep_run.history(pandas=False)

            assert summary["time_elapsed"]
            assert summary["avg"]
            assert len(history) == 50
            assert history[-1]["steps"] == 49

    finally:
        agent_process.kill()
        if sweep_process is not None:
            sweep_process.kill()
157

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.