import datetime

import pytest
import requests

URL_BENCHMARK = "http://localhost:8080/ap/v1"
URL_AGENT = "http://localhost:8000/ap/v1"
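

# Each parametrized case drives the full benchmark workflow against a locally
# running benchmark (port 8080) and agent (port 8000): create a task, execute a
# step, then request an evaluation and check the reported metrics.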
@pytest.mark.parametrize(
    "eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
    [
        # trailing values per case: expected_artifact_length, test_name, should_be_successful (assumed)
        (
            "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
            "Write the word 'Washington' to a .txt file",
            0, "WriteFile", True,
        ),
        (
            "f219f3d3-a41b-45a9-a3d0-389832086ee8",
            "Read the file called file_to_read.txt and write its content to a file called output.txt",
            1, "ReadFile", False,
        ),
    ],
)
def test_entire_workflow(
    eval_id, input_text, expected_artifact_length, test_name, should_be_successful
):
    task_request = {"eval_id": eval_id, "input": input_text}
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_before = response.json()["pagination"]["total_items"]
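
    # Creating the task through the benchmark should also register it with the
    # agent: the task count reported by the agent grows by exactly one.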
    task_response_benchmark = requests.post(
        URL_BENCHMARK + "/agent/tasks", json=task_request
    )
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_after = response.json()["pagination"]["total_items"]
    assert task_count_after == task_count_before + 1

    timestamp_after_task_eval_created = datetime.datetime.now(datetime.timezone.utc)

    assert task_response_benchmark.status_code == 200
    task_response_benchmark = task_response_benchmark.json()
    assert task_response_benchmark["input"] == input_text

    task_response_benchmark_id = task_response_benchmark["task_id"]

    # The agent should report the task with the expected number of input artifacts.
    response_task_agent = requests.get(
        f"{URL_AGENT}/agent/tasks/{task_response_benchmark_id}"
    )
    assert response_task_agent.status_code == 200
    response_task_agent = response_task_agent.json()
    assert len(response_task_agent["artifacts"]) == expected_artifact_length

    step_request = {"input": input_text}
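
    # Execute a single step for the task; these cases are expected to finish in one step.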
    step_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/steps",
        json=step_request,
    )
    assert step_response.status_code == 200
    step_response = step_response.json()
    assert step_response["is_last"] == True

    # Ask the benchmark to evaluate the task and check the reported results.
    eval_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/evaluations",
        json={},  # assumed: the evaluation request needs no body here
    )
    assert eval_response.status_code == 200
    eval_response = eval_response.json()
    print("eval_response", eval_response)

    assert eval_response["run_details"]["test_name"] == test_name
    assert eval_response["metrics"]["success"] == should_be_successful

    # The benchmark run must have started before the task was created.
    benchmark_start_time = datetime.datetime.fromisoformat(
        eval_response["run_details"]["benchmark_start_time"]
    )
    assert benchmark_start_time < timestamp_after_task_eval_created