7
"""A simple wrapper around the OpenAI API for evaluating GPT models."""
9
def __init__(self, model_version="gpt-4", temperature=0.9, max_tokens=2048):
    """Read the API key from the environment and store the request settings.

    Args:
        model_version: Model identifier, stored as ``self.model``.
        temperature: Sampling temperature, stored as ``self.temperature``.
        max_tokens: Token limit, stored as ``self.max_tokens``.

    Raises:
        AssertionError: If the ``GPT4_API_KEY`` environment variable is
            unset or empty.
    """
    api_key = os.getenv("GPT4_API_KEY")
    # Raise explicitly rather than using an `assert` statement: asserts are
    # stripped under `python -O`, which would silently store a missing key
    # as None. AssertionError is kept so existing callers see the same
    # exception type.
    if not api_key:
        raise AssertionError("Please set the GPT4_API_KEY environment variable")
    # Reuse the validated value instead of reading the environment twice.
    self.__api_key = api_key
    self.temperature = temperature
    self.max_tokens = max_tokens
    self.model = model_version
17
def evaluate_results(self, prompt, results):
18
"""Ask the model to rank several models' responses to a single prompt.

Removes the "stats" key (if present) from every entry of `results`, in
place, then returns whatever `self.generate` returns for the message list.
The instructions embedded below interpolate `prompt` and `results` and ask
for 'model'/'rank' pairs as the only output.

NOTE(review): this view is missing lines -- the assignment that builds
`gpt_messages` from the two prompt strings is not visible here. Confirm
the message structure against the full file.
"""
19
# The "stats" entries are dropped before `results` is interpolated into the prompt text.
for result in results:
20
result.pop("stats", None)
26
"""You are an assistant tasked with ranking responses in
27
order of quality, creating a leaderboard of all models.
28
The best model has rank 1, the second best has rank 2, etc.
29
You have to assess the quality of the responses, and rank them."""
35
f"""You are given a prompt and a list of responses
36
from several models in Python dictionary format.
37
Specifically, the format of the results is as follows:
39
'model': <model-name>, 'result': <model-output>
41
Your job is to "rank" the responses in order of quality, (not by
42
the order in which they were generated).
44
The prompt is: {prompt}
45
The responses are: {results}
47
Please rank the responses by quality, and return a list of the model
48
names and ranks, i.e produce the following output:
50
'model': <model-name>, 'rank': <model-rank>
52
Only output this format, and nothing else. Your response must
53
be a valid Python dictionary.
54
Think step by step and give me this quality ranking.
59
return self.generate(gpt_messages)
61
def generate(self, messages):
"""Request a completion for `messages` and return the reply text.

The request targets https://api.openai.com/v1/chat/completions with a JSON
payload that includes `self.max_tokens` and `self.temperature`, and a
Bearer authorization header built from the stored API key.

Returns:
    The "content" field of the first choice's message in the JSON response.

Raises:
    RuntimeError: With `resp.reason` in the message. The condition that
        triggers it is on a line not visible here -- TODO confirm.

NOTE(review): this view is missing lines -- the start of the payload dict,
the call that sends the request (presumably an HTTP POST) and the failure
check are not visible. Confirm against the full file.
"""
65
"max_tokens": self.max_tokens,
66
"temperature": self.temperature,
69
"Content-Type": "application/json",
70
"Authorization": f"Bearer {self.__api_key}",
73
url="https://api.openai.com/v1/chat/completions", json=data, headers=headers
77
raise RuntimeError(f"Failed to generate: {resp.reason}")
79
return resp.json()["choices"][0]["message"]["content"]