ray-llm

evaluation.py
79 строк · 3.0 Кб
Перенос по словам
1
import os
2

3
import requests
4

5

6
class GPT:
    """A simple wrapper around the OpenAI API for evaluating GPT models.

    The API key is read from the ``GPT4_API_KEY`` environment variable when
    the object is constructed.

    Attributes:
        model: Model identifier sent in the ``model`` field of each request.
        temperature: Sampling temperature sent with each request.
        max_tokens: Completion token limit sent with each request.
        timeout: Seconds to wait for the HTTP request, or ``None`` to wait
            indefinitely.
    """

    def __init__(
        self, model_version="gpt-4", temperature=0.9, max_tokens=2048, timeout=600
    ):
        """Store the request settings and load the API key.

        Args:
            model_version: Model identifier to query.
            temperature: Sampling temperature.
            max_tokens: Maximum number of tokens to generate.
            timeout: Timeout in seconds handed to ``requests.post``; ``None``
                disables the timeout.

        Raises:
            AssertionError: If ``GPT4_API_KEY`` is unset or empty.
        """
        api_key = os.getenv("GPT4_API_KEY")
        # Raised explicitly rather than via `assert` so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if not api_key:
            raise AssertionError("Please set the GPT4_API_KEY environment variable")
        self.__api_key = api_key
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model_version
        self.timeout = timeout

    def evaluate_results(self, prompt, results):
        """Evaluate a list of results generated by several models on a single prompt.

        Args:
            prompt: The prompt that every model was asked to answer.
            results: Iterable of dicts, one per model. A ``"stats"`` entry, if
                present, is left out of the text sent for ranking. The
                caller's dicts are not modified.

        Returns:
            The text returned by ``generate`` for the ranking request.
        """
        # Work on stripped copies so the caller keeps its "stats" entries.
        cleaned_results = [
            {key: value for key, value in result.items() if key != "stats"}
            for result in results
        ]

        # NOTE: the prompt text below (indentation and trailing spaces
        # included) is sent verbatim to the model; keep it unchanged.
        gpt_messages = [
            {
                "role": "system",
                "content": (
                    """You are an assistant tasked with ranking responses in 
                    order of quality, creating a leaderboard of all models.
                    The best model has rank 1, the second best has rank 2, etc.
                    You have to assess the quality of the responses, and rank them."""
                ),
            },
            {
                "role": "user",
                "content": (
                    f"""You are given a prompt and a list of responses
                    from several models in Python dictionary format. 
                    Specifically, the format of the results is as follows:
                    
                    'model': <model-name>, 'result': <model-output>
                    
                    Your job is to "rank" the responses in order of quality, (not by
                    the order in which they were generated).
                    
                    The prompt is: {prompt}
                    The responses are: {cleaned_results}
                    
                    Please rank the responses by quality, and return a list of the model
                    names and ranks, i.e produce the following output:
                    
                    'model': <model-name>, 'rank': <model-rank>
                    
                    Only output this format, and nothing else. Your response must
                    be a valid Python dictionary.
                    Think step by step and give me this quality ranking.
                    """
                ),
            },
        ]
        return self.generate(gpt_messages)

    def generate(self, messages):
        """Send chat messages to the chat completions endpoint.

        Args:
            messages: List of ``{"role": ..., "content": ...}`` dicts.

        Returns:
            The ``content`` of the first choice's message in the response.

        Raises:
            RuntimeError: If the HTTP response status is not OK.
            requests.exceptions.RequestException: Propagated from
                ``requests.post`` (connection failures, timeouts).
        """
        data = {
            "model": self.model,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.__api_key}",
        }
        # Without a timeout, requests waits forever on a stalled connection.
        # getattr keeps instances created without __init__ working (no timeout).
        resp = requests.post(
            url="https://api.openai.com/v1/chat/completions",
            json=data,
            headers=headers,
            timeout=getattr(self, "timeout", None),
        )

        if not resp.ok:
            raise RuntimeError(f"Failed to generate: {resp.reason}")

        return resp.json()["choices"][0]["message"]["content"]
80
ray-llm

Использование cookies