MemGPT
import argparse
import json
import re

from openai import OpenAI
from tqdm import tqdm

from memgpt.credentials import MemGPTCredentials

# Note: did not end up using since no cases of cheating were observed
# CHEATING_PROMPT = \
# """
# You are evaluating an LLM trying to do a task.
# The main point of failure is that the LLM may cheat and hallucinate the document it is meant to provide as evidence.
# For example, even though the LLM has been instructed to provide the document that it's using as evidence to answer the question, in this case the LLM hallucinates a document:
#
# The format of the LLM response will be:
# ANSWER: <answer>, DOCUMENT: <document_text>
#
# We also provide the list of documents that the LLM should have used to answer the question.
# This is provided with the format:
# ALL DOCUMENTS: <list of documents>
#
# Your task is to ensure the LLM did not cheat and hallucinate the document.
# You can ensure this by checking if the document provided by the LLM is in the list of documents provided.
#
# Write a short reason for your answer before stating the final answer (format <your reason> - CHEAT/CORRECT).
# """

EVAL_PROMPT = """
Your task is to evaluate whether an LLM correctly answered a question.
The LLM response should be in the format 'ANSWER: <answer>, DOCUMENT: <document_text>' or say 'INSUFFICIENT INFORMATION'.
The true answer is provided in the format 'TRUE ANSWER: <list of possible answers>'.
The question is provided in the format 'QUESTION: <question>'.
If the LLM response contains both the correct answer and the corresponding document text, the response is correct.
Even if the LLM's answer and the true answer are slightly different in wording, the response is still correct.
For example, if the answer is more specific than the true answer or uses a different phrasing that is still correct, the response is correct.
If the LLM response is 'INSUFFICIENT INFORMATION', or the 'DOCUMENT' field is missing, the response is incorrect.
Respond with a single token: 'CORRECT' or 'INCORRECT'.
"""

EVAL_MODEL = "gpt-4-0613"


def evaluate_response(output: str):
    """Ask the judge model whether the LLM response answered the question correctly."""
    credentials = MemGPTCredentials().load()
    assert credentials.openai_key is not None, credentials.openai_key

    client = OpenAI(api_key=credentials.openai_key)

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": "\n".join([EVAL_PROMPT, "\n", output, "\n"]),
            },
        ],
        model=EVAL_MODEL,
    )

    response = chat_completion.choices[0].message.content
    print("llm judge", response)
    # check "INCORRECT" first, since "CORRECT" is a substring of it
    if "INCORRECT" in response:
        return False
    elif "CORRECT" in response:
        return True
    else:
        print("INVALID RESPONSE", response)
        return False


# Grab the last thing MemGPT generated, treat it as the reply
def extract_final_memgpt_response(memgpt_responses: list) -> str:
    final_index = -1
    if "function_return" in memgpt_responses[final_index]:
        final_index = -2
    final_memgpt_response = [v for k, v in memgpt_responses[final_index].items()]
    final_memgpt_response = final_memgpt_response[-1]
    return final_memgpt_response

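# A sketch of the assumed input, inferred from how the function above indexes it:
# `memgpt_responses` is a list of per-message dicts from a MemGPT run, e.g. something like
#   [..., {"internal_monologue": "..."},
#        {"assistant_message": "ANSWER: ..., DOCUMENT: ..."},
#        {"function_return": "...", "status": "success"}]
# If the final entry is a function return, the reply is taken from the entry before it.
# The exact keys shown here are an assumption and may differ across MemGPT versions.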

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Test script")
    parser.add_argument("--file", type=str, help="File data to evaluate")
    parser.add_argument("--baseline", action="store_true", help="Whether to use the baseline model")
    args = parser.parse_args()
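
    # Example invocations (script and file names here are assumptions, shown only to
    # illustrate the filename patterns parsed below):
    #   python llm_judge.py --file model_gpt-4_num_docs_10.json --baseline
    #   python llm_judge.py --file model_gpt-4.json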

    # load data
    data = json.load(open(args.file))

    # counters
    correct = 0
    total = 0

    # Make an initial pass to determine how many documents had the correct answer
    results = []  # store all results
    eval_results = []  # store results that need LLM judge
    if args.baseline:
        # baseline experiment
        match = re.search(r"model_([^_]+)_num_docs_([^\.]+)\.json", args.file)
        model = match.group(1)
        num_docs = int(match.group(2))
        baseline = "baseline"
    else:
        # model = re.search(r"model_([^\.]+)\.json", args.file).group(1)
        model = re.search(r"model_([-\w.]+)(?:_num_docs_([-\d]+))?\.json", args.file).group(1)

        num_docs = None
        baseline = "memgpt"

    # evaluate data
    for d in tqdm(data):
        answer = d["true_answers"]
        question = d["question"]
        response = d["memgpt_responses"]
        if not args.baseline:
            # need to parse response for memgpt
            response = extract_final_memgpt_response(response)
        else:
            response = response["response"]

        found = False
        for a in answer:
            if a in response:
                found = True

        if not found and "INSUFFICIENT INFORMATION" not in response:
            # inconclusive: pass to llm judge
            print(question)
            print(answer)
            print(response)
            print(args.baseline)
            doc = "QUESTION: " + question + "\n" + "TRUE ANSWER: " + str(answer) + "\n" + response
            judge = "llm"
            judge_result = evaluate_response(doc)
            print("JUDGEMENT", judge_result)
            if judge_result:
                correct += 1
                found = True
        elif found:
            # answer found in text
            correct += 1
            judge = "text"
        else:
            judge = "text"

        results.append({"question": question, "true_answers": answer, "response": response, "correct": found, "judge": judge})

        total += 1

    # Dump aggregated results
    json.dump(
        {"accuracy": correct / total, "total": total, "results": results},
        open(f"results_{model}_{num_docs}_{baseline}.json", "w"),
        indent=4,
    )
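    # Note: for a MemGPT run, num_docs is None, so the output lands in a file named like
    # "results_gpt-4_None_memgpt.json" (the model name here is illustrative only).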
    print(correct / total)