milvus-io_bootcamp
447 lines · 12.9 KB
1{
2"cells": [
3{
4"cell_type": "markdown",
5"source": [
6"# Use Ragas to evaluate the OpenAI Assistant\n",
7"\n",
"**Please note that this test consumes a large number of OpenAI API tokens. Please read it carefully and pay attention to how many requests you make.**"
9],
10"metadata": {
11"collapsed": false,
12"pycharm": {
13"name": "#%% md\n"
14}
15}
16},
17{
18"cell_type": "markdown",
19"source": [
20"## 1. Prepare environment and data\n",
21"\n",
22"Before starting, you must set OPENAI_API_KEY in your environment variables."
23],
24"metadata": {
25"collapsed": false,
26"pycharm": {
27"name": "#%% md\n"
28}
29}
30},
31{
32"cell_type": "markdown",
33"source": [
34"Install pip dependencies"
35],
36"metadata": {
37"collapsed": false,
38"pycharm": {
39"name": "#%% md\n"
40}
41}
42},
43{
44"cell_type": "code",
45"execution_count": null,
46"outputs": [],
47"source": [
48"# ! python -m pip install openai beir pandas ragas==0.0.17"
49],
50"metadata": {
51"collapsed": false,
52"pycharm": {
53"name": "#%%\n"
54}
55}
56},
57{
58"cell_type": "markdown",
59"source": [
"Download the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/) data if it does not already exist locally. We convert it into a ragas-friendly form, following this [script](https://github.com/explodinggradients/ragas/blob/main/experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb)."
61],
62"metadata": {
63"collapsed": false,
64"pycharm": {
65"name": "#%% md\n"
66}
67}
68},
69{
70"cell_type": "code",
71"execution_count": 1,
72"outputs": [
73{
74"name": "stdout",
75"output_type": "stream",
76"text": [
77"1706\n"
78]
79}
80],
81"source": [
82"import json\n",
83"import pandas as pd\n",
84"import os\n",
85"from tqdm import tqdm\n",
86"from datasets import Dataset\n",
87"from beir import util\n",
88"\n",
89"\n",
"def prepare_fiqa_without_answer(knowledge_path):\n",
"    \"\"\"Download the BEIR fiqa dataset (if needed) and build per-split frames.\n",
"\n",
"    Returns a dict mapping split name ('dev'/'test'/'train') to a DataFrame\n",
"    with columns 'question' and 'ground_truths' (list of relevant passages).\n",
"    \"\"\"\n",
"    dataset_name = \"fiqa\"\n",
"\n",
"    # Download and unzip only when the archive is not already present.\n",
"    if not os.path.exists(os.path.join(knowledge_path, f'{dataset_name}.zip')):\n",
"        url = (\n",
"            \"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip\".format(\n",
"                dataset_name\n",
"            )\n",
"        )\n",
"        util.download_and_unzip(url, knowledge_path)\n",
"\n",
"    data_path = os.path.join(knowledge_path, 'fiqa')\n",
"    # Corpus: one JSON object per line with '_id', 'title', 'text', 'metadata'.\n",
"    with open(os.path.join(data_path, \"corpus.jsonl\")) as f:\n",
"        cs = [pd.Series(json.loads(l)) for l in f.readlines()]\n",
"\n",
"    corpus_df = pd.DataFrame(cs)\n",
"    corpus_df = corpus_df.rename(columns={\"_id\": \"corpus-id\", \"text\": \"ground_truth\"})\n",
"    corpus_df = corpus_df.drop(columns=[\"title\", \"metadata\"])\n",
"    corpus_df[\"corpus-id\"] = corpus_df[\"corpus-id\"].astype(int)\n",
"\n",
"    # Queries: one JSON object per line with '_id', 'text', 'metadata'.\n",
"    with open(os.path.join(data_path, \"queries.jsonl\")) as f:\n",
"        qs = [pd.Series(json.loads(l)) for l in f.readlines()]\n",
"\n",
"    queries_df = pd.DataFrame(qs)\n",
"    queries_df = queries_df.rename(columns={\"_id\": \"query-id\", \"text\": \"question\"})\n",
"    queries_df = queries_df.drop(columns=[\"metadata\"])\n",
"    queries_df[\"query-id\"] = queries_df[\"query-id\"].astype(int)\n",
"\n",
"    # qrels map query-id -> relevant corpus-id per split; 'score' is unused.\n",
"    splits = [\"dev\", \"test\", \"train\"]\n",
"    split_df = {}\n",
"    for s in splits:\n",
"        split_df[s] = pd.read_csv(os.path.join(data_path, f\"qrels/{s}.tsv\"), sep=\"\\t\").drop(\n",
"            columns=[\"score\"]\n",
"        )\n",
"\n",
"    final_split_df = {}\n",
"    for split in split_df:\n",
"        df = queries_df.merge(split_df[split], on=\"query-id\")\n",
"        df = df.merge(corpus_df, on=\"corpus-id\")\n",
"        df = df.drop(columns=[\"corpus-id\"])\n",
"        # Every row of a query-id group shares the same question text, so take\n",
"        # the first one deterministically (the original used a random .sample()).\n",
"        grouped = df.groupby(\"query-id\").apply(\n",
"            lambda x: pd.Series(\n",
"                {\n",
"                    \"question\": x[\"question\"].iloc[0],\n",
"                    \"ground_truths\": x[\"ground_truth\"].tolist(),\n",
"                }\n",
"            )\n",
"        )\n",
"\n",
"        grouped = grouped.reset_index()\n",
"        grouped = grouped.drop(columns=\"query-id\")\n",
"        final_split_df[split] = grouped\n",
"\n",
"    return final_split_df\n",
147"\n",
148"\n",
"# Paths for the downloaded knowledge data and the flattened document file.\n",
"knowledge_datas_path = './knowledge_datas'\n",
"fiqa_path = os.path.join(knowledge_datas_path, 'fiqa_doc.txt')\n",
"\n",
"# makedirs(exist_ok=True) keeps the cell idempotent across re-runs\n",
"# (os.mkdir raised FileExistsError on a second run).\n",
"os.makedirs(knowledge_datas_path, exist_ok=True)\n",
"contexts_list = []\n",
"answer_list = []\n",
"\n",
"final_split_df = prepare_fiqa_without_answer(knowledge_datas_path)\n",
"\n",
"# Flatten every ground-truth passage of the test split into one document list.\n",
"split = 'test'\n",
"docs = []\n",
"for ds in final_split_df[split][\"ground_truths\"]:\n",
"    docs.extend(ds)\n",
"print(len(docs))\n",
"\n",
"# Write all passages to a single newline-separated knowledge file.\n",
"docs_str = '\\n'.join(docs)\n",
"with open(fiqa_path, 'w') as f:\n",
"    f.write(docs_str)\n",
"\n",
"question_list = final_split_df[split][\"question\"].to_list()\n",
"ground_truth_list = final_split_df[split][\"ground_truths\"].to_list()"
173],
174"metadata": {
175"collapsed": false,
176"pycharm": {
177"name": "#%%\n"
178}
179}
180},
181{
182"cell_type": "markdown",
183"source": [
184"Now we have the question list and the ground truth list. And the knowledge documents are prepared in `fiqa_path`.\n"
185],
186"metadata": {
187"collapsed": false,
188"pycharm": {
189"name": "#%% md\n"
190}
191}
192},
193{
194"cell_type": "markdown",
195"source": [
196"## 2. Building RAG using OpenAI assistant\n",
197"\n",
198"To get the context content from the annotations returned by Open AI."
199],
200"metadata": {
201"collapsed": false,
202"pycharm": {
203"name": "#%% md\n"
204}
205}
206},
207{
208"cell_type": "code",
209"execution_count": null,
210"outputs": [],
211"source": [
212"import time\n",
213"from openai import OpenAI\n",
214"\n",
"client = OpenAI()\n",
"\n",
"# OPENAI_API_KEY must be set in your environment variables; OpenAI() reads\n",
"# it automatically, and we assign it explicitly here as well.\n",
"client.api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"\n",
"class OpenAITimeoutException(Exception):\n",
"    \"\"\"Raised when waiting for an assistant run exceeds the timeout.\"\"\"\n",
"    pass\n",
223"\n",
224"\n",
"def get_content_from_retrieved_message(message):\n",
"    \"\"\"Strip citation markers from an assistant message and collect quotes.\n",
"\n",
"    Returns a tuple (answer_text, contexts), where contexts is the list of\n",
"    cited file passages, or ['empty context.'] when nothing was cited.\n",
"    \"\"\"\n",
"    message_content = message.content[0].text\n",
"    annotations = message_content.annotations\n",
"    contexts = []\n",
"    for annotation in annotations:\n",
"        # Remove the inline citation marker from the answer text.\n",
"        message_content.value = message_content.value.replace(annotation.text, '')\n",
"        if (file_citation := getattr(annotation, 'file_citation', None)):\n",
"            contexts.append(file_citation.quote)\n",
"    if len(contexts) == 0:\n",
"        contexts = ['empty context.']\n",
"    return message_content.value, contexts\n",
237"\n",
238"\n",
"def try_get_answer_contexts(assistant_id, question, timeout_seconds=120):\n",
"    \"\"\"Ask the assistant one question and return (contexts, answer).\n",
"\n",
"    Creates a fresh thread, starts a run, polls until it completes, extracts\n",
"    the answer and cited contexts, then deletes the thread.\n",
"\n",
"    Raises OpenAITimeoutException when the run does not finish within\n",
"    timeout_seconds, so callers can retry specifically on timeout.\n",
"    \"\"\"\n",
"    thread = client.beta.threads.create(\n",
"        messages=[\n",
"            {\n",
"                \"role\": \"user\",\n",
"                \"content\": question,\n",
"            }\n",
"        ]\n",
"    )\n",
"    thread_id = thread.id\n",
"    run = client.beta.threads.runs.create(\n",
"        thread_id=thread_id,\n",
"        assistant_id=assistant_id,\n",
"    )\n",
"    start_time = time.time()\n",
"    while True:\n",
"        elapsed_time = time.time() - start_time\n",
"        if elapsed_time > timeout_seconds:\n",
"            # Raise the dedicated exception type so the retry wrapper can\n",
"            # distinguish timeouts from other failures (a bare Exception here\n",
"            # was never caught by the retry loop's except clause).\n",
"            raise OpenAITimeoutException(\"OpenAI retrieving answer Timeout!\")\n",
"\n",
"        run = client.beta.threads.runs.retrieve(\n",
"            thread_id=thread_id,\n",
"            run_id=run.id\n",
"        )\n",
"        if run.status == 'completed':\n",
"            break\n",
"        # Brief pause between polls to avoid hammering the API.\n",
"        time.sleep(1)\n",
"    messages = client.beta.threads.messages.list(\n",
"        thread_id=thread_id\n",
"    )\n",
"    assert len(messages.data) > 1\n",
"    # messages.data[0] is the most recent message, i.e. the assistant's answer.\n",
"    res, contexts = get_content_from_retrieved_message(messages.data[0])\n",
"    response = client.beta.threads.delete(thread_id)\n",
"    assert response.deleted is True\n",
"    return contexts, res\n",
273"\n",
274"\n",
"def get_answer_contexts_from_assistant(question, assistant_id, timeout_seconds=120, retry_num=6):\n",
"    \"\"\"Query the assistant, retrying up to retry_num times on timeout.\n",
"\n",
"    Returns (answer, contexts); on persistent failure both hold a\n",
"    'failed. please retry.' placeholder so the evaluation loop can proceed.\n",
"    \"\"\"\n",
"    res = 'failed. please retry.'\n",
"    contexts = ['failed. please retry.']\n",
"    try:\n",
"        for _ in range(retry_num):\n",
"            try:\n",
"                contexts, res = try_get_answer_contexts(assistant_id, question, timeout_seconds)\n",
"                break\n",
"            except OpenAITimeoutException:\n",
"                print('OpenAI retrieving answer Timeout, retry...')\n",
"                continue\n",
"    except Exception as e:\n",
"        # Best-effort: report unexpected errors but keep the placeholder\n",
"        # result so one failed question does not abort the whole run.\n",
"        print(e)\n",
"    return res, contexts"
289],
290"metadata": {
291"collapsed": false,
292"pycharm": {
293"name": "#%%\n"
294}
295}
296},
297{
298"cell_type": "markdown",
299"source": [
300"Build assistant and upload knowledge files."
301],
302"metadata": {
303"collapsed": false,
304"pycharm": {
305"name": "#%% md\n"
306}
307}
308},
309{
310"cell_type": "code",
311"execution_count": 3,
312"outputs": [
313{
314"name": "stderr",
315"output_type": "stream",
316"text": [
317" 0%| | 0/648 [03:45<?, ?it/s]\n",
318"\n",
319"KeyboardInterrupt\n",
320"\n"
321]
322}
323],
324"source": [
"# Upload the knowledge file; 'with' closes the handle promptly (the original\n",
"# open() leaked the file object).\n",
"with open(fiqa_path, \"rb\") as knowledge_file:\n",
"    file = client.files.create(\n",
"        file=knowledge_file,\n",
"        purpose='assistants'\n",
"    )\n",
"\n",
"# Create an assistant with the retrieval tool attached to the uploaded file.\n",
"assistant = client.beta.assistants.create(\n",
"    instructions=\"You are a customer support chatbot. You must use your retrieval tool to retrieve relevant knowledge to best respond to customer queries.\",\n",
"    model=\"gpt-4-1106-preview\",\n",
"    tools=[{\"type\": \"retrieval\"}],\n",
"    file_ids=[file.id]\n",
")"
337],
338"metadata": {
339"collapsed": false,
340"pycharm": {
341"name": "#%%\n"
342}
343}
344},
345{
346"cell_type": "markdown",
347"source": [
348"## 3. Start Ragas Evaluation\n",
349"\n",
"Note that this consumes a large number of OpenAI API tokens: every question asked and every evaluation calls the OpenAI service. Please watch your token consumption. If you only want to run a small number of tests, you can modify the code to reduce the test size."
351],
352"metadata": {
353"collapsed": false,
354"pycharm": {
355"name": "#%% md\n"
356}
357}
358},
359{
360"cell_type": "code",
361"execution_count": null,
362"outputs": [],
363"source": [
"# Ask the assistant every question in turn; each iteration costs API tokens.\n",
"for question in tqdm(question_list):\n",
"    answer, contexts = get_answer_contexts_from_assistant(question, assistant.id)\n",
"    answer_list.append(answer)\n",
"    contexts_list.append(contexts)"
371],
372"metadata": {
373"collapsed": false,
374"pycharm": {
375"name": "#%%\n"
376}
377}
378},
379{
380"cell_type": "markdown",
381"source": [
382"You can choose the indicators you care about to test.\n"
383],
384"metadata": {
385"collapsed": false,
386"pycharm": {
387"name": "#%% md\n"
388}
389}
390},
391{
392"cell_type": "code",
393"execution_count": null,
394"outputs": [],
395"source": [
"from ragas import evaluate\n",
"from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision, answer_similarity\n",
"\n",
"# Assemble the evaluation dataset in the shape ragas expects.\n",
"ds = Dataset.from_dict({\"question\": question_list,\n",
"                        \"contexts\": contexts_list,\n",
"                        \"answer\": answer_list,\n",
"                        \"ground_truths\": ground_truth_list})\n",
"\n",
"# Uncomment additional metrics as needed; each one costs extra API calls.\n",
"result = evaluate(\n",
"    ds,\n",
"    metrics=[\n",
"        context_precision,\n",
"        # context_recall,\n",
"        # faithfulness,\n",
"        # answer_relevancy,\n",
"        # answer_similarity,\n",
"    ],\n",
")\n",
"print(result)"
417],
418"metadata": {
419"collapsed": false,
420"pycharm": {
421"name": "#%%\n"
422}
423}
424}
425],
426"metadata": {
427"kernelspec": {
428"display_name": "Python 3",
429"language": "python",
430"name": "python3"
431},
432"language_info": {
433"codemirror_mode": {
434"name": "ipython",
435"version": 2
436},
437"file_extension": ".py",
438"mimetype": "text/x-python",
439"name": "python",
440"nbconvert_exporter": "python",
441"pygments_lexer": "ipython2",
442"version": "2.7.6"
443}
444},
445"nbformat": 4,
446"nbformat_minor": 0
447}