milvus-io_bootcamp
import numpy as np
import ragas
import datasets


# 1. Define function to create a RAGAS dataset.
def assemble_ragas_dataset(input_df):
    """Assemble a RAGAS HuggingFace Dataset from an input pandas df."""

    # Assemble the Ragas lists: questions, ground truth answers, retrieval contexts, and RAG answers.
    # Get all the questions.
    question_list = input_df.Question.to_list()

    # Get all the ground truth answers.
    truth_list = input_df.ground_truth_answer.to_list()

    # Get all the Milvus retrieval contexts, shaped as list[list[str]].
    context_list = input_df.Custom_RAG_context.to_list()
    context_list = [[context] for context in context_list]

    # Get all the RAG answers generated from the contexts.
    rag_answer_list = input_df.Custom_RAG_answer.to_list()

    # Create a HuggingFace Dataset from the assembled lists.
    ragas_ds = datasets.Dataset.from_dict({
        "question": question_list,
        "contexts": context_list,
        "answer": rag_answer_list,
        "ground_truth": truth_list,
    })
    return ragas_ds
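
# A minimal usage sketch for assemble_ragas_dataset (illustrative only: the
# sample row below is made up, but the column names match the schema the
# function expects):
#
#   import pandas as pd
#   sample_df = pd.DataFrame({
#       "Question": ["What is Milvus?"],
#       "ground_truth_answer": ["Milvus is an open-source vector database."],
#       "Custom_RAG_context": ["Milvus is a vector database built for scalable similarity search."],
#       "Custom_RAG_answer": ["Milvus is an open-source vector database."],
#   })
#   ragas_ds = assemble_ragas_dataset(sample_df)
#   # -> Dataset with columns: question, contexts, answer, ground_truth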
# 2. Define function to evaluate RAGAS model.
def evaluate_ragas_model(pandas_eval_df,
                         ragas_eval_metrics,
                         llm_to_evaluate,
                         chunking_to_evaluate=None,
                         what_to_evaluate=None):
    """Evaluate the RAGAS model using the input pandas df."""

    # Replace the Custom_RAG_answer with the answers from the LLM under evaluation.
    temp_df = pandas_eval_df.copy()
    if llm_to_evaluate != 'Custom_RAG_answer':
        temp_df['Custom_RAG_answer'] = temp_df[llm_to_evaluate]

    # Replace the Custom_RAG_context with the chunks under evaluation.
    # Skip when chunking_to_evaluate is None (the default), which would
    # otherwise raise a KeyError on temp_df[None].
    if chunking_to_evaluate is not None and chunking_to_evaluate != 'Custom_RAG_context':
        temp_df['Custom_RAG_context'] = temp_df[chunking_to_evaluate]

    # Assemble the RAGAS dataset.
    ragas_eval_ds = assemble_ragas_dataset(temp_df)
    # Run the RAGAS evaluation with the requested metrics.
    ragas_results = ragas.evaluate(ragas_eval_ds, metrics=ragas_eval_metrics)

    # Convert the evaluations to a pandas df, filling missing scores with 0.0.
    ragas_output_df = ragas_results.to_pandas()
    temp = ragas_output_df.fillna(0.0)
    score = -1.0
    if what_to_evaluate == "CONTEXTS":
        print(f"Chunking to evaluate: {chunking_to_evaluate}")
        # Context F1 is the harmonic mean of context precision and recall.
        # Guard against a zero denominator (both metrics 0 after fillna),
        # which would otherwise produce NaN rows and skew the mean.
        denominator = temp.context_precision + temp.context_recall
        temp['context_f1'] = np.where(
            denominator > 0.0,
            2.0 * temp.context_precision * temp.context_recall / denominator,
            0.0)
        # Calculate the average retrieval score.
        avg_retrieval_f1 = np.round(temp.context_f1.mean(), 2)
        score = avg_retrieval_f1
    elif what_to_evaluate == "ANSWERS":
        print(f"LLM to evaluate: {llm_to_evaluate}")
        # Average the three answer metrics; each is a float between 0 and 1.
        temp['avg_answer_score'] = (temp.answer_relevancy
                                    + temp.answer_similarity
                                    + temp.answer_correctness) / 3.0
        avg_answer_score = np.round(temp.avg_answer_score.mean(), 4)
        score = avg_answer_score
    return temp, score
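
# A hedged end-to-end sketch. Assumes the ragas 0.1.x metrics API, where metric
# objects are importable from ragas.metrics, and that an LLM backend (e.g. an
# OpenAI API key) is configured, since ragas.evaluate calls an LLM to score
# results. `eval_df` and the column name 'OpenAI_RAG_answer' are hypothetical.
#
#   from ragas.metrics import (context_precision, context_recall,
#                              answer_relevancy, answer_similarity,
#                              answer_correctness)
#
#   # Score the retrieval contexts produced by one chunking strategy.
#   context_df, retrieval_f1 = evaluate_ragas_model(
#       eval_df,
#       [context_precision, context_recall],
#       llm_to_evaluate='Custom_RAG_answer',
#       chunking_to_evaluate='Custom_RAG_context',
#       what_to_evaluate="CONTEXTS")
#
#   # Score the answers produced by a different LLM's answer column.
#   answer_df, answer_score = evaluate_ragas_model(
#       eval_df,
#       [answer_relevancy, answer_similarity, answer_correctness],
#       llm_to_evaluate='OpenAI_RAG_answer',
#       what_to_evaluate="ANSWERS")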