milvus-io_bootcamp
import numpy as np
import ragas
import datasets


# 1. Define function to create a RAGAS dataset.
def assemble_ragas_dataset(input_df):
    """Assemble a RAGAS HuggingFace Dataset from an input pandas df."""

    # Assemble the Ragas lists: questions, ground truth answers, retrieval contexts, and RAG answers.
    # Get all the questions.
    question_list = input_df.Question.to_list()

    # Get all the ground truth answers.
    truth_list = input_df.ground_truth_answer.to_list()

    # Get all the Milvus retrieval contexts, shaped as list[list[str]].
    context_list = input_df.Custom_RAG_context.to_list()
    context_list = [[context] for context in context_list]

    # Get all the RAG answers generated from the contexts.
    rag_answer_list = input_df.Custom_RAG_answer.to_list()

    # Create a HuggingFace Dataset from the assembled lists.
    ragas_ds = datasets.Dataset.from_dict({
        "question": question_list,
        "contexts": context_list,
        "answer": rag_answer_list,
        "ground_truth": truth_list,
    })
    return ragas_ds
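
# A minimal usage sketch for assemble_ragas_dataset (illustrative only: the
# sample row below is made up, but the column names match the schema the
# function expects):
#
#   import pandas as pd
#   sample_df = pd.DataFrame({
#       "Question": ["What is Milvus?"],
#       "ground_truth_answer": ["Milvus is an open-source vector database."],
#       "Custom_RAG_context": ["Milvus is a vector database built for scalable similarity search."],
#       "Custom_RAG_answer": ["Milvus is an open-source vector database."],
#   })
#   ragas_ds = assemble_ragas_dataset(sample_df)
#   # -> Dataset with columns: question, contexts, answer, ground_truth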
# 2. Define function to evaluate RAGAS model.
def evaluate_ragas_model(pandas_eval_df,
                         ragas_eval_metrics,
                         llm_to_evaluate,
                         chunking_to_evaluate=None,
                         what_to_evaluate=None):
    """Evaluate the RAGAS model using the input pandas df."""

    # Replace the Custom_RAG_answer with the answers from the LLM under evaluation.
    temp_df = pandas_eval_df.copy()
    if llm_to_evaluate != 'Custom_RAG_answer':
        temp_df['Custom_RAG_answer'] = temp_df[llm_to_evaluate]

    # Replace the Custom_RAG_context with the chunks under evaluation.
    # Skip when chunking_to_evaluate is None (the default), which would
    # otherwise raise a KeyError on temp_df[None].
    if chunking_to_evaluate is not None and chunking_to_evaluate != 'Custom_RAG_context':
        temp_df['Custom_RAG_context'] = temp_df[chunking_to_evaluate]

    # Assemble the RAGAS dataset.
    ragas_eval_ds = assemble_ragas_dataset(temp_df)
    # Run the RAGAS evaluation with the requested metrics.
    ragas_results = ragas.evaluate(ragas_eval_ds, metrics=ragas_eval_metrics)

    # Convert the evaluations to a pandas df, filling missing scores with 0.0.
    ragas_output_df = ragas_results.to_pandas()
    temp = ragas_output_df.fillna(0.0)
    score = -1.0
    if what_to_evaluate == "CONTEXTS":
        print(f"Chunking to evaluate: {chunking_to_evaluate}")
        # Context F1 is the harmonic mean of context precision and recall.
        # Guard against a zero denominator (both metrics 0 after fillna),
        # which would otherwise produce NaN rows and skew the mean.
        denominator = temp.context_precision + temp.context_recall
        temp['context_f1'] = np.where(
            denominator > 0.0,
            2.0 * temp.context_precision * temp.context_recall / denominator,
            0.0)
        # Calculate the average retrieval score.
        avg_retrieval_f1 = np.round(temp.context_f1.mean(), 2)
        score = avg_retrieval_f1
    elif what_to_evaluate == "ANSWERS":
        print(f"LLM to evaluate: {llm_to_evaluate}")
        # Average the three answer metrics; each is a float between 0 and 1.
        temp['avg_answer_score'] = (temp.answer_relevancy
                                    + temp.answer_similarity
                                    + temp.answer_correctness) / 3.0
        avg_answer_score = np.round(temp.avg_answer_score.mean(), 4)
        score = avg_answer_score
    return temp, score
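
# A hedged end-to-end sketch. Assumes the ragas 0.1.x metrics API, where metric
# objects are importable from ragas.metrics, and that an LLM backend (e.g. an
# OpenAI API key) is configured, since ragas.evaluate calls an LLM to score
# results. `eval_df` and the column name 'OpenAI_RAG_answer' are hypothetical.
#
#   from ragas.metrics import (context_precision, context_recall,
#                              answer_relevancy, answer_similarity,
#                              answer_correctness)
#
#   # Score the retrieval contexts produced by one chunking strategy.
#   context_df, retrieval_f1 = evaluate_ragas_model(
#       eval_df,
#       [context_precision, context_recall],
#       llm_to_evaluate='Custom_RAG_answer',
#       chunking_to_evaluate='Custom_RAG_context',
#       what_to_evaluate="CONTEXTS")
#
#   # Score the answers produced by a different LLM's answer column.
#   answer_df, answer_score = evaluate_ragas_model(
#       eval_df,
#       [answer_relevancy, answer_similarity, answer_correctness],
#       llm_to_evaluate='OpenAI_RAG_answer',
#       what_to_evaluate="ANSWERS")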