llama-index
77 строк · 2.3 Кб
1"""Notebook utils."""
2
3from collections import defaultdict4from typing import List, Optional, Tuple5
6import pandas as pd7
8from llama_index.legacy.evaluation import EvaluationResult9from llama_index.legacy.evaluation.retrieval.base import RetrievalEvalResult10
# Metric names averaged by get_retrieval_results_df when the caller does not
# supply its own ``metric_keys``.
DEFAULT_METRIC_KEYS = ["hit_rate", "mrr"]

13
def get_retrieval_results_df(
    names: List[str],
    results_arr: List[List[RetrievalEvalResult]],
    metric_keys: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Average retrieval metrics per retriever into a single dataframe.

    Args:
        names: Labels for the retrievers, one per entry of ``results_arr``.
        results_arr: For each retriever, the list of its evaluation results.
            Each result's ``metric_vals_dict`` supplies the metric values.
        metric_keys: Metrics to average. Falls back to
            ``DEFAULT_METRIC_KEYS`` when ``None`` or empty.

    Returns:
        A dataframe with a ``retrievers`` column holding ``names`` and one
        column per metric key holding the mean of that metric for each
        retriever.

    Raises:
        ValueError: If ``names`` and ``results_arr`` differ in length, or if a
            requested metric key is absent from a retriever's results.
    """
    # Pairing the two lists with zip() would silently drop the unmatched tail,
    # so reject mismatched inputs up front (same check as get_eval_results_df).
    if len(names) != len(results_arr):
        raise ValueError("names and results_arr must have same length.")

    metric_keys = metric_keys or DEFAULT_METRIC_KEYS

    avg_metrics_dict = defaultdict(list)
    for eval_results in results_arr:
        # One row per evaluated query, one column per metric.
        results_df = pd.DataFrame(
            [eval_result.metric_vals_dict for eval_result in eval_results]
        )

        for metric_key in metric_keys:
            if metric_key not in results_df.columns:
                raise ValueError(f"Metric key {metric_key} not in results_df")
            avg_metrics_dict[metric_key].append(results_df[metric_key].mean())

    return pd.DataFrame({"retrievers": names, **avg_metrics_dict})


37
def get_eval_results_df(
    names: List[str], results_arr: List[EvaluationResult], metric: Optional[str] = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Tabulate evaluation results and compute the mean score per name.

    Args:
        names: Label for each entry of ``results_arr``; stored in the ``rag``
            column and used as the grouping key for the mean.
        results_arr: Evaluation results, one per entry of ``names``.
        metric: Optional metric name. When truthy, the single row of the mean
            dataframe is labelled ``mean_{metric}_score``.

    Returns:
        A ``(deep_df, mean_df)`` tuple. ``deep_df`` has one row per result
        with columns ``rag``, ``query``, ``answer``, ``contexts``, ``scores``
        and ``feedbacks``. ``mean_df`` has a single row holding the mean of
        ``scores`` for each distinct ``rag`` value.

    Raises:
        ValueError: If ``names`` and ``results_arr`` differ in length.
    """
    if len(names) != len(results_arr):
        raise ValueError("names and results_arr must have same length.")

    # Build each column directly from the corresponding result attribute.
    deep_df = pd.DataFrame(
        {
            "rag": names,
            "query": [result.query for result in results_arr],
            "answer": [result.response for result in results_arr],
            "contexts": [result.contexts for result in results_arr],
            "scores": [result.score for result in results_arr],
            "feedbacks": [result.feedback for result in results_arr],
        }
    )

    # Series of mean scores indexed by "rag"; transposed so that each name
    # becomes a column of a one-row frame.
    mean_scores = deep_df.groupby(["rag"])["scores"].mean()
    mean_df = pd.DataFrame(mean_scores).T
    if metric:
        mean_df.index = [f"mean_{metric}_score"]

    return deep_df, mean_df