llama-index
1"""Relevancy evaluation."""
2
3from __future__ import annotations4
5import asyncio6import re7from typing import Any, Callable, Optional, Sequence, Tuple8
9from llama_index.legacy import ServiceContext10from llama_index.legacy.evaluation.base import BaseEvaluator, EvaluationResult11from llama_index.legacy.prompts import BasePromptTemplate, PromptTemplate12from llama_index.legacy.prompts.mixin import PromptDictType13
14DEFAULT_EVAL_TEMPLATE = PromptTemplate(15"Your task is to evaluate if the response is relevant to the query.\n"16"The evaluation should be performed in a step-by-step manner by answering the following questions:\n"17"1. Does the provided response match the subject matter of the user's query?\n"18"2. Does the provided response attempt to address the focus or perspective "19"on the subject matter taken on by the user's query?\n"20"Each question above is worth 1 point. Provide detailed feedback on response according to the criteria questions above "21"After your feedback provide a final result by strictly following this format: '[RESULT] followed by the integer number representing the total score assigned to the response'\n\n"22"Query: \n {query}\n"23"Response: \n {response}\n"24"Feedback:"25)
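# Illustrative only: with the template above, a well-formed completion is
# free-form feedback followed by the score marker, e.g.
#
#   "The response matches the query's subject and addresses its focus. [RESULT] 2"
#
# The trailing integer (0-2) is the total score extracted by the parser below.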

# Maximum raw score under the two-question rubric; used to normalize scores.
_DEFAULT_SCORE_THRESHOLD = 2.0

def _default_parser_function(output_str: str) -> Tuple[Optional[float], Optional[str]]:
    # Pattern to match the feedback and the score:
    # any text ending with '[RESULT]' followed by a single digit
    pattern = r"([\s\S]+)(?:\[RESULT\]\s*)(\d)"

    # Search for the pattern in the output string
    result = re.search(pattern, output_str)

    # Check if a match is found
    if result:
        # Extract the feedback (group 1) and the score (group 2)
        feedback, score = result.groups()
        score = float(score) if score is not None else score
        return score, feedback.strip()
    else:
        return None, None
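# Illustrative behavior of the default parser on hypothetical model outputs:
#
#   _default_parser_function("Relevant and on-topic. [RESULT] 2")
#   -> (2.0, "Relevant and on-topic.")
#
#   _default_parser_function("No score marker in this output.")
#   -> (None, None)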

class AnswerRelevancyEvaluator(BaseEvaluator):
    """Answer relevancy evaluator.

    Evaluates the relevancy of the response to a query.
    This evaluator considers the query string and response string.

    Args:
        service_context (Optional[ServiceContext]):
            The service context to use for evaluation.
        raise_error (Optional[bool]):
            Whether to raise an error if the response is invalid.
            Defaults to False.
        eval_template (Optional[Union[str, BasePromptTemplate]]):
            The template to use for evaluation.
        score_threshold (float):
            The maximum raw score; the parsed score is divided by this
            value to normalize it. Defaults to 2.0.
        parser_function (Callable[[str], Tuple[Optional[float], Optional[str]]]):
            Function that parses the LLM output into a (score, feedback) pair.
    """

    def __init__(
        self,
        service_context: ServiceContext | None = None,
        raise_error: bool = False,
        eval_template: str | BasePromptTemplate | None = None,
        score_threshold: float = _DEFAULT_SCORE_THRESHOLD,
        parser_function: Callable[
            [str], Tuple[Optional[float], Optional[str]]
        ] = _default_parser_function,
    ) -> None:
        """Init params."""
        self._service_context = service_context or ServiceContext.from_defaults()
        self._raise_error = raise_error

        self._eval_template: BasePromptTemplate
        if isinstance(eval_template, str):
            self._eval_template = PromptTemplate(eval_template)
        else:
            self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE

        self.parser_function = parser_function
        self.score_threshold = score_threshold
    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        # This evaluator has no refine step, so only the eval template is exposed.
        return {
            "eval_template": self._eval_template,
        }

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""
        if "eval_template" in prompts:
            self._eval_template = prompts["eval_template"]
    async def aevaluate(
        self,
        query: str | None = None,
        response: str | None = None,
        contexts: Sequence[str] | None = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Evaluate whether the response is relevant to the query."""
        del kwargs  # Unused
        del contexts  # Unused

        if query is None or response is None:
            raise ValueError("query and response must be provided")

        await asyncio.sleep(sleep_time_in_seconds)

        eval_response = await self._service_context.llm.apredict(
            prompt=self._eval_template,
            query=query,
            response=response,
        )

        score, reasoning = self.parser_function(eval_response)

        invalid_result, invalid_reason = False, None
        if score is None and reasoning is None:
            if self._raise_error:
                raise ValueError("The response is invalid")
            invalid_result = True
            invalid_reason = "Unable to parse the output string."

        # Normalize the raw score (0 to score_threshold) into the [0, 1] range
        if score:
            score /= self.score_threshold

        return EvaluationResult(
            query=query,
            response=response,
            score=score,
            feedback=eval_response,
            invalid_result=invalid_result,
            invalid_reason=invalid_reason,
        )
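
if __name__ == "__main__":
    # Minimal usage sketch, assuming an LLM is available to the default
    # ServiceContext (e.g. an OpenAI API key in the environment); the query
    # and response below are made-up illustrations.
    evaluator = AnswerRelevancyEvaluator()
    result = asyncio.run(
        evaluator.aevaluate(
            query="What is the capital of France?",
            response="The capital of France is Paris.",
        )
    )
    # A raw '[RESULT] 2' is normalized to 1.0 by the default score_threshold.
    print(result.score, result.feedback)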