llama-index
1"""Relevancy evaluation."""
2
3from __future__ import annotations4
5import asyncio6import re7from typing import Any, Callable, Optional, Sequence, Tuple8
9from llama_index.legacy import ServiceContext10from llama_index.legacy.evaluation.base import BaseEvaluator, EvaluationResult11from llama_index.legacy.prompts import BasePromptTemplate, PromptTemplate12from llama_index.legacy.prompts.mixin import PromptDictType13
14DEFAULT_EVAL_TEMPLATE = PromptTemplate(15"Your task is to evaluate if the response is relevant to the query.\n"16"The evaluation should be performed in a step-by-step manner by answering the following questions:\n"17"1. Does the provided response match the subject matter of the user's query?\n"18"2. Does the provided response attempt to address the focus or perspective "19"on the subject matter taken on by the user's query?\n"20"Each question above is worth 1 point. Provide detailed feedback on response according to the criteria questions above "21"After your feedback provide a final result by strictly following this format: '[RESULT] followed by the integer number representing the total score assigned to the response'\n\n"22"Query: \n {query}\n"23"Response: \n {response}\n"24"Feedback:"25)
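# Illustrative only: with the template above, a well-formed completion is
# free-form feedback followed by the score marker, e.g.
#
#   "The response matches the query's subject and addresses its focus. [RESULT] 2"
#
# The trailing integer (0-2) is the total score extracted by the parser below.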

# Maximum raw score under the two-question rubric; used to normalize scores.
_DEFAULT_SCORE_THRESHOLD = 2.0

def _default_parser_function(output_str: str) -> Tuple[Optional[float], Optional[str]]:
    # Pattern to match the feedback and the score:
    # any text ending with '[RESULT]' followed by a single digit
    pattern = r"([\s\S]+)(?:\[RESULT\]\s*)(\d)"

    # Search for the pattern in the output string
    result = re.search(pattern, output_str)

    # Check if a match is found
    if result:
        # Extract the feedback (group 1) and the score (group 2)
        feedback, score = result.groups()
        score = float(score) if score is not None else score
        return score, feedback.strip()
    else:
        return None, None
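# Illustrative behavior of the default parser on hypothetical model outputs:
#
#   _default_parser_function("Relevant and on-topic. [RESULT] 2")
#   -> (2.0, "Relevant and on-topic.")
#
#   _default_parser_function("No score marker in this output.")
#   -> (None, None)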

class AnswerRelevancyEvaluator(BaseEvaluator):
    """Answer relevancy evaluator.

    Evaluates the relevancy of the response to a query.
    This evaluator considers the query string and response string.

    Args:
        service_context (Optional[ServiceContext]):
            The service context to use for evaluation.
        raise_error (Optional[bool]):
            Whether to raise an error if the response is invalid.
            Defaults to False.
        eval_template (Optional[Union[str, BasePromptTemplate]]):
            The template to use for evaluation.
        score_threshold (float):
            The maximum raw score; the parsed score is divided by this
            value to normalize it. Defaults to 2.0.
        parser_function (Callable[[str], Tuple[Optional[float], Optional[str]]]):
            Function that parses the LLM output into a (score, feedback) pair.
    """

    def __init__(
        self,
        service_context: ServiceContext | None = None,
        raise_error: bool = False,
        eval_template: str | BasePromptTemplate | None = None,
        score_threshold: float = _DEFAULT_SCORE_THRESHOLD,
        parser_function: Callable[
            [str], Tuple[Optional[float], Optional[str]]
        ] = _default_parser_function,
    ) -> None:
        """Init params."""
        self._service_context = service_context or ServiceContext.from_defaults()
        self._raise_error = raise_error

        self._eval_template: BasePromptTemplate
        if isinstance(eval_template, str):
            self._eval_template = PromptTemplate(eval_template)
        else:
            self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE

        self.parser_function = parser_function
        self.score_threshold = score_threshold
    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        # This evaluator has no refine step, so only the eval template is exposed.
        return {
            "eval_template": self._eval_template,
        }

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""
        if "eval_template" in prompts:
            self._eval_template = prompts["eval_template"]
    async def aevaluate(
        self,
        query: str | None = None,
        response: str | None = None,
        contexts: Sequence[str] | None = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Evaluate whether the response is relevant to the query."""
        del kwargs  # Unused
        del contexts  # Unused

        if query is None or response is None:
            raise ValueError("query and response must be provided")

        await asyncio.sleep(sleep_time_in_seconds)

        eval_response = await self._service_context.llm.apredict(
            prompt=self._eval_template,
            query=query,
            response=response,
        )

        score, reasoning = self.parser_function(eval_response)

        invalid_result, invalid_reason = False, None
        if score is None and reasoning is None:
            if self._raise_error:
                raise ValueError("The response is invalid")
            invalid_result = True
            invalid_reason = "Unable to parse the output string."

        # Normalize the raw score (0 to score_threshold) into the [0, 1] range
        if score:
            score /= self.score_threshold

        return EvaluationResult(
            query=query,
            response=response,
            score=score,
            feedback=eval_response,
            invalid_result=invalid_result,
            invalid_reason=invalid_reason,
        )
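
if __name__ == "__main__":
    # Minimal usage sketch, assuming an LLM is available to the default
    # ServiceContext (e.g. an OpenAI API key in the environment); the query
    # and response below are made-up illustrations.
    evaluator = AnswerRelevancyEvaluator()
    result = asyncio.run(
        evaluator.aevaluate(
            query="What is the capital of France?",
            response="The capital of France is Paris.",
        )
    )
    # A raw '[RESULT] 2' is normalized to 1.0 by the default score_threshold.
    print(result.score, result.feedback)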