llama-index

"""Correctness evaluation."""

import asyncio
from typing import Any, Callable, Optional, Sequence, Tuple, Union

from llama_index.legacy.evaluation.base import BaseEvaluator, EvaluationResult
from llama_index.legacy.evaluation.eval_utils import default_parser
from llama_index.legacy.prompts import (
    BasePromptTemplate,
    ChatMessage,
    ChatPromptTemplate,
    MessageRole,
    PromptTemplate,
)
from llama_index.legacy.prompts.mixin import PromptDictType
from llama_index.legacy.service_context import ServiceContext

DEFAULT_SYSTEM_TEMPLATE = """
You are an expert evaluation system for a question answering chatbot.

You are given the following information:
- a user query, and
- a generated answer

You may also be given a reference answer to use for reference in your evaluation.

Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.
You must return your response in a line with only the score.
Do not return answers in any other format.
On a separate line provide your reasoning for the score as well.

Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query, \
you should give a score of 1.
- If the generated answer is relevant but contains mistakes, \
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct, \
you should give a score between 4 and 5.

Example Response:
4.0
The generated answer has the exact same metrics as the reference answer, \
    but it is not as concise.

"""

DEFAULT_USER_TEMPLATE = """
## User Query
{query}

## Reference Answer
{reference_answer}

## Generated Answer
{generated_answer}
"""

DEFAULT_EVAL_TEMPLATE = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=DEFAULT_SYSTEM_TEMPLATE),
        ChatMessage(role=MessageRole.USER, content=DEFAULT_USER_TEMPLATE),
    ]
)
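
# The system template above instructs the LLM to put the numeric score on a
# line of its own, followed by the reasoning on a separate line. A
# ``parser_function`` therefore only needs to split the first line from the
# rest. The helper below is a hypothetical illustration of that
# (score, reasoning) contract; it is NOT the actual ``default_parser``
# imported from ``eval_utils`` above.
def _example_score_parser(eval_response: str) -> Tuple[Optional[float], Optional[str]]:
    """Illustrative parser: first line is the score, the rest is the reasoning."""
    score_str, _, reasoning = eval_response.strip().partition("\n")
    try:
        score = float(score_str)
    except ValueError:
        # Malformed response with no leading numeric score.
        return None, eval_response.strip()
    return score, reasoning.strip() or None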


class CorrectnessEvaluator(BaseEvaluator):
    """Correctness evaluator.

    Evaluates the correctness of a question answering system.
    This evaluator depends on a `reference` answer being provided, in addition
    to the query string and response string.

    It outputs a score between 1 and 5, where 1 is the worst and 5 is the best,
    along with a reasoning for the score.
    Passing is defined as a score greater than or equal to the given threshold.

    Args:
        service_context (Optional[ServiceContext]): Service context.
        eval_template (Optional[Union[BasePromptTemplate, str]]):
            Template for the evaluation prompt.
        score_threshold (float): Numerical threshold for passing the evaluation,
            defaults to 4.0.
        parser_function (Callable[[str], Tuple[Optional[float], Optional[str]]]):
            Function that parses the raw LLM output into a (score, reasoning)
            tuple, defaults to `default_parser`.
    """

    def __init__(
        self,
        service_context: Optional[ServiceContext] = None,
        eval_template: Optional[Union[BasePromptTemplate, str]] = None,
        score_threshold: float = 4.0,
        parser_function: Callable[
            [str], Tuple[Optional[float], Optional[str]]
        ] = default_parser,
    ) -> None:
        self._service_context = service_context or ServiceContext.from_defaults()

        self._eval_template: BasePromptTemplate
        if isinstance(eval_template, str):
            self._eval_template = PromptTemplate(eval_template)
        else:
            self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE

        self._score_threshold = score_threshold
        self.parser_function = parser_function

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {
            "eval_template": self._eval_template,
        }

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""
        if "eval_template" in prompts:
            self._eval_template = prompts["eval_template"]

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        reference: Optional[str] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused
        del contexts  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None:
            raise ValueError("query and response must be provided")

        eval_response = await self._service_context.llm.apredict(
            prompt=self._eval_template,
            query=query,
            generated_answer=response,
            reference_answer=reference or "(NO REFERENCE ANSWER SUPPLIED)",
        )

        # Parse the raw LLM output into a numeric score and free-text reasoning.
        score, reasoning = self.parser_function(eval_response)

        return EvaluationResult(
            query=query,
            response=response,
            passing=score >= self._score_threshold if score is not None else None,
            score=score,
            feedback=reasoning,
        )
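

# --- Example usage (a minimal sketch, not part of the original module) -------
# Assumptions: an LLM is reachable through ``ServiceContext.from_defaults()``
# (e.g. an OpenAI API key in the environment), and ``BaseEvaluator`` exposes
# the synchronous ``evaluate()`` wrapper around ``aevaluate()``. The query,
# response, and reference strings are made-up illustration data.
if __name__ == "__main__":
    evaluator = CorrectnessEvaluator(score_threshold=4.0)
    result = evaluator.evaluate(
        query="What is the boiling point of water at sea level?",
        response="Water boils at 100 degrees Celsius at sea level.",
        reference="At sea level, water boils at 100 degrees Celsius (212 F).",
    )
    # ``passing`` is True when the parsed score is >= score_threshold (4.0 here).
    print(result.score, result.passing)
    print(result.feedback)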