"""Pairwise evaluation."""

import asyncio
from enum import Enum
from typing import Any, Callable, Optional, Sequence, Tuple, Union

from llama_index.legacy import ServiceContext
from llama_index.legacy.evaluation.base import (
    BaseEvaluator,
    EvaluationResult,
)
from llama_index.legacy.prompts import (
    BasePromptTemplate,
    ChatMessage,
    ChatPromptTemplate,
    MessageRole,
    PromptTemplate,
)
from llama_index.legacy.prompts.mixin import PromptDictType

DEFAULT_SYSTEM_TEMPLATE = (
    "Please act as an impartial judge and evaluate the quality of the responses provided by two "
    "AI question-answering assistants to the user question, perhaps with an added reference, which "
    "are displayed below. You should choose the assistant that "
    "follows the user’s instructions and answers the user’s question better using the provided "
    "context. Your evaluation "
    "should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, "
    "and level of detail of their responses. Begin your evaluation by comparing the two "
    "responses and provide a short explanation. Avoid any position biases and ensure that the "
    "order in which the responses were presented does not influence your decision. Do not allow "
    "the length of the responses to influence your evaluation. Do not favor certain names of "
    "the assistants. Be as objective as possible. After providing your explanation, output your "
    "final verdict by strictly following this format: '[[A]]' if assistant A is better, '[[B]]' "
    "if assistant B is better, and '[[C]]' for a tie.\n"
)

DEFAULT_USER_TEMPLATE = (
    "[User Question]\n"
    "{query}"
    "\n\n"
    "[The Start of Reference]\n"
    "{reference}\n"
    "[The End of Reference]"
    "\n\n"
    "[The Start of Assistant A’s Answer]\n"
    "{answer_1}\n"
    "[The End of Assistant A’s Answer]"
    "\n\n"
    "[The Start of Assistant B’s Answer]\n"
    "{answer_2}\n"
    "[The End of Assistant B’s Answer]"
)

DEFAULT_EVAL_TEMPLATE = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=DEFAULT_SYSTEM_TEMPLATE),
        ChatMessage(role=MessageRole.USER, content=DEFAULT_USER_TEMPLATE),
    ]
)

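# Note: any custom eval template passed to PairwiseComparisonEvaluator is filled
# with the keyword arguments used in `_get_eval_result` below, so it should expose
# the same variables: {query}, {reference}, {answer_1} and {answer_2}.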

def _default_parser_function(
    eval_response: str,
) -> Tuple[Optional[bool], Optional[float], Optional[str]]:
    # Extract from response
    feedback: Optional[str] = ""
    if "[[A]]" in eval_response:
        passing: Optional[bool] = True
        score = 1.0
    elif "[[B]]" in eval_response:
        passing = False
        score = 0.0
    elif "[[C]]" in eval_response:
        passing = None
        score = 0.5
    else:
        passing = None
        score = None
        feedback = None
    return passing, score, feedback
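

# A minimal sketch of a custom `parser_function` (illustrative, not part of the
# original module): it keeps the same `(passing, score, feedback)` contract as
# `_default_parser_function` above, but also returns the judge's explanation
# (the text before the final verdict marker) as feedback. The name is hypothetical;
# it could be plugged in via
# PairwiseComparisonEvaluator(parser_function=_example_parser_with_feedback).
def _example_parser_with_feedback(
    eval_response: str,
) -> Tuple[Optional[bool], Optional[float], Optional[str]]:
    passing, score, _ = _default_parser_function(eval_response)
    if score is None:
        # Unparseable judge output: propagate the invalid triple unchanged.
        return passing, score, None
    for marker in ("[[A]]", "[[B]]", "[[C]]"):
        if marker in eval_response:
            # Everything before the verdict marker is the judge's explanation.
            return passing, score, eval_response.split(marker, 1)[0].strip()
    return passing, score, None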


class EvaluationSource(str, Enum):
    """To distinguish between flipped and original."""

    ORIGINAL = "original"
    FLIPPED = "flipped"
    NEITHER = "neither"


class PairwiseComparisonEvaluator(BaseEvaluator):
    """Pairwise comparison evaluator.

    Evaluates the quality of a response vs. a second response to the same query by
    having an LLM judge which response is better, optionally taking a reference
    answer into account.

    Outputs whether the `response` given is better than the `second_response`.

    Args:
        service_context (Optional[ServiceContext]):
            The service context to use for evaluation.
        eval_template (Optional[Union[str, BasePromptTemplate]]):
            The template to use for evaluation.
        parser_function (Callable):
            A function that parses the judge's raw output into a
            `(passing, score, feedback)` tuple. Defaults to `_default_parser_function`.
        enforce_consensus (bool): Whether to enforce consensus (consistency if we
            flip the order of the answers). Defaults to True.
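
    Example:
        An illustrative sketch (it assumes an LLM is available to the default
        `ServiceContext` and that the call is awaited inside an event loop)::

            evaluator = PairwiseComparisonEvaluator()
            result = await evaluator.aevaluate(
                query="What colour is the sky?",
                response="The sky is blue on a clear day.",
                second_response="The sky is green.",
            )
            print(result.score, result.pairwise_source)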

    """

    def __init__(
        self,
        service_context: Optional[ServiceContext] = None,
        eval_template: Optional[Union[BasePromptTemplate, str]] = None,
        parser_function: Callable[
            [str], Tuple[Optional[bool], Optional[float], Optional[str]]
        ] = _default_parser_function,
        enforce_consensus: bool = True,
    ) -> None:
        self._service_context = service_context or ServiceContext.from_defaults()

        self._eval_template: BasePromptTemplate
        if isinstance(eval_template, str):
            self._eval_template = PromptTemplate(eval_template)
        else:
            self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE

        self._enforce_consensus = enforce_consensus
        self._parser_function = parser_function

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {
            "eval_template": self._eval_template,
        }

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""
        if "eval_template" in prompts:
            self._eval_template = prompts["eval_template"]

    async def _get_eval_result(
        self,
        query: str,
        response: str,
        second_response: str,
        reference: Optional[str],
    ) -> EvaluationResult:
        """Get evaluation result."""
        eval_response = await self._service_context.llm.apredict(
            prompt=self._eval_template,
            query=query,
            answer_1=response,
            answer_2=second_response,
            reference=reference or "",
        )

        # Extract from response
        passing, score, feedback = self._parser_function(eval_response)

        if passing is None and score is None and feedback is None:
            return EvaluationResult(
                query=query,
                invalid_result=True,
                invalid_reason="Output cannot be parsed",
                feedback=eval_response,
            )
        else:
            return EvaluationResult(
                query=query,
                response=eval_response,
                passing=passing,
                score=score,
                feedback=eval_response,
                pairwise_source=EvaluationSource.ORIGINAL,
            )

    async def _resolve_results(
        self,
        eval_result: EvaluationResult,
        flipped_eval_result: EvaluationResult,
    ) -> EvaluationResult:
        """Resolve eval results from evaluation + flipped evaluation.

        Args:
            eval_result (EvaluationResult): Result when answer_1 is shown first
            flipped_eval_result (EvaluationResult): Result when answer_2 is shown first

        Returns:
            EvaluationResult: The final evaluation result
        """
        # add pairwise_source to eval_result and flipped_eval_result
        eval_result.pairwise_source = EvaluationSource.ORIGINAL
        flipped_eval_result.pairwise_source = EvaluationSource.FLIPPED

        # count the votes for each of the 2 answers
        votes_1 = 0.0
        votes_2 = 0.0
        if eval_result.score is not None and flipped_eval_result.score is not None:
            votes_1 = eval_result.score + (1 - flipped_eval_result.score)
            votes_2 = (1 - eval_result.score) + flipped_eval_result.score

        if votes_1 + votes_2 != 2:  # each round, the judge can give a total of 1 vote
            raise ValueError("Impossible score results. Total number of votes must be 2.")

        # get the judges (original and flipped) who voted for answer_1
        voters_1 = [eval_result] * (eval_result.score == 1.0) + [
            flipped_eval_result
        ] * (flipped_eval_result.score == 0.0)

        # get the judges (original and flipped) who voted for answer_2
        voters_2 = [eval_result] * (eval_result.score == 0.0) + [
            flipped_eval_result
        ] * (flipped_eval_result.score == 1.0)

        if votes_1 > votes_2:
            return voters_1[0]  # return any voter for answer_1
        elif votes_2 > votes_1:
            return voters_2[0]  # return any voter for answer_2
        else:
            if (
                eval_result.score == 0.5
            ):  # votes_1 == votes_2 == 1.0 and both judges scored 0.5: an actual tie
                # doesn't matter which one we return here
                return eval_result
            else:  # Inconclusive: the original and flipped judgments contradict each other
                return EvaluationResult(
                    query=eval_result.query,
                    response="",
                    passing=None,
                    score=0.5,
                    feedback="",
                    pairwise_source=EvaluationSource.NEITHER,
                )

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        second_response: Optional[str] = None,
        reference: Optional[str] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused
        del contexts  # Unused

        if query is None or response is None or second_response is None:
            raise ValueError(
                "query, response, and second_response must be provided"
            )

        await asyncio.sleep(sleep_time_in_seconds)

        eval_result = await self._get_eval_result(
            query, response, second_response, reference
        )
        if self._enforce_consensus and not eval_result.invalid_result:
            # Flip the order of the answers and check that the judgment is consistent
            # (which means that the score should flip from 0 to 1 and vice-versa);
            # if not, a tie is returned (score 0.5, pairwise_source NEITHER)
            flipped_eval_result = await self._get_eval_result(
                query, second_response, response, reference
            )
            if not flipped_eval_result.invalid_result:
                resolved_eval_result = await self._resolve_results(
                    eval_result, flipped_eval_result
                )
            else:
                resolved_eval_result = EvaluationResult(
                    query=eval_result.query,
                    response=eval_result.response,
                    feedback=flipped_eval_result.response,
                    invalid_result=True,
                    invalid_reason="Output cannot be parsed.",
                )
        else:
            resolved_eval_result = eval_result

        return resolved_eval_result