llama-index

"""Faithfulness evaluation."""

from __future__ import annotations

from typing import Any, List, Optional, Sequence, Union

from llama_index.legacy.evaluation.base import BaseEvaluator, EvaluationResult
from llama_index.legacy.multi_modal_llms.base import MultiModalLLM
from llama_index.legacy.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.legacy.prompts import BasePromptTemplate, PromptTemplate
from llama_index.legacy.prompts.mixin import PromptDictType
from llama_index.legacy.schema import ImageNode

DEFAULT_EVAL_TEMPLATE = PromptTemplate(
    "Please tell if a given piece of information "
    "is supported by the visual as well as textual context information.\n"
    "You need to answer with either YES or NO.\n"
    "Answer YES if any of the image(s) and textual context supports the information, even "
    "if most of the context is unrelated. "
    "Some examples are provided below with only text context, but please do use\n"
    "any images for context if they are provided.\n\n"
    "Information: Apple pie is generally double-crusted.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: YES\n"
    "Information: Apple pies taste bad.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: NO\n"
    "Information: {query_str}\n"
    "Context: {context_str}\n"
    "Answer: "
)

DEFAULT_REFINE_TEMPLATE = PromptTemplate(
    "We want to understand if the following information is present "
    "in the context information: {query_str}\n"
    "We have provided an existing YES/NO answer: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "If the existing answer was already YES, still answer YES. "
    "If the information is present in the new context, answer YES. "
    "Otherwise answer NO.\n"
)


class MultiModalFaithfulnessEvaluator(BaseEvaluator):
    """Multi-Modal Faithfulness evaluator.

    Evaluates whether a response is faithful to the contexts
    (i.e. whether the response is supported by the contexts or hallucinated).

    This evaluator considers the response string, the list of context strings,
    and any image contexts passed via image paths or URLs.

    Args:
        multi_modal_llm (Optional[MultiModalLLM]):
            The Multi-Modal LLM Judge to use for evaluations.
        raise_error (bool): Whether to raise an error when the response is invalid.
            Defaults to False.
        eval_template (Optional[Union[str, BasePromptTemplate]]):
            The template to use for evaluation.
        refine_template (Optional[Union[str, BasePromptTemplate]]):
            The template to use for refining the evaluation.
    """

    def __init__(
        self,
        multi_modal_llm: Optional[MultiModalLLM] = None,
        raise_error: bool = False,
        eval_template: Union[str, BasePromptTemplate, None] = None,
        refine_template: Union[str, BasePromptTemplate, None] = None,
    ) -> None:
        """Init params."""
        self._multi_modal_llm = multi_modal_llm or OpenAIMultiModal(
            model="gpt-4-vision-preview", max_new_tokens=1000
        )
        self._raise_error = raise_error

        self._eval_template: BasePromptTemplate
        if isinstance(eval_template, str):
            self._eval_template = PromptTemplate(eval_template)
        else:
            self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE

        self._refine_template: BasePromptTemplate
        if isinstance(refine_template, str):
            self._refine_template = PromptTemplate(refine_template)
        else:
            self._refine_template = refine_template or DEFAULT_REFINE_TEMPLATE

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {
            "eval_template": self._eval_template,
            "refine_template": self._refine_template,
        }

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""
        if "eval_template" in prompts:
            self._eval_template = prompts["eval_template"]
        if "refine_template" in prompts:
            self._refine_template = prompts["refine_template"]

    def evaluate(
        self,
        query: Union[str, None] = None,
        response: Union[str, None] = None,
        contexts: Union[Sequence[str], None] = None,
        image_paths: Union[List[str], None] = None,
        image_urls: Union[List[str], None] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Evaluate whether the response is faithful to the multi-modal contexts."""
        del query  # Unused
        del kwargs  # Unused
        if contexts is None or response is None:
            raise ValueError("contexts and response must be provided")

        context_str = "\n\n".join(contexts)
        fmt_prompt = self._eval_template.format(
            context_str=context_str, query_str=response
        )

        # Build image nodes from paths or URLs; default to an empty list so the
        # LLM call below does not fail when no images are provided.
        image_nodes: List[ImageNode] = []
        if image_paths:
            image_nodes = [
                ImageNode(image_path=image_path) for image_path in image_paths
            ]
        if image_urls:
            image_nodes = [ImageNode(image_url=image_url) for image_url in image_urls]

        response_obj = self._multi_modal_llm.complete(
            prompt=fmt_prompt,
            image_documents=image_nodes,
        )

        raw_response_txt = str(response_obj)

        if "yes" in raw_response_txt.lower():
            passing = True
        else:
            passing = False
            if self._raise_error:
                raise ValueError("The response is invalid")

        return EvaluationResult(
            response=response,
            contexts=contexts,
            passing=passing,
            score=1.0 if passing else 0.0,
            feedback=raw_response_txt,
        )

    async def aevaluate(
        self,
        query: Union[str, None] = None,
        response: Union[str, None] = None,
        contexts: Union[Sequence[str], None] = None,
        image_paths: Union[List[str], None] = None,
        image_urls: Union[List[str], None] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Async evaluate whether the response is faithful to the multi-modal contexts."""
        del query  # Unused
        del kwargs  # Unused
        if contexts is None or response is None:
            raise ValueError("contexts and response must be provided")

        context_str = "\n\n".join(contexts)
        fmt_prompt = self._eval_template.format(
            context_str=context_str, query_str=response
        )

        # Build image nodes from paths or URLs; default to an empty list so the
        # LLM call below does not fail when no images are provided.
        image_nodes: List[ImageNode] = []
        if image_paths:
            image_nodes = [
                ImageNode(image_path=image_path) for image_path in image_paths
            ]
        if image_urls:
            image_nodes = [ImageNode(image_url=image_url) for image_url in image_urls]

        response_obj = await self._multi_modal_llm.acomplete(
            prompt=fmt_prompt,
            image_documents=image_nodes,
        )

        raw_response_txt = str(response_obj)

        if "yes" in raw_response_txt.lower():
            passing = True
        else:
            passing = False
            if self._raise_error:
                raise ValueError("The response is invalid")

        return EvaluationResult(
            response=response,
            contexts=contexts,
            passing=passing,
            score=1.0 if passing else 0.0,
            feedback=raw_response_txt,
        )
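

A minimal usage sketch, not part of the module above: it assumes the default OpenAIMultiModal judge is usable (an OpenAI API key is configured), and the response string, contexts, and image URL below are placeholders chosen for illustration.

# Hypothetical example: the response, context, and URL are illustrative only.
evaluator = MultiModalFaithfulnessEvaluator()
result = evaluator.evaluate(
    response="Apple pie is generally double-crusted.",
    contexts=[
        "An apple pie is a fruit pie in which the principal filling "
        "ingredient is apples."
    ],
    image_urls=["https://example.com/apple_pie.jpg"],  # placeholder URL
)
print(result.passing)   # True if the judge answered YES
print(result.score)     # 1.0 or 0.0
print(result.feedback)  # raw judge output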