"""Relevancy evaluation."""
2

3
from __future__ import annotations
4

5
from typing import Any, List, Sequence, Union
6

7
from llama_index.legacy.evaluation.base import BaseEvaluator, EvaluationResult
8
from llama_index.legacy.multi_modal_llms.base import MultiModalLLM
9
from llama_index.legacy.multi_modal_llms.openai import OpenAIMultiModal
10
from llama_index.legacy.prompts import BasePromptTemplate, PromptTemplate
11
from llama_index.legacy.prompts.mixin import PromptDictType
12
from llama_index.legacy.schema import ImageNode
13

14
DEFAULT_EVAL_TEMPLATE = PromptTemplate(
    "Your task is to evaluate if the response for the query \
    is in line with the images and textual context information provided.\n"
    "You have two options to answer. Either YES/ NO.\n"
    "Answer - YES, if the response for the query \
    is in line with context information otherwise NO.\n"
    "Query and Response: \n {query_str}\n"
    "Context: \n {context_str}\n"
    "Answer: "
)

DEFAULT_REFINE_TEMPLATE = PromptTemplate(
    "We want to understand if the following query and response is "
    "in line with the textual and visual context information: \n {query_str}\n"
    "We have provided an existing YES/NO answer: \n {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "If the existing answer was already YES, still answer YES. "
    "If the information is present in the new context, answer YES. "
    "Otherwise answer NO.\n"
)
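
# Note (editor sketch, not part of the original module): both templates are plain
# PromptTemplate objects, so the judge prompt is produced with .format(), e.g.
#     DEFAULT_EVAL_TEMPLATE.format(
#         query_str="Question: What colour is the car?\nResponse: The car is red.",
#         context_str="The photo caption mentions a red sedan.",
#     )
# The evaluator below builds query_str the same way ("Question: ...\nResponse: ...")
# before sending the filled-in prompt to the multi-modal LLM.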


class MultiModalRelevancyEvaluator(BaseEvaluator):
    """Multi-modal relevancy evaluator.

    Evaluates whether the retrieved image and textual contexts, together with the
    response, are relevant to a query. The judgment considers the query string,
    the retrieved contexts, and the response string.

    Args:
        multi_modal_llm (Optional[MultiModalLLM]):
            The multi-modal LLM judge to use for evaluations.
        raise_error (Optional[bool]):
            Whether to raise an error if the response is invalid.
            Defaults to False.
        eval_template (Optional[Union[str, BasePromptTemplate]]):
            The template to use for evaluation.
        refine_template (Optional[Union[str, BasePromptTemplate]]):
            The template to use for refinement.
    """

    def __init__(
        self,
        multi_modal_llm: Union[MultiModalLLM, None] = None,
        raise_error: bool = False,
        eval_template: Union[str, BasePromptTemplate, None] = None,
        refine_template: Union[str, BasePromptTemplate, None] = None,
    ) -> None:
        """Init params."""
        self._multi_modal_llm = multi_modal_llm or OpenAIMultiModal(
            model="gpt-4-vision-preview", max_new_tokens=1000
        )
        self._raise_error = raise_error

        self._eval_template: BasePromptTemplate
        if isinstance(eval_template, str):
            self._eval_template = PromptTemplate(eval_template)
        else:
            self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE

        self._refine_template: BasePromptTemplate
        if isinstance(refine_template, str):
            self._refine_template = PromptTemplate(refine_template)
        else:
            self._refine_template = refine_template or DEFAULT_REFINE_TEMPLATE

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {
            "eval_template": self._eval_template,
            "refine_template": self._refine_template,
        }

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""
        if "eval_template" in prompts:
            self._eval_template = prompts["eval_template"]
        if "refine_template" in prompts:
            self._refine_template = prompts["refine_template"]

    def evaluate(
        self,
        query: Union[str, None] = None,
        response: Union[str, None] = None,
        contexts: Union[Sequence[str], None] = None,
        image_paths: Union[List[str], None] = None,
        image_urls: Union[List[str], None] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Evaluate whether the multi-modal contexts and response are relevant to the query."""
        del kwargs  # Unused

        if query is None or contexts is None or response is None:
            raise ValueError("query, contexts, and response must be provided")

        context_str = "\n\n".join(contexts)
        evaluation_query_str = f"Question: {query}\nResponse: {response}"
        fmt_prompt = self._eval_template.format(
            context_str=context_str, query_str=evaluation_query_str
        )

        # Wrap the provided images as ImageNodes; image_urls takes precedence over
        # image_paths when both are given. Default to an empty list so the call
        # below does not fail when no images are supplied.
        image_nodes: List[ImageNode] = []
        if image_paths:
            image_nodes = [
                ImageNode(image_path=image_path) for image_path in image_paths
            ]
        if image_urls:
            image_nodes = [ImageNode(image_url=image_url) for image_url in image_urls]

        response_obj = self._multi_modal_llm.complete(
            prompt=fmt_prompt,
            image_documents=image_nodes,
        )

        raw_response_txt = str(response_obj)

        # The judge is instructed to answer YES/NO; any "yes" in the reply counts as passing.
        if "yes" in raw_response_txt.lower():
            passing = True
        else:
            if self._raise_error:
                raise ValueError("The response is invalid")
            passing = False

        return EvaluationResult(
            query=query,
            response=response,
            passing=passing,
            score=1.0 if passing else 0.0,
            feedback=raw_response_txt,
        )

    async def aevaluate(
        self,
        query: Union[str, None] = None,
        response: Union[str, None] = None,
        contexts: Union[Sequence[str], None] = None,
        image_paths: Union[List[str], None] = None,
        image_urls: Union[List[str], None] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Async evaluate whether the multi-modal contexts and response are relevant to the query."""
        del kwargs  # Unused

        if query is None or contexts is None or response is None:
            raise ValueError("query, contexts, and response must be provided")

        context_str = "\n\n".join(contexts)
        evaluation_query_str = f"Question: {query}\nResponse: {response}"
        fmt_prompt = self._eval_template.format(
            context_str=context_str, query_str=evaluation_query_str
        )

        # Same image handling as in evaluate(): default to no images, with
        # image_urls taking precedence over image_paths.
        image_nodes: List[ImageNode] = []
        if image_paths:
            image_nodes = [
                ImageNode(image_path=image_path) for image_path in image_paths
            ]
        if image_urls:
            image_nodes = [ImageNode(image_url=image_url) for image_url in image_urls]

        response_obj = await self._multi_modal_llm.acomplete(
            prompt=fmt_prompt,
            image_documents=image_nodes,
        )

        raw_response_txt = str(response_obj)

        # The judge is instructed to answer YES/NO; any "yes" in the reply counts as passing.
        if "yes" in raw_response_txt.lower():
            passing = True
        else:
            if self._raise_error:
                raise ValueError("The response is invalid")
            passing = False

        return EvaluationResult(
            query=query,
            response=response,
            passing=passing,
            score=1.0 if passing else 0.0,
            feedback=raw_response_txt,
        )
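
# --- Usage sketch (editor addition, not part of the original module) ---
# Assumes the llama-index legacy package is installed and OPENAI_API_KEY is set;
# the query, response, context, and image URL below are made-up placeholders.
if __name__ == "__main__":
    evaluator = MultiModalRelevancyEvaluator()  # defaults to a GPT-4V judge via OpenAIMultiModal

    result = evaluator.evaluate(
        query="What colour is the car in the photo?",
        response="The car is red.",
        contexts=["The listing describes a red sedan parked in a driveway."],
        image_urls=["https://example.com/car.jpg"],  # placeholder URL
    )
    print(result.passing, result.score)
    print(result.feedback)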
