"""Faithfulness evaluation."""

from __future__ import annotations

import asyncio
from typing import Any, Sequence

from llama_index.legacy import ServiceContext
from llama_index.legacy.evaluation.base import BaseEvaluator, EvaluationResult
from llama_index.legacy.indices import SummaryIndex
from llama_index.legacy.prompts import BasePromptTemplate, PromptTemplate
from llama_index.legacy.prompts.mixin import PromptDictType
from llama_index.legacy.schema import Document

DEFAULT_EVAL_TEMPLATE = PromptTemplate(
    "Please tell if a given piece of information "
    "is supported by the context.\n"
    "You need to answer with either YES or NO.\n"
    "Answer YES if any of the context supports the information, even "
    "if most of the context is unrelated. "
    "Some examples are provided below. \n\n"
    "Information: Apple pie is generally double-crusted.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: YES\n"
    "Information: Apple pies tastes bad.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: NO\n"
    "Information: {query_str}\n"
    "Context: {context_str}\n"
    "Answer: "
)

DEFAULT_REFINE_TEMPLATE = PromptTemplate(
    "We want to understand if the following information is present "
    "in the context information: {query_str}\n"
    "We have provided an existing YES/NO answer: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "If the existing answer was already YES, still answer YES. "
    "If the information is present in the new context, answer YES. "
    "Otherwise answer NO.\n"
)
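# How these templates get filled when FaithfulnessEvaluator (below) runs: the
# evaluator builds a SummaryIndex over the context strings and queries it with
# the response text, so ``{query_str}`` receives the response being checked and
# ``{context_str}`` / ``{context_msg}`` receive the retrieved context chunks.
# Illustrative only (the values below are made up):
#
#     DEFAULT_EVAL_TEMPLATE.format(
#         query_str="Apple pie is generally double-crusted.",
#         context_str="An apple pie is a fruit pie ...",
#     )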


class FaithfulnessEvaluator(BaseEvaluator):
    """Faithfulness evaluator.

    Evaluates whether a response is faithful to the contexts
    (i.e. whether the response is supported by the contexts or hallucinated.)

    This evaluator only considers the response string and the list of context strings.

    Args:
        service_context(Optional[ServiceContext]):
            The service context to use for evaluation.
        raise_error(bool): Whether to raise an error when the response is invalid.
            Defaults to False.
        eval_template(Optional[Union[str, BasePromptTemplate]]):
            The template to use for evaluation.
        refine_template(Optional[Union[str, BasePromptTemplate]]):
            The template to use for refining the evaluation.
    """

    def __init__(
        self,
        service_context: ServiceContext | None = None,
        raise_error: bool = False,
        eval_template: str | BasePromptTemplate | None = None,
        refine_template: str | BasePromptTemplate | None = None,
    ) -> None:
        """Init params."""
        self._service_context = service_context or ServiceContext.from_defaults()
        self._raise_error = raise_error

        self._eval_template: BasePromptTemplate
        if isinstance(eval_template, str):
            self._eval_template = PromptTemplate(eval_template)
        else:
            self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE

        self._refine_template: BasePromptTemplate
        if isinstance(refine_template, str):
            self._refine_template = PromptTemplate(refine_template)
        else:
            self._refine_template = refine_template or DEFAULT_REFINE_TEMPLATE

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {
            "eval_template": self._eval_template,
            "refine_template": self._refine_template,
        }

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""
        if "eval_template" in prompts:
            self._eval_template = prompts["eval_template"]
        if "refine_template" in prompts:
            self._refine_template = prompts["refine_template"]

    async def aevaluate(
        self,
        query: str | None = None,
        response: str | None = None,
        contexts: Sequence[str] | None = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Evaluate whether the response is faithful to the contexts."""
        del query  # Unused
        del kwargs  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if contexts is None or response is None:
            raise ValueError("contexts and response must be provided")

        # Wrap each context string in a Document and build a SummaryIndex over
        # them; querying that index with the response text makes the LLM apply
        # the eval/refine templates to decide whether the response is supported.
        docs = [Document(text=context) for context in contexts]
        index = SummaryIndex.from_documents(docs, service_context=self._service_context)

        query_engine = index.as_query_engine(
            text_qa_template=self._eval_template,
            refine_template=self._refine_template,
        )
        response_obj = await query_engine.aquery(response)

        raw_response_txt = str(response_obj)

        # The templates instruct the LLM to answer YES or NO; any answer
        # containing "yes" counts as passing.
        if "yes" in raw_response_txt.lower():
            passing = True
        else:
            passing = False
            if self._raise_error:
                raise ValueError("The response is invalid")

        return EvaluationResult(
            response=response,
            contexts=contexts,
            passing=passing,
            score=1.0 if passing else 0.0,
            feedback=raw_response_txt,
        )


# legacy: backward compatibility
ResponseEvaluator = FaithfulnessEvaluator
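

# Usage sketch: a minimal way to run the evaluator directly. This block is
# illustrative and assumes ServiceContext.from_defaults() can resolve an LLM
# (e.g. OPENAI_API_KEY is set); the response and context strings are made up.
if __name__ == "__main__":

    async def _demo() -> None:
        evaluator = FaithfulnessEvaluator()
        result = await evaluator.aevaluate(
            response="Apple pie is generally double-crusted.",
            contexts=[
                "An apple pie is a fruit pie in which the principal filling "
                "ingredient is apples. It is generally double-crusted, with "
                "pastry both above and below the filling."
            ],
        )
        print(result.passing, result.score, result.feedback)

    asyncio.run(_demo())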