llama-index

Форк
0
1
"""PII postprocessor."""
2

3
import json
4
from copy import deepcopy
5
from typing import Callable, Dict, List, Optional, Tuple
6

7
from llama_index.legacy.postprocessor.types import BaseNodePostprocessor
8
from llama_index.legacy.prompts.base import PromptTemplate
9
from llama_index.legacy.schema import MetadataMode, NodeWithScore, QueryBundle
10
from llama_index.legacy.service_context import ServiceContext
11

12
# Few-shot prompt used to ask an LLM to mask PII.
#
# The template has two placeholders, ``{context_str}`` (the text to mask) and
# ``{query_str}`` (the task description).  The JSON example uses doubled
# braces (``{{`` / ``}}``) so that it survives ``str.format``-style
# substitution as literal braces.
DEFAULT_PII_TMPL = (
    "The current context information is provided. \n"
    "A task is also provided to mask the PII within the context. \n"
    "Return the text, with all PII masked out, and a mapping of the original PII "
    "to the masked PII. \n"
    "Return the output of the task in JSON. \n"
    "Context:\n"
    "Hello Zhang Wei, I am John. "
    "Your AnyCompany Financial Services, "
    "LLC credit card account 1111-0000-1111-0008 "
    "has a minimum payment of $24.53 that is due "
    "by July 31st. Based on your autopay settings, we will withdraw your payment. "
    "Task: Mask out the PII, replace each PII with a tag, and return the text. Return the mapping in JSON. \n"
    "Output: \n"
    "Hello [NAME1], I am [NAME2]. "
    "Your AnyCompany Financial Services, "
    "LLC credit card account [CREDIT_CARD_NUMBER] "
    "has a minimum payment of $24.53 that is due "
    "by [DATE_TIME]. Based on your autopay settings, we will withdraw your payment. "
    "Output Mapping:\n"
    '{{"NAME1": "Zhang Wei", "NAME2": "John", "CREDIT_CARD_NUMBER": "1111-0000-1111-0008", "DATE_TIME": "July 31st"}}\n'
    "Context:\n{context_str}\n"
    "Task: {query_str}\n"
    "Output: \n"
)
38

39

40
class PIINodePostprocessor(BaseNodePostprocessor):
    """PII Node processor.

    Masks PII in node text by prompting the LLM of the given service context
    and stores the mapping returned by the LLM in the metadata of the new node.

    NOTE: the ServiceContext should contain a LOCAL model, not an external API.

    NOTE: this is a beta feature, the API might change.

    Args:
        service_context (ServiceContext): Service context.

    """

    # Service context whose ``llm`` is prompted to perform the masking.
    service_context: ServiceContext
    # Prompt template text; filled with ``context_str`` and ``query_str``.
    pii_str_tmpl: str = DEFAULT_PII_TMPL
    # Metadata key under which the PII mapping is stored on each new node.
    pii_node_info_key: str = "__pii_node_info__"

    @classmethod
    def class_name(cls) -> str:
        """Return the name of this class."""
        return "PIINodePostprocessor"

    def mask_pii(self, text: str) -> Tuple[str, Dict]:
        """Mask PII in text.

        Args:
            text: Text whose PII should be masked.

        Returns:
            A tuple ``(masked_text, mapping)``: the part of the LLM response
            before the "Output Mapping:" marker, and the JSON value parsed
            from the part after it.

        Raises:
            IndexError: If the response contains no "Output Mapping:" marker.
            json.JSONDecodeError: If no JSON mapping can be parsed.
        """
        pii_prompt = PromptTemplate(self.pii_str_tmpl)
        # TODO: allow customization
        task_str = (
            "Mask out the PII, replace each PII with a tag, and return the text. "
            "Return the mapping in JSON."
        )

        response = self.service_context.llm.predict(
            pii_prompt, context_str=text, query_str=task_str
        )
        splits = response.split("Output Mapping:")
        if len(splits) < 2:
            # IndexError (rather than ValueError) matches the exception type
            # of a plain ``splits[1]`` lookup, so existing handlers still
            # apply; the message makes the failure diagnosable.
            raise IndexError(
                "LLM response does not contain an 'Output Mapping:' section; "
                "cannot extract the PII mapping."
            )
        text_output = splits[0].strip()
        json_str_output = splits[1].strip()
        try:
            json_dict = json.loads(json_str_output)
        except json.JSONDecodeError:
            # The model may wrap the JSON in a code fence or keep generating
            # after it.  Retry from the first "{" and decode only the first
            # JSON document, ignoring whatever follows.
            start = json_str_output.find("{")
            if start == -1:
                raise
            json_dict, _ = json.JSONDecoder().raw_decode(json_str_output[start:])
        return text_output, json_dict

    def _postprocess_nodes(
        self,
        nodes: List[NodeWithScore],
        query_bundle: Optional[QueryBundle] = None,
    ) -> List[NodeWithScore]:
        """Postprocess nodes.

        Args:
            nodes: Scored nodes whose content should be masked.
            query_bundle: Unused by this postprocessor.

        Returns:
            New scored nodes (deep copies of the inputs) with masked content
            and the PII mapping stored under ``pii_node_info_key``.
        """
        # swap out text from nodes, with the original node mappings
        new_nodes = []
        for node_with_score in nodes:
            node = node_with_score.node
            new_text, mapping_info = self.mask_pii(
                node.get_content(metadata_mode=MetadataMode.LLM)
            )
            new_node = deepcopy(node)
            # Keep the mapping out of the embedding and LLM views of the
            # metadata; skip keys already listed so that re-processing a node
            # does not accumulate duplicates.
            if self.pii_node_info_key not in new_node.excluded_embed_metadata_keys:
                new_node.excluded_embed_metadata_keys.append(self.pii_node_info_key)
            if self.pii_node_info_key not in new_node.excluded_llm_metadata_keys:
                new_node.excluded_llm_metadata_keys.append(self.pii_node_info_key)
            new_node.metadata[self.pii_node_info_key] = mapping_info
            new_node.set_content(new_text)
            new_nodes.append(NodeWithScore(node=new_node, score=node_with_score.score))

        return new_nodes
99

100

101
class NERPIINodePostprocessor(BaseNodePostprocessor):
    """NER PII Node processor.

    Uses a HF transformers model.

    """

    # Metadata key under which the PII mapping is stored on each new node.
    pii_node_info_key: str = "__pii_node_info__"

    @classmethod
    def class_name(cls) -> str:
        """Return the name of this class."""
        return "NERPIINodePostprocessor"

    def mask_pii(self, ner: Callable, text: str) -> Tuple[str, Dict]:
        """Mask PII in text.

        Args:
            ner: Callable that takes the text and returns an iterable of
                entries, each providing ``"entity_group"``, ``"start"`` and
                ``"word"``.
            text: Text whose entities should be masked.

        Returns:
            A tuple ``(masked_text, mapping)`` where every occurrence of a
            detected word is replaced by a ``[<entity_group>_<start>]`` tag
            and ``mapping`` maps each inserted tag to the word it replaced.
        """
        new_text = text
        response = ner(text)
        mapping = {}
        for entry in response:
            word = entry["word"]
            # Skip empty words: ``str.replace("", tag)`` would insert the tag
            # between every character.  Also skip words no longer present
            # (e.g. a repeated entity already replaced by an earlier entry),
            # so the mapping only lists tags that actually occur in the text.
            if word and word in new_text:
                entity_group_tag = f"[{entry['entity_group']}_{entry['start']}]"
                new_text = new_text.replace(word, entity_group_tag)
                mapping[entity_group_tag] = word
            new_text = new_text.strip()
        return new_text, mapping

    def _postprocess_nodes(
        self,
        nodes: List[NodeWithScore],
        query_bundle: Optional[QueryBundle] = None,
    ) -> List[NodeWithScore]:
        """Postprocess nodes.

        Args:
            nodes: Scored nodes whose content should be masked.
            query_bundle: Unused by this postprocessor.

        Returns:
            New scored nodes (deep copies of the inputs) with masked content
            and the PII mapping stored under ``pii_node_info_key``.
        """
        if not nodes:
            # Nothing to mask: avoid importing transformers and building the
            # NER pipeline for no work.
            return []

        from transformers import pipeline

        ner = pipeline("ner", grouped_entities=True)

        # swap out text from nodes, with the original node mappings
        new_nodes = []
        for node_with_score in nodes:
            node = node_with_score.node
            new_text, mapping_info = self.mask_pii(
                ner, node.get_content(metadata_mode=MetadataMode.LLM)
            )
            new_node = deepcopy(node)
            # Keep the mapping out of the embedding and LLM views of the
            # metadata; skip keys already listed so that re-processing a node
            # does not accumulate duplicates.
            if self.pii_node_info_key not in new_node.excluded_embed_metadata_keys:
                new_node.excluded_embed_metadata_keys.append(self.pii_node_info_key)
            if self.pii_node_info_key not in new_node.excluded_llm_metadata_keys:
                new_node.excluded_llm_metadata_keys.append(self.pii_node_info_key)
            new_node.metadata[self.pii_node_info_key] = mapping_info
            new_node.set_content(new_text)
            new_nodes.append(NodeWithScore(node=new_node, score=node_with_score.score))

        return new_nodes
150

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.