llama-index

Форк
0
164 строки · 9.2 Кб
1
"""Dataset Generator for Cross Encoder Finetuning."""
2

3
import re
4
from dataclasses import dataclass
5
from typing import List, Optional
6

7
from tqdm.auto import tqdm
8

9
from llama_index.legacy import VectorStoreIndex, get_tokenizer
10
from llama_index.legacy.llms import ChatMessage, OpenAI
11
from llama_index.legacy.llms.llm import LLM
12
from llama_index.legacy.node_parser import TokenTextSplitter
13
from llama_index.legacy.schema import Document, MetadataMode
14

15

16
@dataclass
class CrossEncoderFinetuningDatasetSample:
    """One (query, context, score) training example for a cross-encoder.

    Produced by ``generate_ce_fine_tuning_dataset``: ``score`` is a binary
    relevance label (1 = the context is relevant to the query, 0 = not).
    """

    # The question posed against the context chunk.
    query: str
    # The retrieved document chunk being judged for relevance.
    context: str
    # Binary relevance label assigned by the judging LLM (1 or 0).
    score: int
23

24

25
# System prompt for synthetic query generation. Placeholders:
#   {qa_topic} - subject area the "professor" persona is expert in.
#   {num_questions_per_chunk} - number of questions to request per chunk.
# NOTE: the original wording contained a garbled duplicated phrase
# ("such that such that each separate is separated"); fixed here to match
# the phrasing used in DEFAULT_QUERY_GEN_USER_PROMPT.
DEFAULT_QUERY_GEN_SYSTEM_PROMPT = """You are Albert a Professor proficient in {qa_topic}.
You are working on creating {num_questions_per_chunk} questions.
You provide the questions such that each separate question is separated by a semicolon ';' so that different questions can be easily separated by the python split function"""
28

29

30
DEFAULT_QUERY_GEN_USER_PROMPT = """Take a deep breath, read through the below provided document and then create {num_questions_per_chunk} questions and respond with the created questions such that each separate question is separated by a semicolon ';' so that different questions can be easily separated by the python split function.
31
Document: {context}"""
32

33

34
def generate_synthetic_queries_over_documents(
    documents: List[Document],
    num_questions_per_chunk: int = 5,
    max_chunk_length: int = 3000,
    qa_topic: str = "everything",
    llm: Optional[LLM] = None,
    qa_generate_system_msg: str = DEFAULT_QUERY_GEN_SYSTEM_PROMPT,
    qa_generate_user_msg: str = DEFAULT_QUERY_GEN_USER_PROMPT,
) -> List[str]:
    """Generate synthetic questions over documents with an LLM.

    Each document is split into chunks of at most ``max_chunk_length``
    tokens, and the LLM is asked to produce ``num_questions_per_chunk``
    questions per chunk, separated by semicolons.

    Args:
        documents: Source documents to generate questions from.
        num_questions_per_chunk: Number of questions requested per chunk.
        max_chunk_length: Maximum chunk size in tokens.
        qa_topic: Topic the prompt's "professor" persona is proficient in.
        llm: LLM used for generation; defaults to OpenAI gpt-3.5-turbo-16k.
        qa_generate_system_msg: System prompt template with ``{qa_topic}``
            and ``{num_questions_per_chunk}`` placeholders.
        qa_generate_user_msg: User prompt template with
            ``{num_questions_per_chunk}`` and ``{context}`` placeholders.

    Returns:
        Flat list of generated question strings, whitespace-stripped with
        empty fragments removed.
    """
    questions: List[str] = []
    node_parser = TokenTextSplitter(
        separator=" ",
        chunk_size=max_chunk_length,
        chunk_overlap=0,
        backup_separators=["\n"],
        tokenizer=get_tokenizer(),
    )

    llm = llm or OpenAI(model="gpt-3.5-turbo-16k", temperature=0.3)
    nodes = node_parser.get_nodes_from_documents(documents, show_progress=False)

    # The system prompt is chunk-independent; render it once outside the loop.
    system_msg = qa_generate_system_msg.format(
        num_questions_per_chunk=num_questions_per_chunk, qa_topic=qa_topic
    )
    # Questions come back separated by ';' (as instructed), but some model
    # responses use newlines instead, so split on either.
    question_splitter = re.compile(r";|\n")

    for node in tqdm(nodes):
        text = node.get_content(metadata_mode=MetadataMode.NONE)
        user_msg = qa_generate_user_msg.format(
            num_questions_per_chunk=num_questions_per_chunk, context=text
        )
        messages = [
            ChatMessage(role="system", content=system_msg),
            ChatMessage(role="user", content=user_msg),
        ]
        response = llm.chat(messages)
        response_content = response.message.content or ""
        # Strip whitespace and drop empty fragments produced by trailing
        # separators in the model response.
        questions.extend(
            q.strip()
            for q in question_splitter.split(response_content)
            if q.strip()
        )

    return questions
79

80

81
# Query-Doc relevance prompt taken from OpenAI cookbook:-
82
# https://github.com/openai/openai-cookbook/blob/main/examples/Search_reranking_with_cross-encoders.ipynb
83
DEFAULT_QUERY_DOC_RELEVANCE_PROMPT = '''You are an Assistant responsible for helping detect whether the retrieved document is relevant to the query. For a given input, you need to output a single token: "Yes" or "No" indicating the retrieved document is relevant to the query.
84

85
Query: How to plant a tree?
86
Document: """Cars were invented in 1886, when German inventor Carl Benz patented his Benz Patent-Motorwagen.[3][4][5] Cars became widely available during the 20th century. One of the first cars affordable by the masses was the 1908 Model T, an American car manufactured by the Ford Motor Company. Cars were rapidly adopted in the US, where they replaced horse-drawn carriages.[6] In Europe and other parts of the world, demand for automobiles did not increase until after World War II.[7] The car is considered an essential part of the developed economy."""
87
Relevant: No
88

89
Query: Has the coronavirus vaccine been approved?
90
Document: """The Pfizer-BioNTech COVID-19 vaccine was approved for emergency use in the United States on December 11, 2020."""
91
Relevant: Yes
92

93
Query: What is the capital of France?
94
Document: """Paris, France's capital, is a major European city and a global center for art, fashion, gastronomy and culture. Its 19th-century cityscape is crisscrossed by wide boulevards and the River Seine. Beyond such landmarks as the Eiffel Tower and the 12th-century, Gothic Notre-Dame cathedral, the city is known for its cafe culture and designer boutiques along the Rue du Faubourg Saint-Honoré."""
95
Relevant: Yes
96

97
Query: What are some papers to learn about PPO reinforcement learning?
98
Document: """Proximal Policy Optimization and its Dynamic Version for Sequence Generation: In sequence generation task, many works use policy gradient for model optimization to tackle the intractable backpropagation issue when maximizing the non-differentiable evaluation metrics or fooling the discriminator in adversarial learning. In this paper, we replace policy gradient with proximal policy optimization (PPO), which is a proved more efficient reinforcement learning algorithm, and propose a dynamic approach for PPO (PPO-dynamic). We demonstrate the efficacy of PPO and PPO-dynamic on conditional sequence generation tasks including synthetic experiment and chit-chat chatbot. The results show that PPO and PPO-dynamic can beat policy gradient by stability and performance."""
99
Relevant: Yes
100

101
Query: Explain sentence embeddings
102
Document: """Inside the bubble: exploring the environments of reionisation-era Lyman-α emitting galaxies with JADES and FRESCO: We present a study of the environments of 16 Lyman-α emitting galaxies (LAEs) in the reionisation era (5.8<z<8) identified by JWST/NIRSpec as part of the JWST Advanced Deep Extragalactic Survey (JADES). Unless situated in sufficiently (re)ionised regions, Lyman-α emission from these galaxies would be strongly absorbed by neutral gas in the intergalactic medium (IGM). We conservatively estimate sizes of the ionised regions required to reconcile the relatively low Lyman-α velocity offsets (ΔvLyα<300kms−1) with moderately high Lyman-α escape fractions (fesc,Lyα>5%) observed in our sample of LAEs, indicating the presence of ionised ``bubbles'' with physical sizes of the order of 0.1pMpc≲Rion≲1pMpc in a patchy reionisation scenario where the bubbles are embedded in a fully neutral IGM. Around half of the LAEs in our sample are found to coincide with large-scale galaxy overdensities seen in FRESCO at z∼5.8-5.9 and z∼7.3, suggesting Lyman-α transmission is strongly enhanced in such overdense regions, and underlining the importance of LAEs as tracers of the first large-scale ionised bubbles. Considering only spectroscopically confirmed galaxies, we find our sample of UV-faint LAEs (MUV≳−20mag) and their direct neighbours are generally not able to produce the required ionised regions based on the Lyman-α transmission properties, suggesting lower-luminosity sources likely play an important role in carving out these bubbles. These observations demonstrate the combined power of JWST multi-object and slitless spectroscopy in acquiring a unique view of the early stages of Cosmic Reionisation via the most distant LAEs."""
103
Relevant: No
104

105
Query: {query}
106
Document: """{document}"""
107
Relevant:
108
'''
109

110

111
def generate_ce_fine_tuning_dataset(
    documents: List[Document],
    questions_list: List[str],
    max_chunk_length: int = 1000,
    llm: Optional[LLM] = None,
    qa_doc_relevance_prompt: str = DEFAULT_QUERY_DOC_RELEVANCE_PROMPT,
    top_k: int = 8,
) -> List[CrossEncoderFinetuningDatasetSample]:
    """Build a cross-encoder fine-tuning dataset from documents and queries.

    For every question, the ``top_k`` most similar chunks are retrieved from
    a vector index built over the documents, and an LLM judges each
    (question, chunk) pair as relevant ("Yes" -> score 1) or irrelevant
    ("No" -> score 0). Pairs with any other LLM verdict are skipped.

    Args:
        documents: Documents to build the retrieval index from.
        questions_list: Queries to retrieve and label chunks for; empty
            strings are skipped.
        max_chunk_length: Maximum chunk size in tokens.
        llm: LLM used as the relevance judge; defaults to OpenAI
            gpt-3.5-turbo-16k with a logit bias toward "Yes"/"No" tokens.
        qa_doc_relevance_prompt: Few-shot prompt template with ``{query}``
            and ``{document}`` placeholders.
        top_k: Number of chunks retrieved per question.

    Returns:
        List of labeled ``CrossEncoderFinetuningDatasetSample`` rows.
    """
    # Maps the (lowercased, stripped) LLM verdict to a binary relevance
    # label; any other response is ignored.
    verdict_to_score = {"yes": 1, "no": 0}

    ce_dataset_list: List[CrossEncoderFinetuningDatasetSample] = []

    node_parser = TokenTextSplitter(
        separator=" ",
        chunk_size=max_chunk_length,
        chunk_overlap=0,
        backup_separators=["\n"],
        tokenizer=get_tokenizer(),
    )

    # Use logit bias in case of OpenAI for the tokens for Yes and No
    # to decrease the likelihood of any other tokens occurring
    llm = llm or OpenAI(
        model="gpt-3.5-turbo-16k", temperature=0.1, logit_bias={9642: 1, 2822: 1}
    )

    nodes = node_parser.get_nodes_from_documents(documents, show_progress=False)

    index = VectorStoreIndex(nodes)
    retriever = index.as_retriever(similarity_top_k=top_k)

    for question in tqdm(questions_list):
        if not question:
            continue
        for node in retriever.retrieve(question):
            node_content = node.get_text()
            msg_prompt = qa_doc_relevance_prompt.format(
                query=question, document=node_content
            )
            verdict = llm.complete(msg_prompt).text.strip().lower()
            score = verdict_to_score.get(verdict)
            if score is not None:
                ce_dataset_list.append(
                    CrossEncoderFinetuningDatasetSample(
                        query=question, context=node_content, score=score
                    )
                )

    return ce_dataset_list
165

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.