# llama-index — common utils for embeddings (104 lines, 3.1 KB)
1"""Common utils for embeddings."""
2
3import json4import re5import uuid6from typing import Dict, List, Tuple7
8from tqdm import tqdm9
10from llama_index.legacy.bridge.pydantic import BaseModel11from llama_index.legacy.llms.utils import LLM12from llama_index.legacy.schema import MetadataMode, TextNode13
14
class EmbeddingQAFinetuneDataset(BaseModel):
    """Dataset of query/context pairs for embedding fine-tuning.

    Args:
        queries (Dict[str, str]): Dict id -> query.
        corpus (Dict[str, str]): Dict id -> string.
        relevant_docs (Dict[str, List[str]]): Dict query id -> list of doc ids.

    """

    queries: Dict[str, str]  # dict id -> query
    corpus: Dict[str, str]  # dict id -> string
    relevant_docs: Dict[str, List[str]]  # query id -> list of doc ids
    mode: str = "text"

    @property
    def query_docid_pairs(self) -> List[Tuple[str, List[str]]]:
        """Get query, relevant doc ids."""
        pairs: List[Tuple[str, List[str]]] = []
        for query_id, query_text in self.queries.items():
            pairs.append((query_text, self.relevant_docs[query_id]))
        return pairs

    def save_json(self, path: str) -> None:
        """Serialize the dataset to ``path`` as indented JSON."""
        payload = self.dict()
        with open(path, "w") as out_file:
            json.dump(payload, out_file, indent=4)

    @classmethod
    def from_json(cls, path: str) -> "EmbeddingQAFinetuneDataset":
        """Load a dataset previously written by :meth:`save_json`."""
        with open(path) as in_file:
            payload = json.load(in_file)
        return cls(**payload)
50
# Default prompt for generate_qa_embedding_pairs: fills in {context_str} and
# {num_questions_per_chunk}, asking the LLM to produce quiz-style questions
# answerable from the given context only.
# NOTE(review): the `"` before the closing triple-quote looks stray and is
# emitted verbatim to the LLM — confirm before changing, since altering the
# prompt changes runtime behavior.
DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""
67
68
# generate queries as a convenience function
def generate_qa_embedding_pairs(
    nodes: List[TextNode],
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
) -> EmbeddingQAFinetuneDataset:
    """Generate examples given a set of nodes."""
    # Map node id -> node text (metadata excluded from the content).
    node_dict = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in nodes
    }

    queries: Dict[str, str] = {}
    relevant_docs: Dict[str, List[str]] = {}
    for node_id, text in tqdm(node_dict.items()):
        prompt = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(prompt)

        # One candidate question per response line; strip any leading
        # enumeration such as "1.", "2)" and drop empty lines.
        raw_lines = str(response).strip().split("\n")
        for raw_line in raw_lines:
            question = re.sub(r"^\d+[\).\s]", "", raw_line).strip()
            if not question:
                continue
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

    # construct dataset
    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )