rag-the-next-generation-of-conversational-ai-for-chatbots
94 lines · 3.3 KB
1# import Libraries
2from pypdf import PdfReader3import faiss4import streamlit as st5import os6from langchain.vectorstores import chroma7from langchain.document_loaders import PyPDFLoader8from langchain.docstore.document import Document9from io import BytesIO10from typing import Any, Callable, Dict, List, Optional, Tuple, Union11from langchain.text_splitter import RecursiveCharacterTextSplitter12import re13from langchain.embeddings import OpenAIEmbeddings14from langchain.vectorstores.faiss import FAISS15import pickle16
17
def parse_pdf(file: BytesIO, filename: str) -> Tuple[list[str], str]:
    """
    Parse a PDF file and return its cleaned page texts plus the filename.

    Args:
        file: In-memory binary stream containing the PDF.
        filename: Name associated with the PDF; returned unchanged so the
            caller can keep the pages paired with their origin.

    Returns:
        A tuple ``(pages, filename)`` where ``pages`` holds one cleaned
        text string per PDF page.
    """
    pdf = PdfReader(file)
    pages = []

    for page in pdf.pages:
        # Extract raw text; fall back to "" for image-only pages where
        # extraction yields nothing.
        text = page.extract_text() or ""

        # Re-join words that were split by a hyphen at a line break.
        text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)
        # Replace single newlines with spaces, but not those flanked by spaces.
        text = re.sub(r'(?<! )\n(?![ ])', ' ', text.strip())
        # Consolidate runs of blank lines into exactly one blank line.
        text = re.sub(r'\n\s*\n', '\n\n', text)

        pages.append(text)

    # Return the cleaned page texts together with the filename
    # (annotation fixed: the second tuple element is a str, not a list).
    return pages, filename
42
def text_to_docs(text: list[str], filename: str) -> list[Document]:
    """
    Convert page texts into chunked ``Document`` objects with provenance metadata.

    Args:
        text: List of page texts (a bare string is accepted and treated as a
            single page).
        filename: Name of the source file, recorded in each chunk's metadata.

    Returns:
        A list of ``Document`` chunks, each carrying ``page_number`` (1-based),
        ``chunk_number`` (0-based within its page), ``source_filename``
        ("page-chunk-filename"), and ``filename`` metadata.
    """
    # Accept a single string for convenience and normalize to a list of pages.
    if isinstance(text, str):
        text = [text]

    # One Document per page.
    page_docs = [Document(page_content=page) for page in text]

    # Pages are numbered starting at 1.
    for i, doc in enumerate(page_docs):
        doc.metadata["page_number"] = i + 1

    doc_chunks = []

    # The splitter is pure configuration, so build it once instead of once
    # per page (the original re-created it inside the loop).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        chunk_overlap=0,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )

    for doc in page_docs:
        # Split the page into chunks of at most chunk_size characters.
        chunks = text_splitter.split_text(doc.page_content)

        # Store each chunk as its own Document with provenance metadata.
        # A distinct name avoids rebinding (shadowing) the outer `doc`,
        # which the original relied on working only by accident.
        for i, chunk in enumerate(chunks):
            chunk_doc = Document(
                page_content=chunk,
                metadata={
                    "page_number": doc.metadata["page_number"],
                    "chunk_number": i,
                },
            )
            # Fixed: embed the actual filename instead of a hard-coded
            # "(unknown)" placeholder.
            chunk_doc.metadata["source_filename"] = (
                f"{chunk_doc.metadata['page_number']}-"
                f"{chunk_doc.metadata['chunk_number']}-{filename}"
            )
            chunk_doc.metadata["filename"] = filename
            doc_chunks.append(chunk_doc)

    return doc_chunks
def docs_to_index(docs, openai_api_key):
    """Build a FAISS vector index over ``docs`` using OpenAI embeddings.

    Args:
        docs: Documents to embed and index.
        openai_api_key: API key passed through to ``OpenAIEmbeddings``.

    Returns:
        The FAISS index built from the documents.
    """
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_documents(docs, embeddings)
def get_index_for_pdf(pdf_files, pdf_names, openai_api_key):
    """Parse each PDF, chunk its pages into documents, and index them all.

    Args:
        pdf_files: Raw PDF contents, one ``bytes`` object per file.
        pdf_names: Filenames matching ``pdf_files`` position by position.
        openai_api_key: API key used for the embedding calls.

    Returns:
        A single FAISS index covering the chunks of every supplied PDF.
    """
    documents = []
    for raw_bytes, name in zip(pdf_files, pdf_names):
        pages, parsed_name = parse_pdf(BytesIO(raw_bytes), name)
        documents.extend(text_to_docs(pages, parsed_name))
    return docs_to_index(documents, openai_api_key)
96