rag-the-next-generation-of-conversational-ai-for-chatbots

# Import libraries
import re
from io import BytesIO
from typing import List, Tuple

from pypdf import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    """
    Parse a PDF file and return its cleaned page texts along with the
    filename.
    """
    pdf = PdfReader(file)
    pages = []

    for page in pdf.pages:
        # Extract the raw text from the page
        text = page.extract_text()

        # Rejoin words that were split by a hyphen at the end of a line
        text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)
        # Replace single newlines with spaces, but not those flanked by spaces
        text = re.sub(r'(?<! )\n(?![ ])', ' ', text.strip())
        # Collapse runs of blank lines into a single blank line
        text = re.sub(r'\n\s*\n', '\n\n', text)

        # Append the cleaned text for this page
        pages.append(text)

    # Return the cleaned page texts and the filename
    return pages, filename
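
# What each cleanup regex above does in isolation (the input strings are
# made up for illustration):
#
#   "transfor-\nmer"  ->  "transformer"    (hyphenated line break rejoined)
#   "one line\nnext"  ->  "one line next"  (single newline becomes a space)
#   "a\n\n\n\nb"      ->  "a\n\nb"         (blank-line runs collapsed)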


def text_to_docs(text: List[str], filename: str) -> List[Document]:
    """
    Convert a list of page texts into chunked Documents with page,
    chunk, and filename metadata. A single string is also accepted
    and wrapped in a list.
    """
    if isinstance(text, str):
        text = [text]

    # Convert each page's text to a Document
    page_docs = [Document(page_content=page) for page in text]

    # Assign a 1-based page number to each page document
    for i, doc in enumerate(page_docs):
        doc.metadata["page_number"] = i + 1

    # Initialize the text splitter with the chunk size and separators
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        chunk_overlap=0,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )

    doc_chunks = []

    # Split each page into smaller chunks and store each chunk as its own
    # Document, recording its chunk number, page number, and filename
    for doc in page_docs:
        chunks = text_splitter.split_text(doc.page_content)

        for i, chunk in enumerate(chunks):
            chunk_doc = Document(
                page_content=chunk,
                metadata={
                    "page_number": doc.metadata["page_number"],
                    "chunk_number": i,
                },
            )
            chunk_doc.metadata["source_filename"] = (
                f"{doc.metadata['page_number']}-{i}-{filename}"
            )
            chunk_doc.metadata["filename"] = filename
            doc_chunks.append(chunk_doc)

    return doc_chunks
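
# For a hypothetical PDF named "paper.pdf", the first chunk of page 2
# produced above carries metadata like:
#   {"page_number": 2, "chunk_number": 0,
#    "source_filename": "2-0-paper.pdf", "filename": "paper.pdf"}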


def docs_to_index(docs, openai_api_key):
    # Embed the documents with OpenAI embeddings and build a FAISS index
    index = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=openai_api_key))
    return index
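

# A minimal sketch for persisting the index between runs, assuming the
# FAISS.save_local / FAISS.load_local API of this langchain version; the
# helper names and the default folder "pdf_index" are assumptions, not
# part of the original file.
def save_index(index: FAISS, path: str = "pdf_index") -> None:
    # Write the FAISS vectors and docstore to a local folder
    index.save_local(path)


def load_index(openai_api_key: str, path: str = "pdf_index") -> FAISS:
    # Reload a saved index with the same embedding model used to build it
    return FAISS.load_local(path, OpenAIEmbeddings(openai_api_key=openai_api_key))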


def get_index_for_pdf(pdf_files, pdf_names, openai_api_key):
    # Parse and chunk every uploaded PDF, then build one combined index
    documents = []
    for pdf_file, pdf_name in zip(pdf_files, pdf_names):
        pages, filename = parse_pdf(BytesIO(pdf_file), pdf_name)
        documents = documents + text_to_docs(pages, filename)
    index = docs_to_index(documents, openai_api_key)
    return index
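
# A minimal usage sketch from a Streamlit app, kept as a comment so the
# module stays import-safe; the uploader label and the secrets key name
# are assumptions:
#
#   import streamlit as st
#
#   uploaded = st.file_uploader("Upload PDFs", type="pdf",
#                               accept_multiple_files=True)
#   if uploaded:
#       index = get_index_for_pdf(
#           [f.getvalue() for f in uploaded],
#           [f.name for f in uploaded],
#           st.secrets["OPENAI_API_KEY"],
#       )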