local-llm-with-rag
/
document_loader.py
52 строки · 1.5 Кб
1from langchain_community.document_loaders import (
2DirectoryLoader,
3PyPDFLoader,
4TextLoader,
5)
6import os
7from typing import List
8from langchain_core.documents import Document
9
10
def load_documents(path: str) -> List[Document]:
    """
    Load the PDF and Markdown documents found under a directory.

    The directory tree is searched recursively. Files matching ``**/*.pdf``
    are read with ``PyPDFLoader`` and files matching ``**/*.md`` are read
    with ``TextLoader``. No other file types (for example HTML) are loaded.

    Only the existence of ``path`` is verified here; whether it is actually
    a directory is not checked by this function and is left to the
    underlying ``DirectoryLoader``.

    Args:
        path: The path to the directory containing documents to load.

    Returns:
        The loaded documents: everything produced by the PDF loader first,
        followed by everything produced by the Markdown loader.

    Raises:
        FileNotFoundError: If the specified path does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"The specified path does not exist: {path}")

    # One recursive DirectoryLoader per supported extension. The dict key is
    # used only for the progress message printed below; the glob pattern is
    # what actually selects the files.
    loaders = {
        ".pdf": DirectoryLoader(
            path,
            glob="**/*.pdf",
            loader_cls=PyPDFLoader,
            show_progress=True,
            use_multithreading=True,
        ),
        ".md": DirectoryLoader(
            path,
            glob="**/*.md",
            loader_cls=TextLoader,
            show_progress=True,
        ),
    }

    docs: List[Document] = []
    # Dicts preserve insertion order, so PDFs are always loaded before
    # Markdown files.
    for file_type, loader in loaders.items():
        print(f"Loading {file_type} files")
        docs.extend(loader.load())
    return docs
53