llama-index
427 строк · 15.4 Кб
1"""Simple reader that reads files of different formats from a directory."""
2
3import logging4import mimetypes5import multiprocessing6import os7import warnings8from datetime import datetime9from functools import reduce10from itertools import repeat11from pathlib import Path12from typing import Any, Callable, Dict, Generator, List, Optional, Type13
14from tqdm import tqdm15
16from llama_index.legacy.readers.base import BaseReader17from llama_index.legacy.readers.file.docs_reader import DocxReader, HWPReader, PDFReader18from llama_index.legacy.readers.file.epub_reader import EpubReader19from llama_index.legacy.readers.file.image_reader import ImageReader20from llama_index.legacy.readers.file.ipynb_reader import IPYNBReader21from llama_index.legacy.readers.file.markdown_reader import MarkdownReader22from llama_index.legacy.readers.file.mbox_reader import MboxReader23from llama_index.legacy.readers.file.slides_reader import PptxReader24from llama_index.legacy.readers.file.tabular_reader import PandasCSVReader25from llama_index.legacy.readers.file.video_audio_reader import VideoAudioReader26from llama_index.legacy.schema import Document27
# Default mapping from (lower-case, dot-prefixed) file suffix to the reader
# class used for that file type. Classes are stored here, not instances:
# SimpleDirectoryReader.load_file instantiates a reader the first time a file
# with the matching suffix is loaded and no user-supplied extractor exists.
DEFAULT_FILE_READER_CLS: Dict[str, Type[BaseReader]] = {
    ".hwp": HWPReader,
    ".pdf": PDFReader,
    ".docx": DocxReader,
    ".pptx": PptxReader,
    ".ppt": PptxReader,
    ".pptm": PptxReader,
    ".jpg": ImageReader,
    ".png": ImageReader,
    ".jpeg": ImageReader,
    ".mp3": VideoAudioReader,
    ".mp4": VideoAudioReader,
    ".csv": PandasCSVReader,
    ".epub": EpubReader,
    ".md": MarkdownReader,
    ".mbox": MboxReader,
    ".ipynb": IPYNBReader,
}
46
47
def default_file_metadata_func(file_path: str) -> Dict:
    """Get some handy metadata from the filesystem.

    The file is stat'ed exactly once so that the size and all three
    timestamps come from the same snapshot.

    Args:
        file_path (str): Path of the file to describe.

    Returns:
        Dict: ``file_path``, ``file_name``, ``file_type`` (MIME type guessed
        from the name, or None), ``file_size`` (bytes) and the creation,
        last-modified and last-accessed dates formatted as ``YYYY-MM-DD``.

    Raises:
        OSError: If ``file_path`` cannot be stat'ed (e.g. it does not exist).
    """
    stat_result = Path(file_path).stat()

    def _as_date(timestamp: float) -> str:
        # Local-time calendar date of a POSIX timestamp.
        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

    return {
        "file_path": file_path,
        "file_name": os.path.basename(file_path),
        "file_type": mimetypes.guess_type(file_path)[0],
        "file_size": stat_result.st_size,
        "creation_date": _as_date(stat_result.st_ctime),
        "last_modified_date": _as_date(stat_result.st_mtime),
        "last_accessed_date": _as_date(stat_result.st_atime),
    }
70
# Module-level logger; SimpleDirectoryReader uses it to report (at DEBUG
# level) how many files were selected from the input directory.
logger = logging.getLogger(__name__)
73
class SimpleDirectoryReader(BaseReader):
    """Simple directory reader.

    Load files from file directory.
    Automatically select the best file reader given file extensions.

    Args:
        input_dir (str): Path to the directory.
        input_files (List): List of file paths to read
            (Optional; overrides input_dir, exclude)
        exclude (List): Glob patterns, evaluated relative to ``input_dir``,
            of files to exclude (Optional).
        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
        encoding (str): Encoding of the files.
            Default is utf-8.
        errors (str): how encoding and decoding errors are to be handled,
            see https://docs.python.org/3/library/functions.html#open
        recursive (bool): Whether to recursively search in subdirectories.
            False by default.
        filename_as_id (bool): Whether to use the filename as the document id.
            False by default.
        required_exts (Optional[List[str]]): List of required extensions.
            Compared verbatim against ``Path.suffix``, so entries need the
            leading dot (e.g. ``".pdf"``). Default is None.
        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
            extension to a BaseReader instance that specifies how to convert
            that file to text. Extensions that are missing fall back to
            DEFAULT_FILE_READER_CLS.
        num_files_limit (Optional[int]): Maximum number of files to read
            when scanning ``input_dir``. None or a non-positive value means
            no limit. Default is None.
        file_metadata (Optional[Callable[str, Dict]]): A function that takes
            in a filename and returns a Dict of metadata for the Document.
            Default is None, in which case default_file_metadata_func is used.
    """

    supported_suffix = list(DEFAULT_FILE_READER_CLS.keys())

    def __init__(
        self,
        input_dir: Optional[str] = None,
        input_files: Optional[List] = None,
        exclude: Optional[List] = None,
        exclude_hidden: bool = True,
        errors: str = "ignore",
        recursive: bool = False,
        encoding: str = "utf-8",
        filename_as_id: bool = False,
        required_exts: Optional[List[str]] = None,
        file_extractor: Optional[Dict[str, BaseReader]] = None,
        num_files_limit: Optional[int] = None,
        file_metadata: Optional[Callable[[str], Dict]] = None,
    ) -> None:
        """Initialize with parameters.

        Raises:
            ValueError: If neither ``input_dir`` nor ``input_files`` is given,
                if an entry of ``input_files`` is not an existing file, if
                ``input_dir`` is not an existing directory, or if the
                directory scan selects no files.
        """
        super().__init__()

        if not input_dir and not input_files:
            raise ValueError("Must provide either `input_dir` or `input_files`.")

        self.errors = errors
        self.encoding = encoding

        self.exclude = exclude
        self.recursive = recursive
        self.exclude_hidden = exclude_hidden
        self.required_exts = required_exts
        self.num_files_limit = num_files_limit

        if input_files:
            # An explicit file list wins over input_dir; the directory
            # filters (exclude, required_exts, ...) are not applied to it.
            self.input_files = []
            for path in input_files:
                if not os.path.isfile(path):
                    raise ValueError(f"File {path} does not exist.")
                self.input_files.append(Path(path))
        elif input_dir:
            if not os.path.isdir(input_dir):
                raise ValueError(f"Directory {input_dir} does not exist.")
            self.input_dir = Path(input_dir)
            self.input_files = self._add_files(self.input_dir)

        # Keep the caller's dict object when one is given: load_file caches
        # lazily-created default readers in it.
        self.file_extractor = file_extractor if file_extractor is not None else {}

        self.file_metadata = file_metadata or default_file_metadata_func
        self.filename_as_id = filename_as_id

    def is_hidden(self, path: Path) -> bool:
        """Return True if any component of ``path`` is a dotfile/dot-directory.

        The special components ``.`` and ``..`` are not considered hidden.
        """
        return any(
            part.startswith(".") and part not in [".", ".."] for part in path.parts
        )

    def _add_files(self, input_dir: Path) -> List[Path]:
        """Collect the files under ``input_dir`` that pass all filters.

        Args:
            input_dir (Path): Directory to scan (recursively when
                ``self.recursive`` is set).

        Returns:
            List[Path]: Sorted file paths, truncated to ``num_files_limit``
            when that limit is a positive integer.

        Raises:
            ValueError: If no file passes the filters.
        """
        root = Path(input_dir)
        all_files = set()
        rejected_files = set()

        if self.exclude is not None:
            # Expand every exclusion pattern with the same (non-)recursive
            # glob flavour that is used for the scan itself.
            glob_excluded = root.rglob if self.recursive else root.glob
            for excluded_pattern in self.exclude:
                rejected_files.update(glob_excluded(excluded_pattern))

        file_refs: Generator[Path, None, None]
        file_refs = root.rglob("*") if self.recursive else root.glob("*")

        for ref in file_refs:
            # Manually check if file is hidden or directory instead of
            # in glob for backwards compatibility.
            if ref.is_dir():
                continue
            if self.exclude_hidden and self.is_hidden(ref):
                continue
            if (
                self.required_exts is not None
                and ref.suffix not in self.required_exts
            ):
                continue
            if ref in rejected_files:
                continue
            all_files.add(ref)

        new_input_files = sorted(all_files)

        if len(new_input_files) == 0:
            raise ValueError(f"No files found in {input_dir}.")

        if self.num_files_limit is not None and self.num_files_limit > 0:
            new_input_files = new_input_files[0 : self.num_files_limit]

        # report total number of files added
        logger.debug(
            "> [SimpleDirectoryReader] Total files added: %s", len(new_input_files)
        )

        return new_input_files

    def _exclude_metadata(self, documents: List[Document]) -> List[Document]:
        """Exclude metadata from documents.

        Args:
            documents (List[Document]): List of documents; modified in place.

        Returns:
            List[Document]: The same list that was passed in.
        """
        # Keep only metadata['file_path'] in both embedding and llm content
        # str, which contains extremely important context about the chunks.
        # Dates are provided for the convenience of postprocessors such as
        # TimeWeightedPostprocessor, but excluded from embedding and LLM prompts.
        hidden_keys = [
            "file_name",
            "file_type",
            "file_size",
            "creation_date",
            "last_modified_date",
            "last_accessed_date",
        ]
        for doc in documents:
            doc.excluded_embed_metadata_keys.extend(hidden_keys)
            doc.excluded_llm_metadata_keys.extend(hidden_keys)

        return documents

    @staticmethod
    def load_file(
        input_file: Path,
        file_metadata: Callable[[str], Dict],
        file_extractor: Dict[str, BaseReader],
        filename_as_id: bool = False,
        encoding: str = "utf-8",
        errors: str = "ignore",
    ) -> List[Document]:
        """Static method for loading file.

        NOTE: necessarily as a static method for parallel processing.

        Args:
            input_file (Path): File path to read
            file_metadata ([Callable[str, Dict]]): A function that takes
                in a filename and returns a Dict of metadata for the Document.
            file_extractor (Dict[str, BaseReader]): A mapping of file
                extension to a BaseReader instance that specifies how to
                convert that file to text. Mutated in place: a default reader
                created for a supported extension is stored under that
                extension for reuse.
            filename_as_id (bool): Whether to use the filename as the document id.
            encoding (str): Encoding of the files.
                Default is utf-8.
            errors (str): how encoding and decoding errors are to be handled,
                see https://docs.python.org/3/library/functions.html#open

        Returns:
            List[Document]: loaded documents; empty if a file reader failed
            with anything other than ImportError.

        Raises:
            ImportError: If the file reader reports a missing dependency.
        """
        metadata: Optional[dict] = None
        documents: List[Document] = []

        if file_metadata is not None:
            metadata = file_metadata(str(input_file))

        file_suffix = input_file.suffix.lower()
        if (
            file_suffix in SimpleDirectoryReader.supported_suffix
            or file_suffix in file_extractor
        ):
            # use file readers
            if file_suffix not in file_extractor:
                # instantiate file reader if not already
                reader_cls = DEFAULT_FILE_READER_CLS[file_suffix]
                file_extractor[file_suffix] = reader_cls()
            reader = file_extractor[file_suffix]

            # load data -- catch all errors except for ImportError
            try:
                docs = reader.load_data(input_file, extra_info=metadata)
            except ImportError as e:
                # ensure that ImportError is raised so user knows
                # about missing dependencies; keep the original as the cause
                raise ImportError(str(e)) from e
            except Exception as e:
                # otherwise, just skip the file and report the error
                print(
                    f"Failed to load file {input_file} with error: {e}. Skipping...",
                    flush=True,
                )
                return []

            # iterate over docs if needed
            if filename_as_id:
                for i, doc in enumerate(docs):
                    doc.id_ = f"{input_file!s}_part_{i}"

            documents.extend(docs)
        else:
            # do standard read
            with open(input_file, errors=errors, encoding=encoding) as f:
                data = f.read()

            doc = Document(text=data, metadata=metadata or {})
            if filename_as_id:
                doc.id_ = str(input_file)

            documents.append(doc)

        return documents

    def load_data(
        self, show_progress: bool = False, num_workers: Optional[int] = None
    ) -> List[Document]:
        """Load data from the input directory.

        Args:
            show_progress (bool): Whether to show tqdm progress bars. Only
                used when loading sequentially. Defaults to False.
            num_workers (Optional[int]): When greater than 1, files are loaded
                in a pool of that many spawned worker processes, capped at
                the CPU count. Otherwise files are loaded sequentially in
                this process. Defaults to None.

        Returns:
            List[Document]: A list of documents.
        """
        documents = []

        files_to_process = self.input_files

        if num_workers and num_workers > 1:
            cpu_count = multiprocessing.cpu_count()
            if num_workers > cpu_count:
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
                # Actually apply the cap announced by the warning.
                num_workers = cpu_count
            with multiprocessing.get_context("spawn").Pool(num_workers) as p:
                results = p.starmap(
                    SimpleDirectoryReader.load_file,
                    zip(
                        files_to_process,
                        repeat(self.file_metadata),
                        repeat(self.file_extractor),
                        repeat(self.filename_as_id),
                        repeat(self.encoding),
                        repeat(self.errors),
                    ),
                )
                # Flatten the per-file lists in one linear pass.
                documents = [doc for file_docs in results for doc in file_docs]

        else:
            if show_progress:
                files_to_process = tqdm(
                    self.input_files, desc="Loading files", unit="file"
                )
            for input_file in files_to_process:
                documents.extend(
                    SimpleDirectoryReader.load_file(
                        input_file=input_file,
                        file_metadata=self.file_metadata,
                        file_extractor=self.file_extractor,
                        filename_as_id=self.filename_as_id,
                        encoding=self.encoding,
                        errors=self.errors,
                    )
                )

        return self._exclude_metadata(documents)

    def iter_data(
        self, show_progress: bool = False
    ) -> Generator[List[Document], Any, Any]:
        """Load data iteratively from the input directory.

        Args:
            show_progress (bool): Whether to show tqdm progress bars. Defaults to False.

        Yields:
            List[Document]: The documents loaded from one input file. Files
            that produce no documents are skipped.
        """
        files_to_process = self.input_files

        if show_progress:
            files_to_process = tqdm(self.input_files, desc="Loading files", unit="file")

        for input_file in files_to_process:
            documents = SimpleDirectoryReader.load_file(
                input_file=input_file,
                file_metadata=self.file_metadata,
                file_extractor=self.file_extractor,
                filename_as_id=self.filename_as_id,
                encoding=self.encoding,
                errors=self.errors,
            )

            documents = self._exclude_metadata(documents)

            if len(documents) > 0:
                yield documents