llama-index
40 строк · 1.3 Кб
1import re2from pathlib import Path3from typing import Dict, List, Optional4
5from llama_index.legacy.readers.base import BaseReader6from llama_index.legacy.schema import Document7
8
class IPYNBReader(BaseReader):
    """Reader for Jupyter notebook (``.ipynb``) files.

    The notebook is exported to a script with nbconvert's ``ScriptExporter``
    and the script text is split on its ``In[...]:`` cell markers, giving one
    text chunk per cell.
    """

    def __init__(
        self,
        parser_config: Optional[Dict] = None,
        concatenate: bool = False,
    ):
        """Init params.

        Args:
            parser_config: Optional settings; stored on the instance but not
                read anywhere in this class.
            concatenate: If True, ``load_data`` returns a single document with
                all cells joined by blank lines; otherwise it returns one
                document per cell.
        """
        self._parser_config = parser_config
        self._concatenate = concatenate

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a notebook file into documents.

        Args:
            file: Path to the notebook; its name must end with ``.ipynb``.
            extra_info: Optional metadata attached to every returned document.

        Returns:
            One document containing all cells when ``concatenate`` was set,
            otherwise one document per cell.

        Raises:
            ValueError: If ``file`` does not have an ``.ipynb`` extension.
            ImportError: If nbconvert is not installed.
        """
        # Fail with a clear message instead of hitting unbound locals further
        # down when the file is not a notebook.
        if not file.name.endswith(".ipynb"):
            raise ValueError(
                f"IPYNBReader can only read .ipynb files, got: {file.name}"
            )

        # nbconvert is an optional dependency, so import it lazily.
        try:
            import nbconvert
        except ImportError as err:
            raise ImportError(
                "Please install nbconvert 'pip install nbconvert' "
            ) from err

        string = nbconvert.exporters.ScriptExporter().from_file(file)[0]
        # Split each In[] cell into a separate string. The counter may be
        # blank ("In[ ]:") for cells that were never executed, so digits are
        # optional in the pattern.
        splits = re.split(r"In\[\s*\d*\s*\]:", string)
        # Drop the first element: it is the text before the first cell marker,
        # not the content of a cell.
        splits.pop(0)

        metadata = extra_info or {}
        if self._concatenate:
            return [Document(text="\n\n".join(splits), metadata=metadata)]
        return [Document(text=s, metadata=metadata) for s in splits]