llama-index
43 строки · 1.2 Кб
1"""Epub parser.
2
3Contains parsers for epub files.
4"""
5
6from pathlib import Path
7from typing import Dict, List, Optional
8
9from llama_index.legacy.readers.base import BaseReader
10from llama_index.legacy.schema import Document
11
12
class EpubReader(BaseReader):
    """Reader that turns an EPUB file into a single plain-text ``Document``."""

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse an EPUB file and return its text as one document.

        Every item of the book whose type is ``ebooklib.ITEM_DOCUMENT`` is
        decoded as UTF-8, converted from HTML to text with ``html2text``, and
        the converted pieces are joined with newlines in the order the book
        yields them.

        Args:
            file: Path of the EPUB file to read.
            extra_info: Optional metadata attached to the returned document.
                A falsy value (``None`` or an empty dict) results in ``{}``.

        Returns:
            A one-element list holding a ``Document`` with the joined text.

        Raises:
            ImportError: If ``EbookLib`` or ``html2text`` is not installed.
                The original import failure is chained as ``__cause__``.
        """
        # The optional dependencies are imported lazily so that the module
        # itself can be imported without them installed.
        try:
            import ebooklib
            import html2text
            from ebooklib import epub
        except ImportError as err:
            raise ImportError(
                "Please install extra dependencies that are required for "
                "the EpubReader: "
                "`pip install EbookLib html2text`"
            ) from err

        # NOTE(review): "ignore_ncx" is passed straight through to ebooklib;
        # presumably it skips the NCX table of contents -- confirm against
        # the EbookLib documentation.
        book = epub.read_epub(file, options={"ignore_ncx": True})

        # Chapters are typically located in epub document items, so only
        # those items contribute text. Content is decoded strictly as UTF-8,
        # so an item that is not valid UTF-8 propagates the decode error.
        text_list = [
            html2text.html2text(item.get_content().decode("utf-8"))
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        ]

        text = "\n".join(text_list)
        return [Document(text=text, metadata=extra_info or {})]
44