llama-index
77 строк · 2.1 Кб
1from pathlib import Path2from typing import TYPE_CHECKING, Dict, List, Optional3
4from llama_index.legacy.readers.base import BaseReader5from llama_index.legacy.schema import Document6
7if TYPE_CHECKING:8from bs4 import Tag9
10
class HTMLTagReader(BaseReader):
    """
    Read HTML files and extract text from a specific tag with BeautifulSoup.

    By default, reads the text from the ``<section>`` tag.

    Args:
        tag: Name of the HTML tag to look for; one document is produced per
            occurrence of this tag.
        ignore_no_id: When true, occurrences of the tag that have no (or an
            empty) ``id`` attribute are skipped.
    """

    def __init__(
        self,
        tag: str = "section",
        ignore_no_id: bool = False,
    ) -> None:
        self._tag = tag
        self._ignore_no_id = ignore_no_id

        super().__init__()

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """
        Parse an HTML file and return one document per matching tag.

        Args:
            file: Path of the HTML file; it is read as UTF-8.
            extra_info: Optional metadata merged into every document. It is
                applied last, so its keys override the built-in ``tag``,
                ``tag_id`` and ``file_path`` entries on collision.

        Returns:
            The documents, in the order the tags are found in the file.

        Raises:
            ImportError: If ``bs4`` is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as err:
            raise ImportError("bs4 is required to read HTML files.") from err

        with open(file, encoding="utf-8") as html_file:
            soup = BeautifulSoup(html_file, "html.parser")

        docs = []
        for tag in soup.find_all(self._tag):
            tag_id = tag.get("id")

            # Decide whether to skip before doing any text extraction.
            if self._ignore_no_id and not tag_id:
                continue

            metadata = {
                "tag": self._tag,
                "tag_id": tag_id,
                "file_path": str(file),
            }
            metadata.update(extra_info or {})

            docs.append(
                Document(
                    text=self._extract_text_from_tag(tag),
                    metadata=metadata,
                )
            )
        return docs

    def _extract_text_from_tag(self, tag: "Tag") -> str:
        """
        Collect the text of ``tag``, one line per direct child.

        Direct children that are themselves ``self._tag`` elements are
        skipped, so their text is not repeated in the parent's text. HTML
        comments are skipped as well.

        Args:
            tag: The BeautifulSoup element to extract text from.

        Returns:
            The stripped text of each child, joined with newlines.

        Raises:
            ImportError: If ``bs4`` is not installed.
        """
        try:
            from bs4 import Comment, NavigableString
        except ImportError as err:
            raise ImportError("bs4 is required to read HTML files.") from err

        texts = []
        for elem in tag.children:
            # Comment subclasses NavigableString, so it has to be ruled out
            # first; otherwise ``<!-- ... -->`` content would be emitted as
            # text here while get_text() drops it for nested elements.
            if isinstance(elem, Comment):
                continue
            if isinstance(elem, NavigableString):
                stripped = elem.strip()
                if stripped:
                    texts.append(stripped)
            elif elem.name == self._tag:
                continue
            else:
                # NOTE: an element without any text still contributes an
                # empty entry, i.e. a blank line in the result.
                texts.append(elem.get_text().strip())
        return "\n".join(texts)