llama-index

html_reader.py
77 строк · 2.1 Кб
Перенос по словам
1
from pathlib import Path
2
from typing import TYPE_CHECKING, Dict, List, Optional
3

4
from llama_index.legacy.readers.base import BaseReader
5
from llama_index.legacy.schema import Document
6

7
if TYPE_CHECKING:
8
    from bs4 import Tag
9

10

11
class HTMLTagReader(BaseReader):
    """
    Read HTML files and extract text from a specific tag with BeautifulSoup.

    One ``Document`` is produced for every occurrence of the configured tag.
    By default, reads the text from the ``<section>`` tag.

    Args:
        tag: Name of the HTML tag whose text is extracted.
        ignore_no_id: When true, occurrences of the tag that have no
            (or an empty) ``id`` attribute are skipped.
    """

    def __init__(
        self,
        tag: str = "section",
        ignore_no_id: bool = False,
    ) -> None:
        self._tag = tag
        self._ignore_no_id = ignore_no_id

        super().__init__()

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """
        Parse an HTML file and return one document per matching tag.

        Args:
            file: Path of the HTML file; it is read as UTF-8.
            extra_info: Optional extra metadata merged into every document's
                metadata. Its keys override the built-in ``tag``, ``tag_id``
                and ``file_path`` entries on conflict.

        Returns:
            The documents in the order the tags are found in the file.

        Raises:
            ImportError: If ``bs4`` is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as err:
            # Keep the original failure attached as the cause.
            raise ImportError("bs4 is required to read HTML files.") from err

        with open(file, encoding="utf-8") as html_file:
            soup = BeautifulSoup(html_file, "html.parser")

        docs = []
        for tag in soup.find_all(self._tag):
            tag_id = tag.get("id")

            # Decide whether to skip before extracting text, so no work is
            # done for tags that are going to be discarded anyway.
            if self._ignore_no_id and not tag_id:
                continue

            metadata = {
                "tag": self._tag,
                "tag_id": tag_id,
                "file_path": str(file),
            }
            metadata.update(extra_info or {})

            docs.append(
                Document(
                    text=self._extract_text_from_tag(tag),
                    metadata=metadata,
                )
            )
        return docs

    def _extract_text_from_tag(self, tag: "Tag") -> str:
        """
        Collect the text of ``tag``'s direct children, one line per child.

        Direct children that are themselves the configured tag are skipped;
        ``load_data`` finds them separately and emits them as their own
        documents. HTML comments and children without any text are skipped
        as well, so they neither leak markup notes into the text nor produce
        blank lines.

        Args:
            tag: The BeautifulSoup tag to extract text from.

        Returns:
            The stripped, non-empty child texts joined with newlines.

        Raises:
            ImportError: If ``bs4`` is not installed.
        """
        try:
            from bs4 import Comment, NavigableString
        except ImportError as err:
            raise ImportError("bs4 is required to read HTML files.") from err

        texts = []
        for elem in tag.children:
            # Comment subclasses NavigableString, so it must be ruled out
            # first; get_text() already ignores comments nested deeper.
            if isinstance(elem, Comment):
                continue

            if isinstance(elem, NavigableString):
                text = elem.strip()
            elif elem.name == self._tag:
                continue
            else:
                text = elem.get_text().strip()

            # Empty elements (e.g. <br/>, <img>) would otherwise add blank
            # lines, unlike whitespace-only strings which were always dropped.
            if text:
                texts.append(text)
        return "\n".join(texts)
78
llama-index

Использование cookies