llama-index

"""Web scraper."""

import logging
from typing import Any, Callable, Dict, List, Optional, Tuple

import requests

from llama_index.legacy.bridge.pydantic import PrivateAttr
from llama_index.legacy.readers.base import BasePydanticReader
from llama_index.legacy.schema import Document

logger = logging.getLogger(__name__)


class SimpleWebPageReader(BasePydanticReader):
    """Simple web page reader.

    Reads pages from the web.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires `html2text` package.
        metadata_fn (Optional[Callable[[str], Dict]]): A function that takes in
            a URL and returns a dictionary of metadata.
            Default is None.
    """

    is_remote: bool = True
    html_to_text: bool

    _metadata_fn: Optional[Callable[[str], Dict]] = PrivateAttr()

    def __init__(
        self,
        html_to_text: bool = False,
        metadata_fn: Optional[Callable[[str], Dict]] = None,
    ) -> None:
        """Initialize with parameters."""
        try:
            import html2text  # noqa
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        self._metadata_fn = metadata_fn
        super().__init__(html_to_text=html_to_text)

    @classmethod
    def class_name(cls) -> str:
        return "SimpleWebPageReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the input URLs.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: List of documents.

        """
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            response = requests.get(url, headers=None).text
            if self.html_to_text:
                import html2text

                response = html2text.html2text(response)

            metadata: Optional[Dict] = None
            if self._metadata_fn is not None:
                metadata = self._metadata_fn(url)

            documents.append(Document(text=response, id_=url, metadata=metadata or {}))

        return documents
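

# Illustrative usage sketch (hypothetical helper, not called anywhere): fetch a
# page, convert the HTML to text via html2text, and attach per-URL metadata
# through metadata_fn. The URL is a placeholder.
def _example_simple_web_page_reader() -> None:
    reader = SimpleWebPageReader(
        html_to_text=True,
        metadata_fn=lambda url: {"source": url},
    )
    docs = reader.load_data(["https://example.com"])
    logger.info("Loaded %d document(s)", len(docs))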


class TrafilaturaWebReader(BasePydanticReader):
    """Trafilatura web page reader.

    Reads pages from the web.
    Requires the `trafilatura` package.

    """

    is_remote: bool = True
    error_on_missing: bool

    def __init__(self, error_on_missing: bool = False) -> None:
        """Initialize with parameters.

        Args:
            error_on_missing (bool): Throw an error when data cannot be parsed
        """
        try:
            import trafilatura  # noqa
        except ImportError:
            raise ImportError(
                "`trafilatura` package not found, please run `pip install trafilatura`"
            )
        super().__init__(error_on_missing=error_on_missing)

    @classmethod
    def class_name(cls) -> str:
        return "TrafilaturaWebReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the urls.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: List of documents.

        """
        import trafilatura

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            downloaded = trafilatura.fetch_url(url)
            if not downloaded:
                if self.error_on_missing:
                    raise ValueError(f"Trafilatura fails to get string from url: {url}")
                continue
            response = trafilatura.extract(downloaded)
            if not response:
                if self.error_on_missing:
                    raise ValueError(f"Trafilatura fails to parse page: {url}")
                continue
            documents.append(Document(id_=url, text=response))

        return documents
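

# Illustrative usage sketch (hypothetical helper): extract the main article text
# with trafilatura and raise if a page cannot be fetched or parsed. The URL is a
# placeholder.
def _example_trafilatura_web_reader() -> None:
    reader = TrafilaturaWebReader(error_on_missing=True)
    docs = reader.load_data(["https://example.com/article"])
    logger.info("Loaded %d document(s)", len(docs))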


def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
    """Extract text from Substack blog post."""
    metadata = {
        "Title of this Substack post": soup.select_one("h1.post-title").getText(),
        "Subtitle": soup.select_one("h3.subtitle").getText(),
        "Author": soup.select_one("span.byline-names").getText(),
    }
    text = soup.select_one("div.available-content").getText()
    return text, metadata


DEFAULT_WEBSITE_EXTRACTOR: Dict[str, Callable[[Any], Tuple[str, Dict[str, Any]]]] = {
    "substack.com": _substack_reader,
}
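

# Illustrative sketch of a custom extractor (hypothetical, like _substack_reader
# above): entries in the extractor mapping take a BeautifulSoup object and return
# a (text, metadata) tuple. The CSS selectors here are placeholders and would
# need to match the target site's markup.
def _example_article_extractor(soup: Any) -> Tuple[str, Dict[str, Any]]:
    title = soup.select_one("h1")
    body = soup.select_one("article")
    metadata = {"Title": title.getText() if title else ""}
    text = body.getText() if body else soup.getText()
    return text, metadata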


class BeautifulSoupWebReader(BasePydanticReader):
    """BeautifulSoup web page reader.

    Reads pages from the web.
    Requires the `bs4` package.

    Args:
        website_extractor (Optional[Dict[str, Callable]]): A mapping of website
            hostname (e.g. google.com) to a function that specifies how to
            extract text from the BeautifulSoup obj. See DEFAULT_WEBSITE_EXTRACTOR.
    """

    is_remote: bool = True
    _website_extractor: Dict[str, Callable] = PrivateAttr()

    def __init__(
        self,
        website_extractor: Optional[Dict[str, Callable]] = None,
    ) -> None:
        """Initialize with parameters."""
        try:
            from urllib.parse import urlparse  # noqa

            import requests  # noqa
            from bs4 import BeautifulSoup  # noqa
        except ImportError:
            raise ImportError(
                "`bs4` and `requests` must be installed to scrape websites."
                " Please run `pip install beautifulsoup4 requests`."
            )

        self._website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR
        super().__init__()

    @classmethod
    def class_name(cls) -> str:
        return "BeautifulSoupWebReader"

    def load_data(
        self, urls: List[str], custom_hostname: Optional[str] = None
    ) -> List[Document]:
        """Load data from the urls.

        Args:
            urls (List[str]): List of URLs to scrape.
            custom_hostname (Optional[str]): Force a certain hostname in the case
                a website is displayed under custom URLs (e.g. Substack blogs)

        Returns:
            List[Document]: List of documents.

        """
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        documents = []
        for url in urls:
            try:
                page = requests.get(url)
            except Exception:
                raise ValueError(f"One of the inputs is not a valid url: {url}")

            hostname = custom_hostname or urlparse(url).hostname or ""

            soup = BeautifulSoup(page.content, "html.parser")

            data = ""
            metadata = {"URL": url}
            if hostname in self._website_extractor:
                data, metadata = self._website_extractor[hostname](soup)
                # Keep the source URL alongside extractor-provided metadata.
                metadata.update({"URL": url})
            else:
                data = soup.getText()

            documents.append(Document(id_=url, text=data, metadata=metadata))

        return documents
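

# Illustrative usage sketch (hypothetical helper): register a custom extractor
# for one hostname; pages from other hostnames fall back to soup.getText(). The
# hostname and URL are placeholders.
def _example_beautiful_soup_web_reader() -> None:
    def extract(soup: Any) -> Tuple[str, Dict[str, Any]]:
        return soup.getText(), {"Source": "example.com"}

    reader = BeautifulSoupWebReader(website_extractor={"example.com": extract})
    docs = reader.load_data(["https://example.com/post"])
    logger.info("Loaded %d document(s)", len(docs))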


class RssReader(BasePydanticReader):
    """RSS reader.

    Reads content from an RSS feed.

    """

    is_remote: bool = True
    html_to_text: bool

    def __init__(self, html_to_text: bool = False) -> None:
        """Initialize with parameters.

        Args:
            html_to_text (bool): Whether to convert HTML to text.
                Requires `html2text` package.

        """
        try:
            import feedparser  # noqa
        except ImportError:
            raise ImportError(
                "`feedparser` package not found, please run `pip install feedparser`"
            )

        if html_to_text:
            try:
                import html2text  # noqa
            except ImportError:
                raise ImportError(
                    "`html2text` package not found, please run `pip install html2text`"
                )
        super().__init__(html_to_text=html_to_text)

    @classmethod
    def class_name(cls) -> str:
        return "RssReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from RSS feeds.

        Args:
            urls (List[str]): List of RSS URLs to load.

        Returns:
            List[Document]: List of documents.

        """
        import feedparser

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        for url in urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                doc_id = entry.id or entry.link
                if "content" in entry:
                    data = entry.content[0].value
                else:
                    data = entry.description or entry.summary

                if self.html_to_text:
                    import html2text

                    data = html2text.html2text(data)

                metadata = {"title": entry.title, "link": entry.link}
                documents.append(Document(id_=doc_id, text=data, metadata=metadata))

        return documents
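

# Illustrative usage sketch (hypothetical helper): read entries from a feed and
# convert HTML bodies to plain text. The feed URL is a placeholder.
def _example_rss_reader() -> None:
    reader = RssReader(html_to_text=True)
    docs = reader.load_data(["https://example.com/feed.xml"])
    logger.info("Loaded %d document(s)", len(docs))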


if __name__ == "__main__":
    reader = SimpleWebPageReader()
    logger.info(reader.load_data(["http://www.google.com"]))