embedchain
218 строк · 5.7 Кб
1import pytest
2import responses
3from bs4 import BeautifulSoup
4
5
@pytest.mark.parametrize(
    "ignored_tag",
    [
        pytest.param("<nav>This is a navigation bar.</nav>", id="nav"),
        pytest.param("<aside>This is an aside.</aside>", id="aside"),
        pytest.param("<form>This is a form.</form>", id="form"),
        pytest.param("<header>This is a header.</header>", id="header"),
        pytest.param("<noscript>This is a noscript.</noscript>", id="noscript"),
        pytest.param("<svg>This is an SVG.</svg>", id="svg"),
        pytest.param("<canvas>This is a canvas.</canvas>", id="canvas"),
        pytest.param("<footer>This is a footer.</footer>", id="footer"),
        pytest.param("<script>This is a script.</script>", id="script"),
        pytest.param("<style>This is a style.</style>", id="style"),
    ],
)
@pytest.mark.parametrize(
    "selectee",
    [
        pytest.param(
            """
<article class="bd-article">
<h2>Article Title</h2>
<p>Article content goes here.</p>
{ignored_tag}
</article>
""",
            id="article.bd-article",
        ),
        pytest.param(
            """
<article role="main">
<h2>Main Article Title</h2>
<p>Main article content goes here.</p>
{ignored_tag}
</article>
""",
            id='article[role="main"]',
        ),
        pytest.param(
            """
<div class="md-content">
<h2>Markdown Content</h2>
<p>Markdown content goes here.</p>
{ignored_tag}
</div>
""",
            id="div.md-content",
        ),
        pytest.param(
            """
<div role="main">
<h2>Main Content</h2>
<p>Main content goes here.</p>
{ignored_tag}
</div>
""",
            id='div[role="main"]',
        ),
        pytest.param(
            """
<div class="container">
<h2>Container</h2>
<p>Container content goes here.</p>
{ignored_tag}
</div>
""",
            id="div.container",
        ),
        pytest.param(
            """
<div class="section">
<h2>Section</h2>
<p>Section content goes here.</p>
{ignored_tag}
</div>
""",
            id="div.section",
        ),
        pytest.param(
            """
<article>
<h2>Generic Article</h2>
<p>Generic article content goes here.</p>
{ignored_tag}
</article>
""",
            id="article",
        ),
        pytest.param(
            """
<main>
<h2>Main Content</h2>
<p>Main content goes here.</p>
{ignored_tag}
</main>
""",
            id="main",
        ),
    ],
)
def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker):
    """Check the content loaded for every container/ignored-tag combination.

    Each ``selectee`` template is a container element holding an ``<h2>``, a
    ``<p>`` and a ``{ignored_tag}`` placeholder. The rendered container is
    served as the only child page of the docs root, and the test asserts that
    ``load_data`` reports exactly one entry for that page whose content is the
    heading text and the paragraph text joined by a single space (i.e. the
    text of the ignored tag is absent).
    """
    root_url = "https://docs.embedchain.ai/"
    page_url = "https://docs.embedchain.ai/quickstart"

    # Fill the placeholder, then wrap the container in a minimal HTML document.
    rendered = selectee.format(ignored_tag=ignored_tag)
    page_html = f"""
<!DOCTYPE html>
<html lang="en">
<body>
{rendered}
</body>
</html>
"""
    mocked_responses.get(page_url, body=page_html, status=200, content_type="text/html")

    # The root page only links to the child page under test.
    index_html = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/quickstart">Quickstart</a></li>
</body>
</html>
"""
    mocked_responses.get(root_url, body=index_html, status=200, content_type="text/html")

    # Pin the digest so the returned doc_id is predictable.
    fake_digest = "mocked_hash"
    sha256_patch = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    sha256_patch.return_value.hexdigest.return_value = fake_digest

    result = loader.load_data(root_url)

    # Derive the expectation from the same markup that was served.
    parsed = BeautifulSoup(rendered, "html.parser")
    heading = parsed.select_one("h2").get_text()
    paragraph = parsed.select_one("p").get_text()
    expected_entry = {
        "content": " ".join((heading, paragraph)),
        "meta_data": {"url": "https://docs.embedchain.ai/quickstart"},
    }

    assert result["doc_id"] == fake_digest
    assert result["data"] == [expected_entry]
132
133
def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker):
    """Check that pages linked from the root are loaded, whatever their order.

    The root page links to ``/quickstart`` and ``/introduction``; each of
    those pages links back to ``/`` and to itself. Every entry returned by
    ``load_data`` must be one of the two expected child-page entries.
    """
    root_url = "https://docs.embedchain.ai/"

    # URL -> HTML served for it; registered in this (insertion) order.
    pages = {
        "https://docs.embedchain.ai/quickstart": """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/">..</a></li>
<li><a href="/quickstart">.</a></li>
</body>
</html>
""",
        "https://docs.embedchain.ai/introduction": """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/">..</a></li>
<li><a href="/introduction">.</a></li>
</body>
</html>
""",
        root_url: """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/quickstart">Quickstart</a></li>
<li><a href="/introduction">Introduction</a></li>
</body>
</html>
""",
    }
    for page_url, page_html in pages.items():
        mocked_responses.get(page_url, body=page_html, status=200, content_type="text/html")

    # Pin the digest so the returned doc_id is predictable.
    fake_digest = "mocked_hash"
    sha256_patch = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    sha256_patch.return_value.hexdigest.return_value = fake_digest

    result = loader.load_data(root_url)

    assert result["doc_id"] == fake_digest

    allowed_entries = [
        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}},
        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}},
    ]
    # Order-insensitive membership check.
    # NOTE(review): this also passes when result["data"] is empty or holds
    # only one of the two pages — consider asserting the length as well once
    # the loader's exact output is confirmed.
    for entry in result["data"]:
        assert entry in allowed_entries
182
183
def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker):
    """Check that a child page answering 404 yields no loaded data.

    The root page links only to ``/introduction``, which is mocked to respond
    with HTTP 404. ``load_data`` is expected to still return the (mocked)
    document id, with an empty ``data`` list.
    """
    child_url = "https://docs.embedchain.ai/introduction"
    mocked_responses.get(child_url, status=404)

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    # Pin the digest so the returned doc_id is predictable.
    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)
    # Compare by value, not identity: `is` on strings only works by accident
    # of object reuse, and the sibling tests already use `==`.
    assert result["doc_id"] == doc_id
    assert result["data"] == []
206
207
@pytest.fixture
def loader():
    """Provide a fresh ``DocsSiteLoader`` instance.

    The loader module is imported inside the fixture body rather than at
    module level, so the import happens when the fixture is first requested.
    """
    from embedchain.loaders import docs_site_loader

    return docs_site_loader.DocsSiteLoader()
213
214
@pytest.fixture
def mocked_responses():
    """Yield an active ``responses.RequestsMock`` for registering fake HTTP replies.

    The mock is entered as a context manager before the test body runs and
    exited after the test finishes.
    """
    with responses.RequestsMock() as requests_mock:
        yield requests_mock
219