embedchain

Форк
0
/
test_docs_site_loader.py 
218 строк · 5.7 Кб
1
import pytest
2
import responses
3
from bs4 import BeautifulSoup
4

5

6
# Every ignored tag is combined with every selectable container below.
@pytest.mark.parametrize(
    "ignored_tag",
    [
        "<nav>This is a navigation bar.</nav>",
        "<aside>This is an aside.</aside>",
        "<form>This is a form.</form>",
        "<header>This is a header.</header>",
        "<noscript>This is a noscript.</noscript>",
        "<svg>This is an SVG.</svg>",
        "<canvas>This is a canvas.</canvas>",
        "<footer>This is a footer.</footer>",
        "<script>This is a script.</script>",
        "<style>This is a style.</style>",
    ],
    ids=["nav", "aside", "form", "header", "noscript", "svg", "canvas", "footer", "script", "style"],
)
@pytest.mark.parametrize(
    "selectee",
    [
        """
<article class="bd-article">
    <h2>Article Title</h2>
    <p>Article content goes here.</p>
    {ignored_tag}
</article>
""",
        """
<article role="main">
    <h2>Main Article Title</h2>
    <p>Main article content goes here.</p>
    {ignored_tag}
</article>
""",
        """
<div class="md-content">
    <h2>Markdown Content</h2>
    <p>Markdown content goes here.</p>
    {ignored_tag}
</div>
""",
        """
<div role="main">
    <h2>Main Content</h2>
    <p>Main content goes here.</p>
    {ignored_tag}
</div>
""",
        """
<div class="container">
    <h2>Container</h2>
    <p>Container content goes here.</p>
    {ignored_tag}
</div>
        """,
        """
<div class="section">
    <h2>Section</h2>
    <p>Section content goes here.</p>
    {ignored_tag}
</div>
        """,
        """
<article>
    <h2>Generic Article</h2>
    <p>Generic article content goes here.</p>
    {ignored_tag}
</article>
        """,
        """
<main>
    <h2>Main Content</h2>
    <p>Main content goes here.</p>
    {ignored_tag}
</main>
""",
    ],
    ids=[
        "article.bd-article",
        'article[role="main"]',
        "div.md-content",
        'div[role="main"]',
        "div.container",
        "div.section",
        "article",
        "main",
    ],
)
def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker):
    """Check that only the heading and paragraph text of a selected container is loaded.

    The root page links to ``/quickstart``; that child page wraps ``selectee``
    (with ``ignored_tag`` substituted into it) in a minimal HTML document.
    The loaded content must equal the ``<h2>`` and ``<p>`` texts joined by a
    single space, so any text from the ignored tag would fail the comparison.
    ``hashlib.sha256`` is patched in the loader module to make ``doc_id``
    predictable.
    """
    root_url = "https://docs.embedchain.ai/"
    page_url = "https://docs.embedchain.ai/quickstart"
    page_template = """
<!DOCTYPE html>
<html lang="en">
<body>
    {selectee}
</body>
</html>
"""
    root_page = """
<!DOCTYPE html>
<html lang="en">
<body>
    <li><a href="/quickstart">Quickstart</a></li>
</body>
</html>
"""

    # Fill the ignored tag into the container, then the container into the page.
    selectee = selectee.format(ignored_tag=ignored_tag)
    page_body = page_template.format(selectee=selectee)
    mocked_responses.get(page_url, body=page_body, status=200, content_type="text/html")
    mocked_responses.get(root_url, body=root_page, status=200, content_type="text/html")

    fake_digest = "mocked_hash"
    sha256_patch = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    sha256_patch.return_value.hexdigest.return_value = fake_digest

    loaded = loader.load_data(root_url)

    # Derive the expected text from the same fragment that was served.
    fragment = BeautifulSoup(selectee, "html.parser")
    heading_text = fragment.select_one("h2").get_text()
    paragraph_text = fragment.select_one("p").get_text()
    expected_entry = {
        "content": f"{heading_text} {paragraph_text}",
        "meta_data": {"url": "https://docs.embedchain.ai/quickstart"},
    }

    assert loaded["doc_id"] == fake_digest
    assert loaded["data"] == [expected_entry]
132

133

134
def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker):
    """Verify that every page linked from the root URL is loaded as a document.

    Three pages are mocked: the root, which links to ``/quickstart`` and
    ``/introduction``, and those two child pages, each of which links back to
    ``/`` and to itself. ``hashlib.sha256`` is patched in the loader module so
    that ``doc_id`` is predictable.
    """
    child_url = "https://docs.embedchain.ai/quickstart"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
    <li><a href="/">..</a></li>
    <li><a href="/quickstart">.</a></li>
</body>
</html>
"""
    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")

    child_url = "https://docs.embedchain.ai/introduction"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
    <li><a href="/">..</a></li>
    <li><a href="/introduction">.</a></li>
</body>
</html>
"""
    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
    <li><a href="/quickstart">Quickstart</a></li>
    <li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)
    assert result["doc_id"] == doc_id
    expected_data = [
        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}},
        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}},
    ]
    # A lone ``all(item in expected_data ...)`` is vacuously true for an empty
    # result, so also pin the item count and check membership in the other
    # direction. Item order is still not compared, matching the original check.
    assert len(result["data"]) == len(expected_data)
    assert all(item in expected_data for item in result["data"])
    assert all(item in result["data"] for item in expected_data)
182

183

184
def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker):
    """Verify that a child page answering 404 contributes no data.

    The root page links to ``/introduction``, which is mocked to return
    status 404. The load is expected to still return the (patched) ``doc_id``
    together with an empty ``data`` list.
    """
    child_url = "https://docs.embedchain.ai/introduction"
    mocked_responses.get(child_url, status=404)

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
    <li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)
    # Compare by value: ``is`` tests object identity, which is not what a
    # string comparison should rely on (and the sibling tests use ``==``).
    assert result["doc_id"] == doc_id
    assert result["data"] == []
206

207

208
@pytest.fixture
def loader():
    """Build a ``DocsSiteLoader`` instance for the requesting test.

    The loader module is imported inside the fixture body rather than at
    module import time.
    """
    from embedchain.loaders import docs_site_loader

    return docs_site_loader.DocsSiteLoader()
213

214

215
@pytest.fixture
def mocked_responses():
    """Yield a ``responses.RequestsMock`` that stays entered while the test runs.

    The mock is used as a context manager, so it is exited once the test that
    requested this fixture has finished.
    """
    requests_mock = responses.RequestsMock()
    with requests_mock as active_mock:
        yield active_mock
219

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.