embedchain

Форк
0
/
test_docs_site.py 
130 строк · 3.9 Кб
1
import hashlib
2
from unittest.mock import Mock, patch
3

4
import pytest
5
from requests import Response
6

7
from embedchain.loaders.docs_site_loader import DocsSiteLoader
8

9

10
@pytest.fixture
def mock_requests_get():
    """Replace ``requests.get`` with a mock and hand that mock to the test.

    The patch is started before the test body runs and is always undone
    afterwards, even when the test raises.
    """
    patcher = patch("requests.get")
    mocked_get = patcher.start()
    try:
        yield mocked_get
    finally:
        # Restore the real ``requests.get`` once the test is finished.
        patcher.stop()
14

15

16
@pytest.fixture
def docs_site_loader():
    """Provide a newly constructed ``DocsSiteLoader`` to the requesting test."""
    loader = DocsSiteLoader()
    return loader
19

20

21
def test_get_child_links_recursive(mock_requests_get, docs_site_loader):
    """Crawling a page with two relative links records both as visited.

    Every mocked request returns the same 200 response whose HTML links to
    ``/page1`` and ``/page2``.
    """
    page_html = """
        <html>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
        </html>
    """
    mock_requests_get.return_value = Mock(status_code=200, text=page_html)

    docs_site_loader._get_child_links_recursive("https://example.com")

    visited = docs_site_loader.visited_links
    assert len(visited) == 2
    for expected_link in ("https://example.com/page1", "https://example.com/page2"):
        assert expected_link in visited
37

38

39
def test_get_child_links_recursive_status_not_200(mock_requests_get, docs_site_loader):
    """A 404 response must leave ``visited_links`` empty."""
    mock_requests_get.return_value = Mock(status_code=404)

    docs_site_loader._get_child_links_recursive("https://example.com")

    visited = docs_site_loader.visited_links
    assert len(visited) == 0
47

48

49
def test_get_all_urls(mock_requests_get, docs_site_loader):
    """``_get_all_urls`` returns every link found on the mocked page.

    The page holds two relative links and one absolute link on the same
    host; all three are expected in the result as absolute URLs.
    """
    page_html = """
        <html>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
            <a href="https://example.com/external">External</a>
        </html>
    """
    mock_requests_get.return_value = Mock(status_code=200, text=page_html)

    all_urls = docs_site_loader._get_all_urls("https://example.com")

    assert len(all_urls) == 3
    expected_urls = (
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/external",
    )
    for expected_url in expected_urls:
        assert expected_url in all_urls
67

68

69
def test_load_data_from_url(mock_requests_get, docs_site_loader):
    """Loading a page yields one document holding only the article text.

    The mocked page has a ``<nav>`` section and a ``bd-article`` article;
    the expected content is the article text alone, tagged with the
    requested URL.
    """
    page_url = "https://example.com/page1"
    page_bytes = """
        <html>
            <nav>
                <h1>Navigation</h1>
            </nav>
            <article class="bd-article">
                <p>Article Content</p>
            </article>
        </html>
    """.encode()
    mock_requests_get.return_value = Mock(status_code=200, content=page_bytes)

    data = docs_site_loader._load_data_from_url(page_url)

    assert len(data) == 1
    document = data[0]
    assert document["content"] == "Article Content"
    assert document["meta_data"]["url"] == "https://example.com/page1"
89

90

91
def test_load_data_from_url_status_not_200(mock_requests_get, docs_site_loader):
    """A 404 response makes ``_load_data_from_url`` return an empty list."""
    mock_requests_get.return_value = Mock(status_code=404)

    page_url = "https://example.com/page1"
    data = docs_site_loader._load_data_from_url(page_url)

    assert data == []
    assert len(data) == 0
100

101

102
def test_load_data(mock_requests_get, docs_site_loader):
    """``load_data`` returns two documents and the expected ``doc_id``.

    The expected id is the SHA-256 hex digest of the space-joined visited
    links followed by the start URL.
    """
    url = "https://example.com"

    # A real ``requests.Response`` is used; its body is set via ``_content``.
    response = Response()
    response.status_code = 200
    response._content = """
        <html>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
        """.encode()
    mock_requests_get.return_value = response

    data = docs_site_loader.load_data(url)

    joined_links = " ".join(docs_site_loader.visited_links)
    expected_doc_id = hashlib.sha256((joined_links + url).encode()).hexdigest()

    assert len(data["data"]) == 2
    assert data["doc_id"] == expected_doc_id
118

119

120
def test_if_response_status_not_200(mock_requests_get, docs_site_loader):
    """With a 404 response ``load_data`` returns no documents.

    The ``doc_id`` must still equal the SHA-256 hex digest of the
    space-joined visited links followed by the start URL.
    """
    url = "https://example.com"

    response = Response()
    response.status_code = 404
    mock_requests_get.return_value = response

    data = docs_site_loader.load_data(url)

    joined_links = " ".join(docs_site_loader.visited_links)
    expected_doc_id = hashlib.sha256((joined_links + url).encode()).hexdigest()

    assert len(data["data"]) == 0
    assert data["doc_id"] == expected_doc_id
131

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.