test_link_content_fetcher.py
from unittest.mock import patch, Mock

import pytest
import requests

from haystack.components.fetchers.link_content import (
    LinkContentFetcher,
    _text_content_handler,
    _binary_content_handler,
    DEFAULT_USER_AGENT,
)

HTML_URL = "https://docs.haystack.deepset.ai/docs"
TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"
PDF_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf"


@pytest.fixture
def mock_get_link_text_content():
    with patch("haystack.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
        )
        yield mock_run


@pytest.fixture
def mock_get_link_content(test_files_path):
    with patch("haystack.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200,
            content=open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb").read(),
            headers={"Content-Type": "application/pdf"},
        )
        yield mock_run


class TestLinkContentFetcher:
    def test_init(self):
        fetcher = LinkContentFetcher()
        assert fetcher.raise_on_failure is True
        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
        assert fetcher.retry_attempts == 2
        assert fetcher.timeout == 3
        assert fetcher.handlers == {
            "text/html": _text_content_handler,
            "text/plain": _text_content_handler,
            "application/pdf": _binary_content_handler,
            "application/octet-stream": _binary_content_handler,
        }
        assert hasattr(fetcher, "_get_response")

    def test_init_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        assert fetcher.timeout == 2

    def test_run_text(self):
        correct_response = b"Example test response"
        with patch("haystack.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
            )
            fetcher = LinkContentFetcher()
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]
            first_stream = streams[0]
            assert first_stream.data == correct_response
            assert first_stream.meta["content_type"] == "text/plain"

    def test_run_html(self):
        correct_response = b"<h1>Example test response</h1>"
        with patch("haystack.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
            )
            fetcher = LinkContentFetcher()
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]
            first_stream = streams[0]
            assert first_stream.data == correct_response
            assert first_stream.meta["content_type"] == "text/html"

    def test_run_binary(self, test_files_path):
        file_bytes = open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb").read()
        with patch("haystack.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, content=file_bytes, headers={"Content-Type": "application/pdf"}
            )
            fetcher = LinkContentFetcher()
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]
            first_stream = streams[0]
            assert first_stream.data == file_bytes
            assert first_stream.meta["content_type"] == "application/pdf"

    def test_run_bad_status_code(self):
        empty_byte_stream = b""
        fetcher = LinkContentFetcher(raise_on_failure=False)
        mock_response = Mock(status_code=403)
        with patch("haystack.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = mock_response
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]

        # empty byte stream is returned because raise_on_failure is False
        assert len(streams) == 1
        first_stream = streams[0]
        assert first_stream.data == empty_byte_stream
        assert first_stream.meta["content_type"] == "text/html"

    @pytest.mark.integration
    def test_link_content_fetcher_html(self):
        fetcher = LinkContentFetcher()
        streams = fetcher.run([HTML_URL])["streams"]
        first_stream = streams[0]
        assert "Haystack" in first_stream.data.decode("utf-8")
        assert first_stream.meta["content_type"] == "text/html"
        assert "url" in first_stream.meta and first_stream.meta["url"] == HTML_URL

    @pytest.mark.integration
    def test_link_content_fetcher_text(self):
        fetcher = LinkContentFetcher()
        streams = fetcher.run([TEXT_URL])["streams"]
        first_stream = streams[0]
        assert "Haystack" in first_stream.data.decode("utf-8")
        assert first_stream.meta["content_type"] == "text/plain"
        assert "url" in first_stream.meta and first_stream.meta["url"] == TEXT_URL

    @pytest.mark.integration
    def test_link_content_fetcher_pdf(self):
        fetcher = LinkContentFetcher()
        streams = fetcher.run([PDF_URL])["streams"]
        assert len(streams) == 1
        first_stream = streams[0]
        assert first_stream.meta["content_type"] in ("application/octet-stream", "application/pdf")
        assert "url" in first_stream.meta and first_stream.meta["url"] == PDF_URL

    @pytest.mark.integration
    def test_link_content_fetcher_multiple_different_content_types(self):
        """
        Ensure that the fetcher can handle a list of URLs pointing to different content types.
        """
        fetcher = LinkContentFetcher()
        streams = fetcher.run([PDF_URL, HTML_URL])["streams"]
        assert len(streams) == 2
        for stream in streams:
            assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
            if stream.meta["content_type"] == "text/html":
                assert "Haystack" in stream.data.decode("utf-8")
            elif stream.meta["content_type"] == "application/pdf":
                assert len(stream.data) > 0

    @pytest.mark.integration
    def test_link_content_fetcher_multiple_html_streams(self):
        """
        Ensure that the fetcher can handle a list of URLs pointing to different content types,
        and that two of the resulting streams are HTML.
        """
        fetcher = LinkContentFetcher()
        streams = fetcher.run([PDF_URL, HTML_URL, "https://google.com"])["streams"]
        assert len(streams) == 3
        for stream in streams:
            assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
            if stream.meta["content_type"] == "text/html":
                assert "Haystack" in stream.data.decode("utf-8") or "Google" in stream.data.decode("utf-8")
            elif stream.meta["content_type"] == "application/pdf":
                assert len(stream.data) > 0

    @pytest.mark.integration
    def test_mix_of_good_and_failed_requests(self):
        """
        Ensure that the fetcher can handle a list of URLs in which some URLs fail to be fetched.
        In such a case, the fetcher should return the content of the URLs that were fetched successfully
        and not raise an exception.
        """
        fetcher = LinkContentFetcher()
        result = fetcher.run(["https://non_existent_website_dot.com/", "https://www.google.com/"])
        assert len(result["streams"]) == 1
        first_stream = result["streams"][0]
        assert first_stream.meta["content_type"] == "text/html"

    @pytest.mark.integration
    def test_bad_request_exception_raised(self):
        """
        Ensure that the fetcher raises an exception when a single bad request is made and the fetcher
        is configured to do so.
        """
        fetcher = LinkContentFetcher()
        with pytest.raises(requests.exceptions.ConnectionError):
            fetcher.run(["https://non_existent_website_dot.com/"])
