# haystack
# 190 lines · 8.4 KB
from unittest.mock import Mock, patch

import pytest
import requests

from haystack.components.fetchers.link_content import (
    LinkContentFetcher,
    _text_content_handler,
    _binary_content_handler,
    DEFAULT_USER_AGENT,
)
12
# Remote resources requested by the integration tests in this module.
HTML_URL = "https://docs.haystack.deepset.ai/docs"
TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"
# NOTE(review): the path embeds what looks like a fixed commit hash, presumably to keep the sample PDF stable.
PDF_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf"
17
@pytest.fixture
def mock_get_link_text_content():
    """
    Replace `requests` in the link_content module with a mock whose `get` returns a canned plain-text response.

    The response mock carries status code 200, the text "Example test response" and a "text/plain"
    Content-Type header. The patched `requests` mock is yielded, and the patch is reverted when the
    fixture is torn down.
    """
    with patch("haystack.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
        )
        yield mock_run
26
@pytest.fixture
def mock_get_link_content(test_files_path):
    """
    Replace `requests` in the link_content module with a mock whose `get` returns a canned PDF response.

    The response mock carries status code 200, the bytes of `pdf/sample_pdf_1.pdf` (resolved relative to
    `test_files_path`) as `content`, and an "application/pdf" Content-Type header. The patched `requests`
    mock is yielded, and the patch is reverted when the fixture is torn down.
    """
    # Read the sample inside a context manager so the file handle is closed deterministically.
    with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    with patch("haystack.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200,
            content=pdf_bytes,
            headers={"Content-Type": "application/pdf"},
        )
        yield mock_run
37
class TestLinkContentFetcher:
    """
    Tests for `LinkContentFetcher`.

    The unit tests patch `requests` inside the link_content module; tests marked `integration` call the
    fetcher against real URLs.
    """

    def test_init(self):
        """
        A fetcher built without arguments exposes the expected default attributes and handler mapping.
        """
        fetcher = LinkContentFetcher()
        assert fetcher.raise_on_failure is True
        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
        assert fetcher.retry_attempts == 2
        assert fetcher.timeout == 3
        assert fetcher.handlers == {
            "text/html": _text_content_handler,
            "text/plain": _text_content_handler,
            "application/pdf": _binary_content_handler,
            "application/octet-stream": _binary_content_handler,
        }
        assert hasattr(fetcher, "_get_response")

    def test_init_with_params(self):
        """
        Constructor arguments are stored on the corresponding attributes.
        """
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        assert fetcher.timeout == 2

    def test_run_text(self):
        """
        A mocked "text/plain" response is returned as a stream holding the response text as bytes.
        """
        correct_response = b"Example test response"
        with patch("haystack.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
            )
            fetcher = LinkContentFetcher()
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]
            first_stream = streams[0]
            assert first_stream.data == correct_response
            assert first_stream.meta["content_type"] == "text/plain"

    def test_run_html(self):
        """
        A mocked "text/html" response is returned as a stream holding the response text as bytes.
        """
        correct_response = b"<h1>Example test response</h1>"
        with patch("haystack.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
            )
            fetcher = LinkContentFetcher()
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]
            first_stream = streams[0]
            assert first_stream.data == correct_response
            assert first_stream.meta["content_type"] == "text/html"

    def test_run_binary(self, test_files_path):
        """
        A mocked "application/pdf" response is returned as a stream holding the response content unchanged.
        """
        # Read the sample inside a context manager so the file handle is closed deterministically.
        with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as pdf_file:
            file_bytes = pdf_file.read()

        with patch("haystack.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, content=file_bytes, headers={"Content-Type": "application/pdf"}
            )
            fetcher = LinkContentFetcher()
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]
            first_stream = streams[0]
            assert first_stream.data == file_bytes
            assert first_stream.meta["content_type"] == "application/pdf"

    def test_run_bad_status_code(self):
        """
        With `raise_on_failure=False`, a mocked 403 response yields a single empty "text/html" stream.
        """
        empty_byte_stream = b""
        fetcher = LinkContentFetcher(raise_on_failure=False)
        mock_response = Mock(status_code=403)
        with patch("haystack.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = mock_response
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]

        # empty byte stream is returned because raise_on_failure is False
        assert len(streams) == 1
        first_stream = streams[0]
        assert first_stream.data == empty_byte_stream
        assert first_stream.meta["content_type"] == "text/html"

    @pytest.mark.integration
    def test_link_content_fetcher_html(self):
        """
        Fetching HTML_URL yields a "text/html" stream that mentions "Haystack" and records the URL in its meta.
        """
        fetcher = LinkContentFetcher()
        streams = fetcher.run([HTML_URL])["streams"]
        first_stream = streams[0]
        assert "Haystack" in first_stream.data.decode("utf-8")
        assert first_stream.meta["content_type"] == "text/html"
        assert "url" in first_stream.meta and first_stream.meta["url"] == HTML_URL

    @pytest.mark.integration
    def test_link_content_fetcher_text(self):
        """
        Fetching TEXT_URL yields a "text/plain" stream that mentions "Haystack" and records the URL in its meta.
        """
        fetcher = LinkContentFetcher()
        streams = fetcher.run([TEXT_URL])["streams"]
        first_stream = streams[0]
        assert "Haystack" in first_stream.data.decode("utf-8")
        assert first_stream.meta["content_type"] == "text/plain"
        assert "url" in first_stream.meta and first_stream.meta["url"] == TEXT_URL

    @pytest.mark.integration
    def test_link_content_fetcher_pdf(self):
        """
        Fetching PDF_URL yields exactly one stream with a PDF or octet-stream content type and the URL in its meta.
        """
        fetcher = LinkContentFetcher()
        streams = fetcher.run([PDF_URL])["streams"]
        assert len(streams) == 1
        first_stream = streams[0]
        assert first_stream.meta["content_type"] in ("application/octet-stream", "application/pdf")
        assert "url" in first_stream.meta and first_stream.meta["url"] == PDF_URL

    @pytest.mark.integration
    def test_link_content_fetcher_multiple_different_content_types(self):
        """
        This test is to ensure that the fetcher can handle a list of URLs that contain different content types.
        """
        fetcher = LinkContentFetcher()
        streams = fetcher.run([PDF_URL, HTML_URL])["streams"]
        assert len(streams) == 2
        for stream in streams:
            assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
            if stream.meta["content_type"] == "text/html":
                assert "Haystack" in stream.data.decode("utf-8")
            elif stream.meta["content_type"] == "application/pdf":
                assert len(stream.data) > 0

    @pytest.mark.integration
    def test_link_content_fetcher_multiple_html_streams(self):
        """
        This test is to ensure that the fetcher can handle a list of URLs that contain different content types,
        and that we have two html streams.
        """

        fetcher = LinkContentFetcher()
        streams = fetcher.run([PDF_URL, HTML_URL, "https://google.com"])["streams"]
        assert len(streams) == 3
        for stream in streams:
            assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
            if stream.meta["content_type"] == "text/html":
                # Decode once; either of the two HTML pages may be the one in hand.
                page_text = stream.data.decode("utf-8")
                assert "Haystack" in page_text or "Google" in page_text
            elif stream.meta["content_type"] == "application/pdf":
                assert len(stream.data) > 0

    @pytest.mark.integration
    def test_mix_of_good_and_failed_requests(self):
        """
        This test is to ensure that the fetcher can handle a list of URLs that contain URLs that fail to be fetched.
        In such a case, the fetcher should return the content of the URLs that were successfully fetched and not raise
        an exception.
        """
        fetcher = LinkContentFetcher()
        result = fetcher.run(["https://non_existent_website_dot.com/", "https://www.google.com/"])
        assert len(result["streams"]) == 1
        first_stream = result["streams"][0]
        assert first_stream.meta["content_type"] == "text/html"

    @pytest.mark.integration
    def test_bad_request_exception_raised(self):
        """
        This test is to ensure that the fetcher raises an exception when a single bad request is made and it is
        configured to do so.
        """
        fetcher = LinkContentFetcher()
        with pytest.raises(requests.exceptions.ConnectionError):
            fetcher.run(["https://non_existent_website_dot.com/"])