# unstructured
# 693 lines · 23.9 KB
1import os2import pathlib3from unittest.mock import patch4
5import pytest6import requests7from requests.models import Response8
9from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path10from unstructured.chunking.title import chunk_by_title11from unstructured.cleaners.core import clean_extra_whitespace12from unstructured.documents.elements import EmailAddress, ListItem, NarrativeText, Table, Title13from unstructured.documents.html import HTMLTitle14from unstructured.partition.html import partition_html15
# Resolved directory containing this test module; example documents are located
# relative to it by the tests below.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# Elements the tests expect when partitioning "fake-html-lang-de.html".
EXPECTED_OUTPUT_LANGUAGE_DE = [
    Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
]
21
22
def test_partition_html_from_filename():
    """Partition example-10k.html by filename and check file metadata and absence of page breaks."""
    directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
    html_path = os.path.join(directory, "example-10k.html")

    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories

    first_metadata = elements[0].metadata
    assert first_metadata.filename == "example-10k.html"
    assert first_metadata.file_directory == directory
32
def test_partition_html_from_filename_returns_html_elements():
    """The first element partitioned from example-10k.html is an `HTMLTitle`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    first_element = elements[0]
    assert isinstance(first_element, HTMLTitle)
40
def test_partition_html_from_filename_with_metadata_filename():
    """`metadata_filename` is the filename recorded on every element's metadata."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    elements = partition_html(filename=html_path, metadata_filename="test")

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "test"
48
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_filename_raises_encoding_error(filename, encoding, error):
    """Partitioning a UTF-16 example document with `encoding="utf-8"` raises `error`.

    NOTE(review): despite the test name, the document is passed as an open file
    (`file=`), not via `filename=` — confirm which code path was meant to be covered.
    """
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with pytest.raises(error):
        with open(html_path) as html_file:
            partition_html(file=html_file, encoding=encoding)
62
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_filename_default_encoding(filename):
    """Partition by filename without an explicit encoding and check the recorded filename."""
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    elements = partition_html(filename=filename_path)

    assert len(elements) > 0
    assert all(element.metadata.filename == filename for element in elements)
    # Only the German document has a fully pinned expected output.
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
80
def test_partition_html_from_filename_metadata_false():
    """With `include_metadata=False`, every element's metadata serializes to an empty dict."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    elements = partition_html(filename=html_path, include_metadata=False)

    for element in elements:
        assert not element.metadata.to_dict()
88
def test_partition_html_with_page_breaks():
    """`include_page_breaks=True` produces at least one element in the "PageBreak" category."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    elements = partition_html(filename=html_path, include_page_breaks=True)

    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories
    assert len(elements) > 0
    assert all(element.metadata.filename == "example-10k.html" for element in elements)
97
def test_partition_html_from_file():
    """Partitioning from an open text-mode file leaves `metadata.filename` unset."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    with open(html_path) as html_file:
        elements = partition_html(file=html_file)

    assert len(elements) > 0
    assert all(element.metadata.filename is None for element in elements)
106
def test_partition_html_from_file_with_metadata_filename():
    """`metadata_filename` is recorded on every element when partitioning from a file object."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    with open(html_path) as html_file:
        elements = partition_html(file=html_file, metadata_filename="test")

    assert len(elements) > 0
    assert all(element.metadata.filename == "test" for element in elements)
115
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_file_raises_encoding_error(filename, encoding, error):
    """Partitioning a UTF-16 document from a text-mode file with `encoding="utf-8"` raises `error`.

    Exactly one exception expectation is declared. Wrapping the call in a second
    `pytest.raises` for a different exception type would either be a no-op (the
    exception propagates through it) or swallow the exception and make the outer
    expectation fail, so it can only obscure what the test checks.
    """
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with pytest.raises(error):
        with open(html_path) as html_file:
            partition_html(file=html_file, encoding=encoding)
129
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_file_default_encoding(filename):
    """Partition from a text-mode file without an explicit encoding.

    For the German example document the full element list is compared against
    `EXPECTED_OUTPUT_LANGUAGE_DE`.
    """
    # Keep the parametrized bare name in `filename`: rebinding it to the joined path
    # would make the comparison against "fake-html-lang-de.html" below always false
    # and silently skip the expected-output assertion.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path) as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
146
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, error):
    """Partitioning a UTF-16 document from a binary-mode file with `encoding="utf-8"` raises."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with pytest.raises(error):
        with open(html_path, "rb") as binary_file:
            partition_html(file=binary_file, encoding=encoding)
160
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_file_rb_default_encoding(filename):
    """Partition from a binary-mode file without an explicit encoding.

    For the German example document the full element list is compared against
    `EXPECTED_OUTPUT_LANGUAGE_DE`.
    """
    # Keep the parametrized bare name in `filename`: rebinding it to the joined path
    # would make the comparison against "fake-html-lang-de.html" below always false
    # and silently skip the expected-output assertion.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path, "rb") as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
177
def test_partition_html_from_text():
    """Partitioning the text content of example-10k.html yields at least one element."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        html_source = html_file.read()

    elements = partition_html(text=html_source)

    assert len(elements) > 0
185
def test_partition_html_from_text_works_with_empty_string():
    """An empty text input partitions to an empty list."""
    elements = partition_html(text="")
    assert elements == []
189
class MockResponse:
    """Lightweight response double used as the return value of a patched `requests.get`.

    Attributes set on the instance:
        text: the value passed as `text`.
        status_code: the value passed as `status_code`.
        ok: True when `status_code` is below 300.
        headers: the mapping passed as `headers`, or a new empty dict when omitted.
    """

    def __init__(self, text, status_code, headers=None):
        self.text = text
        self.status_code = status_code
        self.ok = status_code < 300
        # Create a fresh dict per instance; a `{}` default in the signature would be
        # shared by every instance constructed without explicit headers.
        self.headers = {} if headers is None else headers
197
def test_partition_html_from_url():
    """Partition from a URL whose patched `requests.get` returns a 200 text/html response."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body,
        status_code=200,
        headers={"Content-Type": "text/html"},
    )
    with patch.object(requests, "get", return_value=fake_response):
        elements = partition_html(url="https://fake.url")

    assert len(elements) > 0
213
def test_partition_html_from_url_raises_with_bad_status_code():
    """A 500 response from the patched `requests.get` makes `partition_html` raise `ValueError`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body,
        status_code=500,
        headers={"Content-Type": "text/html"},
    )
    with patch.object(requests, "get", return_value=fake_response):
        with pytest.raises(ValueError):
            partition_html(url="https://fake.url")
227
def test_partition_html_from_url_raises_with_bad_content_type():
    """An application/json Content-Type on the response makes `partition_html` raise `ValueError`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    with patch.object(requests, "get", return_value=fake_response):
        with pytest.raises(ValueError):
            partition_html(url="https://fake.url")
241
def test_partition_from_url_uses_headers(mocker):
    """`partition_html(url=..., headers=...)` forwards the headers to `requests.get`."""
    test_url = "https://example.com"
    test_headers = {"User-Agent": "test"}

    # Build a real `Response` carrying a tiny HTML body.
    fake_response = Response()
    fake_response.status_code = 200
    fake_response._content = (
        b"<html><head></head><body><p>What do i know? Who needs to know it?</p></body></html>"
    )
    fake_response.headers = {"Content-Type": "text/html"}

    mock_get = mocker.patch("requests.get", return_value=fake_response)

    partition_html(url=test_url, headers=test_headers)

    # requests.get must have been called exactly once with these arguments.
    mock_get.assert_called_once_with(test_url, headers=test_headers, verify=True)
260
def test_partition_html_raises_with_none_specified():
    """Calling `partition_html` with no input source raises `ValueError`."""
    with pytest.raises(ValueError):
        partition_html()
265
def test_partition_html_raises_with_too_many_specified():
    """Supplying both `filename` and `text` raises `ValueError`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        html_source = html_file.read()

    with pytest.raises(ValueError):
        partition_html(filename=html_path, text=html_source)
274
def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
    """The ideas page partitions into a single `Table` element with the expected text.

    NOTE(review): the default `filename` is a relative path, so this presumably only
    resolves when pytest runs from the repository root — confirm.
    """
    expected_text = (
        "January 2023 ( Someone fed my essays into GPT to make something "
        "that could answer\nquestions based on them, then asked it where good "
        "ideas come from. The\nanswer was ok, but not what I would have said. "
        "This is what I would have said.) The way to get new ideas is to notice "
        "anomalies: what seems strange,\nor missing, or broken? You can see anomalies"
        " in everyday life (much\nof standup comedy is based on this), but the best "
        "place to look for\nthem is at the frontiers of knowledge. Knowledge grows "
        "fractally.\nFrom a distance its edges look smooth, but when you learn "
        "enough\nto get close to one, you'll notice it's full of gaps. These "
        "gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx "
        "or wondered about y. In the best case, exploring such gaps yields\nwhole "
        "new fractal buds."
    )

    elements = partition_html(filename=filename)

    assert len(elements) == 1
    assert elements[0] == Table(text=expected_text)

    table_metadata = elements[0].metadata
    assert table_metadata.emphasized_text_contents is None
    assert table_metadata.link_urls is None
    assert table_metadata.text_as_html is not None
297
def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch):
    """A copy of example-10k.html with mode 0o444 can still be partitioned."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    # Create the target file inside pytest's temporary directory.
    read_only_file = tmp_path / "example-10k-readonly.html"
    read_only_file.touch()

    # Copy the example document's content into it.
    with open(source_path) as source_file:
        read_only_file.write_text(source_file.read())

    # Drop write permission: read-only for owner, group and others.
    read_only_file.chmod(0o444)

    # Partitioning must succeed regardless.
    elements = partition_html(filename=read_only_file.resolve())
    assert len(elements) > 0
322
def test_partition_html_processes_chinese_chracters():
    """Chinese text inside a paragraph is preserved verbatim in the first element."""
    elements = partition_html(text="<html><div><p>每日新闻</p></div></html>")
    first_element = elements[0]
    assert first_element.text == "每日新闻"
328
def test_emoji_appears_with_emoji_utf8_code():
    """An emoji in the HTML text survives partitioning in the resulting `Title`."""
    html_text = """\n<html charset="utf-8"><p>Hello 😀</p></html>"""
    elements = partition_html(text=html_text)
    expected = Title("Hello 😀")
    assert elements[0] == expected
334
def test_partition_html_can_turn_off_assemble_articles():
    """With `html_assemble_articles=False`, the heading outside the articles is the last element."""
    html_text = """<html>
<article>
<h1>Some important stuff is going on!</h1>
<p>Here is a description of that stuff</p>
</article>
<article>
<h1>Some other important stuff is going on!</h1>
<p>Here is a description of that stuff</p>
</article>
<h4>This is outside of the article.</h4>
</html>
"""
    elements = partition_html(text=html_text, html_assemble_articles=False)
    last_element = elements[-1]
    assert last_element == Title("This is outside of the article.")
351
def test_partition_html_with_pre_tag():
    """fake-html-pre.htm partitions with a leading `NarrativeText` and HTML file metadata."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-html-pre.htm")

    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories

    first_element = elements[0]
    normalized_text = clean_extra_whitespace(first_element.text)
    assert normalized_text.startswith("[107th Congress Public Law 56]")
    assert isinstance(first_element, NarrativeText)
    assert first_element.metadata.filetype == "text/html"
    assert first_element.metadata.filename == "fake-html-pre.htm"
363
def test_partition_html_from_filename_exclude_metadata():
    """With `include_metadata=False`, filename and file_directory are `None` on the first element."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    elements = partition_html(filename=html_path, include_metadata=False)

    assert len(elements) > 0
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories

    first_metadata = elements[0].metadata
    assert first_metadata.filename is None
    assert first_metadata.file_directory is None
373
def test_partition_html_metadata_date(mocker, filename="example-docs/fake-html.html"):
    """`metadata.last_modified` equals the value returned by the patched date helper.

    NOTE(review): the default `filename` is a relative path, so this presumably only
    resolves when pytest runs from the repository root — confirm.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    elements = partition_html(filename=filename)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == mocked_last_modification_date
386
def test_partition_html_from_file_metadata_date(
    mocker,
    filename="example-docs/fake-html.html",
):
    """When partitioning from a file, `last_modified` comes from the patched file-date helper.

    NOTE(review): the default `filename` is a relative path, so this presumably only
    resolves when pytest runs from the repository root — confirm.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename) as html_file:
        elements = partition_html(file=html_file)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == mocked_last_modification_date
404
def test_partition_html_custom_metadata_date(
    mocker,
    filename="example-docs/fake-html.html",
):
    """An explicit `metadata_last_modified` wins over the patched date helper's value.

    NOTE(review): the default `filename` is a relative path, so this presumably only
    resolves when pytest runs from the repository root — confirm.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    elements = partition_html(
        filename=filename,
        metadata_last_modified=expected_last_modification_date,
    )

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == expected_last_modification_date
425
def test_partition_html_from_file_custom_metadata_date(
    mocker,
    filename="example-docs/fake-html.html",
):
    """For file input, an explicit `metadata_last_modified` wins over the patched helper's value.

    NOTE(review): the default `filename` is a relative path, so this presumably only
    resolves when pytest runs from the repository root — confirm.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename) as html_file:
        elements = partition_html(
            file=html_file,
            metadata_last_modified=expected_last_modification_date,
        )

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == expected_last_modification_date
447
def test_partition_html_from_text_metadata_date(filename="example-docs/fake-html.html"):
    """Text input has no last-modified date: `metadata.last_modified` is `None`.

    The `filename` parameter is not used by the test body.
    """
    html_text = "<html><div><p>TEST</p></div></html>"

    elements = partition_html(text=html_text)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified is None
454
def test_partition_html_from_text_custom_metadata_date(
    filename="example-docs/fake-html.html",
):
    """For text input, `metadata_last_modified` is recorded as `metadata.last_modified`.

    The `filename` parameter is not used by the test body.
    """
    expected_last_modification_date = "2020-07-05T09:24:28"
    html_text = "<html><div><p>TEST</p></div></html>"

    elements = partition_html(
        text=html_text,
        metadata_last_modified=expected_last_modification_date,
    )

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == expected_last_modification_date
468
def test_partition_html_grabs_links():
    """Link URLs and link texts are captured in each element's metadata, or `None` when absent."""
    html_text = """<html>
<p>Hello there I am a <a href="/link">very important link!</a></p>
<p>Here is a list of my favorite things</p>
<ul>
<li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>
<li>Dogs</li>
</ul>
<a href="/loner">A lone link!</a>
</html>"""
    elements = partition_html(text=html_text)

    # Paragraph containing an inline link.
    paragraph_with_link = elements[0]
    assert paragraph_with_link == NarrativeText("Hello there I am a very important link!")
    assert paragraph_with_link.metadata.link_urls == ["/link"]
    assert paragraph_with_link.metadata.link_texts == ["very important link!"]

    # Paragraph without any link.
    plain_paragraph = elements[1]
    assert plain_paragraph == NarrativeText("Here is a list of my favorite things")
    assert plain_paragraph.metadata.link_urls is None
    assert plain_paragraph.metadata.link_texts is None

    # List item wrapping a link.
    linked_item = elements[2]
    assert linked_item == ListItem("Parrots")
    assert linked_item.metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"]
    assert linked_item.metadata.link_texts == ["Parrots"]

    # List item without a link.
    plain_item = elements[3]
    assert plain_item == ListItem("Dogs")
    assert plain_item.metadata.link_urls is None
    assert plain_item.metadata.link_texts is None

    # Anchor that is not nested in a paragraph or list.
    lone_link = elements[4]
    assert lone_link == Title("A lone link!")
    assert lone_link.metadata.link_urls == ["/loner"]
    assert lone_link.metadata.link_texts == ["A lone link!"]
501
def test_partition_html_from_filename_with_skip_headers_and_footers(
    filename="example-docs/fake-html-with-footer-and-header.html",
):
    """With `skip_headers_and_footers=True`, no element has a header or footer ancestor tag.

    NOTE(review): the default `filename` is a relative path, so this presumably only
    resolves when pytest runs from the repository root — confirm.
    """
    elements = partition_html(filename=filename, skip_headers_and_footers=True)

    for element in elements:
        ancestor_tags = element.ancestortags
        assert "footer" not in ancestor_tags
        assert "header" not in ancestor_tags
511
def test_partition_html_from_file_with_skip_headers_and_footers(
    filename="example-docs/fake-html-with-footer-and-header.html",
):
    """For file input, `skip_headers_and_footers=True` leaves no header/footer ancestor tags.

    NOTE(review): the default `filename` is a relative path, so this presumably only
    resolves when pytest runs from the repository root — confirm.
    """
    with open(filename) as html_file:
        elements = partition_html(file=html_file, skip_headers_and_footers=True)

    for element in elements:
        ancestor_tags = element.ancestortags
        assert "footer" not in ancestor_tags
        assert "header" not in ancestor_tags
522
def test_partition_html_from_text_with_skip_headers_and_footers():
    """For text input, `skip_headers_and_footers=True` leaves no header/footer ancestor tags."""
    text = """
<!DOCTYPE html>
<html>
<header>
<p>Header</p>
</header>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
<footer>
<p>Footer</p>
</footer>
</html>"""
    elements = partition_html(text=text, skip_headers_and_footers=True)

    for element in elements:
        ancestor_tags = element.ancestortags
        assert "footer" not in ancestor_tags
        assert "header" not in ancestor_tags
544
def test_partition_html_from_url_with_skip_headers_and_footers(mocker):
    """For URL input, `skip_headers_and_footers=True` leaves no header/footer ancestor tags."""
    test_url = "https://example.com"
    test_headers = {"User-Agent": "test"}

    # Real `Response` object carrying a page with header, body and footer sections.
    fake_response = Response()
    fake_response.status_code = 200
    fake_response._content = b"""<html>
<header>
<p>Header</p>
</header>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
<footer>
<p>Footer</p>
</footer>
</html>"""
    fake_response.headers = {"Content-Type": "text/html"}

    mocker.patch("requests.get", return_value=fake_response)

    elements = partition_html(url=test_url, headers=test_headers, skip_headers_and_footers=True)

    for element in elements:
        ancestor_tags = element.ancestortags
        assert "footer" not in ancestor_tags
        assert "header" not in ancestor_tags
573
def test_partition_html_grabs_emphasized_texts():
    """Emphasized text contents and their tags are captured in metadata, or `None` when absent."""
    html_text = """<html>
<p>Hello there I am a very <strong>important</strong> text!</p>
<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>
<ul>
<li><em>Parrots</em></li>
<li>Dogs</li>
</ul>
<span>A lone span text!</span>
</html>"""
    elements = partition_html(text=html_text)

    # Paragraph with a single <strong> run.
    strong_paragraph = elements[0]
    assert strong_paragraph == NarrativeText("Hello there I am a very important text!")
    assert strong_paragraph.metadata.emphasized_text_contents == ["important"]
    assert strong_paragraph.metadata.emphasized_text_tags == ["strong"]

    # Paragraph with <span>, <b> and a nested <i>.
    nested_paragraph = elements[1]
    assert nested_paragraph == NarrativeText("Here is a list of my favorite things")
    expected_contents = ["list", "my favorite things", "favorite"]
    assert nested_paragraph.metadata.emphasized_text_contents == expected_contents
    assert nested_paragraph.metadata.emphasized_text_tags == ["span", "b", "i"]

    # List item wrapped in <em>.
    emphasized_item = elements[2]
    assert emphasized_item == ListItem("Parrots")
    assert emphasized_item.metadata.emphasized_text_contents == ["Parrots"]
    assert emphasized_item.metadata.emphasized_text_tags == ["em"]

    # List item without emphasis.
    plain_item = elements[3]
    assert plain_item == ListItem("Dogs")
    assert plain_item.metadata.emphasized_text_contents is None
    assert plain_item.metadata.emphasized_text_tags is None

    # Free-standing <span>.
    lone_span = elements[4]
    assert lone_span == Title("A lone span text!")
    assert lone_span.metadata.emphasized_text_contents == ["A lone span text!"]
    assert lone_span.metadata.emphasized_text_tags == ["span"]
610
def test_partition_html_with_json():
    """Elements partitioned from example-10k.html pass `assert_round_trips_through_JSON`."""
    html_path = example_doc_path("example-10k.html")
    elements = partition_html(html_path)
    assert_round_trips_through_JSON(elements)
615
def test_pre_tag_parsing_respects_order():
    """Interleaved <pre> and <div> tags come back as elements in document order."""
    html_text = """
<pre>The Big Brown Bear</pre>
<div>The big brown bear is growling.</div>
<pre>The big brown bear is sleeping.</pre>
<div>The Big Blue Bear</div>
"""
    expected_elements = [
        Title("The Big Brown Bear"),
        NarrativeText("The big brown bear is growling."),
        NarrativeText("The big brown bear is sleeping."),
        Title("The Big Blue Bear"),
    ]

    elements = partition_html(text=html_text)

    assert elements == expected_elements
631
def test_add_chunking_strategy_on_partition_html(
    filename="example-docs/example-10k.html",
):
    """`chunking_strategy="by_title"` gives the same result as `chunk_by_title` on plain output.

    NOTE(review): the default `filename` is a relative path, so this presumably only
    resolves when pytest runs from the repository root — confirm.
    """
    plain_elements = partition_html(filename=filename)
    chunked_by_partition = partition_html(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(plain_elements)

    # Chunking must actually change the output, and match the standalone chunker.
    assert chunked_by_partition != plain_elements
    assert chunked_by_partition == chunked_manually
641
def test_html_heading_title_detection():
    """Heading tags map to `Title`, except email and bulleted content inside headings."""
    html_text = """
<p>This is a section of narrative text, it's long, flows and has meaning</p>
<h1>This is a section of narrative text, it's long, flows and has meaning</h1>
<h2>A heading that is at the second level</h2>
<h3>Finally, the third heading</h3>
<h2>December 1-17, 2017</h2>
<h3>email@example.com</h3>
<h3><li>- bulleted item</li></h3>
"""
    expected_elements = [
        NarrativeText("This is a section of narrative text, it's long, flows and has meaning"),
        Title("This is a section of narrative text, it's long, flows and has meaning"),
        Title("A heading that is at the second level"),
        Title("Finally, the third heading"),
        Title("December 1-17, 2017"),
        EmailAddress("email@example.com"),
        ListItem("- bulleted item"),
    ]

    elements = partition_html(text=html_text)

    assert elements == expected_elements
663
def test_partition_html_element_metadata_has_languages():
    """The first element of example-10k.html reports `["eng"]` as its languages."""
    # Anchor the path to this module's directory, like the other DIRECTORY-based
    # tests in this file, so the test does not depend on the working directory.
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    elements = partition_html(filename=filename)
    assert elements[0].metadata.languages == ["eng"]
669
def test_partition_html_respects_detect_language_per_element():
    """`detect_language_per_element=True` records languages separately for each element."""
    # Anchor the path to this module's directory, like the other DIRECTORY-based
    # tests in this file, so the test does not depend on the working directory.
    filename = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "example-docs",
        "language-docs",
        "eng_spa_mult.html",
    )
    elements = partition_html(filename=filename, detect_language_per_element=True)
    langs = [element.metadata.languages for element in elements]
    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
676
@pytest.mark.parametrize(
    ("tag", "expected"),
    [
        ("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
        ("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
    ],
)
def test_partition_html_with_table_without_tbody(tag: str, expected: str):
    """A table whose only section is <thead> or <tfoot> yields the expected `text_as_html`."""
    table_lines = [
        "<table>\n",
        f" <{tag}>\n",
        " <tr><th>Header 1</th><th>Header 2</th></tr>\n",
        f" </{tag}>\n",
        "</table>",
    ]
    table_html = "".join(table_lines)

    partitions = partition_html(text=table_html)

    first_partition = partitions[0]
    assert first_partition.metadata.text_as_html == expected