unstructured
1009 строк · 30.7 Кб
1# pyright: reportPrivateUsage=false
2
3import os4import pathlib5from typing import Dict, List6
7import pytest8from lxml import etree9from lxml import html as lxml_html10
11from unstructured.documents import html12from unstructured.documents.base import Page13from unstructured.documents.elements import (14Address,15ListItem,16NarrativeText,17Table,18Text,19Title,20)
21from unstructured.documents.html import (22HEADING_TAGS,23LIST_ITEM_TAGS,24SECTION_TAGS,25TABLE_TAGS,26TEXT_TAGS,27HTMLDocument,28HTMLNarrativeText,29HTMLTable,30HTMLTitle,31TagsMixin,32_parse_HTMLTable_from_table_elem,33)
34
# -- absolute path of the directory that contains this test module --
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# -- HTML tag names, written as a run of `<tag>` tokens and split into a list of bare names --
TAGS = (
    "<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
    "<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
    "<colgroup><data><datalist><dd><del><details><dfn><dialog><dir><div><dl><dt><em><embed>"
    "<fieldset><figcaption><figure><font><footer><form><frame><frameset><h1><h2><h3><h4><h5>"
    "<h6><head><header><hr><html><i><iframe><img><input><ins><kbd><label><legend><li><link>"
    "<main><map><mark><meta><meter><nav><noframes><noscript><object><ol><optgroup><option>"
    "<output><p><param><picture><pre><progress><q><rp><rt><ruby><s><samp><script><section>"
    "<select><small><source><span><strike><strong><style><sub><summary><sup><table><tbody><td>"
    "<template><textarea><tfoot><th><thead><time><title><tr><track><tt><u><ul><var><video><wbr>"
).strip("<>").split("><")

# -- tag names listed separately as void tags, built the same way --
VOID_TAGS = (
    "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
).strip("<>").split("><")
58
# -- tags from the text, heading, list-item and section tag groups --
INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS
# -- every remaining tag that is not a table tag, a void tag, or a document-skeleton tag --
EXCLUDED_TAGS = [
    tag
    for tag in TAGS
    if tag not in INCLUDED_TAGS
    and tag not in TABLE_TAGS
    and tag not in VOID_TAGS
    and tag not in ["html", "head", "body"]
]
65
66
67# -- table-extraction behaviors ------------------------------------------------------------------
68
69
def test_it_can_parse_a_bare_bones_table_to_an_HTMLTable_element():
    """Bare-bones means no `<thead>`, `<tbody>`, or `<tfoot>` elements."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
        " <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "</table>"
    )

    document = HTMLDocument.from_string(markup)

    # -- unpacking fails unless the document holds exactly one element --
    [table] = document.elements
    assert isinstance(table, HTMLTable)
    # -- `.text` is one joined string; row and cell boundaries are not represented in it --
    assert table.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
    # -- `.text_as_html` is the longer form that keeps the table structure --
    assert table.text_as_html == expected_html
99
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements():
    """Cells within a `table/thead` element are included in the text and html.

    The presence of a `<thead>` element in the original also determines whether a `<thead>`
    element appears in `.text_as_html` or whether the first row of cells is simply in the body.
    """
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )

    document = HTMLDocument.from_string(markup)

    [table] = document.elements
    assert isinstance(table, HTMLTable)
    assert table.text_as_html == expected_html
138
def test_it_does_not_emit_an_HTMLTable_element_for_a_table_with_no_text():
    """A table whose cells hold only whitespace is expected to yield no elements."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )

    document = HTMLDocument.from_string(markup)

    assert document.elements == []
155
def test_it_does_not_consider_an_empty_table_a_bulleted_text_table():
    """`_is_bulleted_table()` is expected to be False for a table of whitespace-only cells."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    root = HTMLDocument.from_string(markup).document_tree
    assert root is not None
    table_elem = root.find(".//table")
    assert table_elem is not None

    assert html._is_bulleted_table(table_elem) is False
175
def test_it_provides_parseable_HTML_in_text_as_html():
    """`.text_as_html` can be parsed by lxml and serializes back to the expected markup."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    [table] = HTMLDocument.from_string(markup).elements
    assert isinstance(table, HTMLTable)
    table_markup = table.text_as_html
    assert table_markup is not None

    # -- named `parsed_root` so the imported `html` module is not shadowed in this function --
    parsed_root = etree.fromstring(table_markup, etree.HTMLParser())

    assert parsed_root is not None
    # -- lxml adds the <html><body> container, that's not present in `.text_as_html` --
    assert etree.tostring(parsed_root, encoding=str) == (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )
216
217# -- element-suppression behaviors ---------------------------------------------------------------
218
219
def test_it_does_not_extract_text_in_script_tags():
    """No element parsed from the example file may contain the text `function (`."""
    example_path = os.path.join(
        DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html"
    )

    document = HTMLDocument.from_file(filename=example_path)

    assert not any("function (" in el.text for el in document.elements)
225
def test_it_does_not_extract_text_in_style_tags():
    """Only the paragraph text is expected, not the contents of the nested `<style>`."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
        "</body>\n"
        "</html>"
    )

    document = HTMLDocument.from_string(markup)

    [paragraph] = document.elements
    assert isinstance(paragraph, Text)
    assert paragraph.text == "Lorem ipsum dolor"
241
242# ------------------------------------------------------------------------------------------------
243
244
def test_parses_tags_correctly():
    """The first element's ancestor tags plus its own tag should read html > body > table."""
    markup = """<html>
<body>
<table>
<tbody>
<tr>
<td><p>Hi there!</p></td>
</tr>
</tbody>
</table>
</body>
</html>"""
    first = HTMLDocument.from_string(markup).elements[0]
    tag_path = first.ancestortags + (first.tag,)
    assert tag_path == ("html", "body", "table")
261
def test_has_table_ancestor():
    """An element whose ancestor tags include table tags is reported as having one."""
    ancestors = ["html", "body", "table", "tr"]
    cell_title = HTMLTitle("I am a Title", tag="td", ancestortags=ancestors)
    assert html.has_table_ancestor(cell_title)
270
def test_has_no_table_ancestor():
    """An element with only `html` and `body` ancestors has no table ancestor."""
    ancestors = ["html", "body"]
    plain_title = HTMLTitle("I am a Title", tag="p", ancestortags=ancestors)
    assert not html.has_table_ancestor(plain_title)
279
def test_read_without_skipping_table(monkeypatch):
    """With `skip_table=False` the table text is kept as a `Table` element."""
    # -- force every candidate text to be accepted as narrative text --
    monkeypatch.setattr(html, "is_possible_narrative_text", lambda *args: True)
    markup = """<html>
<body>
<table>
<tbody>
<tr>
<td><p>Hi there! I am Matt!</p></td>
</tr>
</tbody>
</table>
</body>
</html>"""
    cleaned = HTMLDocument.from_string(markup).doc_after_cleaners(skip_table=False)
    first_element = cleaned.pages[0].elements[0]
    assert first_element == Table(text="Hi there! I am Matt!")
296
# NOTE(review): the first case closes `<b>` with `</i>` instead of `</b>`; this looks like a
# typo in the fixture markup -- confirm before changing it, since it is test input.
@pytest.mark.parametrize(
    ("doc", "expected"),
    [
        (
            "<p>Hi there <span>my name is</span> <b><i>Matt</i></i></p>",
            "Hi there my name is Matt",
        ),
        ("<p>I have a</p> tail", "I have a tail"),
    ],
)
def test_construct_text(doc, expected):
    """`_construct_text()` on the first `<p>` of `doc` should produce `expected`."""
    tree = etree.fromstring(doc, etree.HTMLParser())
    paragraph = tree.find(".//p")
    assert html._construct_text(paragraph) == expected
313
@pytest.mark.parametrize(
    ("doc", "root", "expected"),
    [
        (
            "<p>Hello <strong>there</strong> I <em>am</em> a <b>very</b> <i>important</i> text</p>",
            "p",
            [
                {"text": "there", "tag": "strong"},
                {"text": "am", "tag": "em"},
                {"text": "very", "tag": "b"},
                {"text": "important", "tag": "i"},
            ],
        ),
        (
            "<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>",
            "p",
            [
                {"text": "list", "tag": "span"},
                {"text": "my favorite things", "tag": "b"},
                {"text": "favorite", "tag": "i"},
            ],
        ),
        (
            "<strong>A lone strong text!</strong>",
            "strong",
            [{"text": "A lone strong text!", "tag": "strong"}],
        ),
        ("<span>I have a</span> tail", "span", [{"text": "I have a", "tag": "span"}]),
        ("<p>Text with no emphasized runs</p> ", "p", []),
    ],
)
def test_get_emphasized_texts_from_tag(doc: str, root: str, expected: List[Dict[str, str]]):
    """`_get_emphasized_texts_from_tag()` on the first `root` element should equal `expected`."""
    tree = etree.fromstring(doc, etree.HTMLParser())
    root_elem = tree.find(f".//{root}")
    assert root_elem is not None

    found = html._get_emphasized_texts_from_tag(root_elem)

    assert found == expected
354
def test_parse_nothing():
    """`_parse_tag()` is expected to return None for an empty paragraph."""
    tree = etree.fromstring("""<p></p>""", etree.HTMLParser())
    paragraph = tree.find(".//p")
    assert html._parse_tag(paragraph) is None
362
def test_read_with_existing_pages():
    """A document built with `from_pages()` exposes the pages it was given."""
    only_page = Page(number=0)
    document = HTMLDocument.from_pages([only_page])
    assert document.pages == [only_page]
368
def test_parse_not_anything(monkeypatch):
    """With the narrative and title checks forced False, the result should be plain `Text`."""
    for predicate_name in ("is_narrative_tag", "is_possible_title"):
        monkeypatch.setattr(html, predicate_name, lambda *args: False)
    tree = etree.fromstring("""<p>This is nothing</p>""", etree.HTMLParser())
    paragraph = tree.find(".//p")
    assert html._parse_tag(paragraph) == Text(text="This is nothing")
378
def test_parse_bullets(monkeypatch):
    """A paragraph starting with a bullet should parse to a `ListItem` without the bullet."""
    tree = etree.fromstring("""<p>● An excellent point!</p>""", etree.HTMLParser())
    paragraph = tree.find(".//p")
    assert html._parse_tag(paragraph) == ListItem("An excellent point!")
386
def test_parse_tag_ignores_lonely_bullets():
    """A paragraph containing only a bullet character should parse to None."""
    tree = etree.fromstring("""<p>●</p>""", etree.HTMLParser())
    paragraph = tree.find(".//p")
    assert html._parse_tag(paragraph) is None
394
def test_parse_tag_ignores_stubs():
    """A paragraph containing only `$` should parse to None."""
    tree = etree.fromstring("""<p>$</p>""", etree.HTMLParser())
    paragraph = tree.find(".//p")
    assert html._parse_tag(paragraph) is None
402
def test_adjacent_spans_are_text_tags():
    """A `<div>` holding two adjacent `<span>`s is expected to count as a text tag."""
    markup = """<div><span>•</span><span>A bullet!</span></div>"""
    div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    assert html._is_text_tag(div) is True
409
def test_process_list_item_gets_next_section():
    """A bullets-only `<div>` should take its list-item text from the following `<div>`."""
    markup = """
<div>
<p>●</p>
<p>●</p>
</div>
<div>
<p>An excellent point!</p>
</div>

"""
    first_div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    list_item, _ = html._process_list_item(first_div, max_predecessor_len=10)
    assert list_item == ListItem(text="An excellent point!")
426
def test_get_bullet_descendants():
    """`_get_bullet_descendants()` should return one descendant for this pair of `<div>`s."""
    parser_input_bullets = "<div><p>●</p><p>●</p></div>"
    parser_input_text = "<div><p>An excellent point!</p></div>"

    bullets_div = etree.fromstring(parser_input_bullets, etree.HTMLParser()).find(".//div")
    text_div = etree.fromstring(parser_input_text, etree.HTMLParser()).find(".//div")

    found = html._get_bullet_descendants(bullets_div, text_div)
    assert len(found) == 1
439
def test_process_list_item_returns_none_if_next_blank():
    """With nothing after the bullets-only `<div>`, no list item is expected."""
    markup = """
<div>
<p>●</p>
<p>●</p>
</div>

"""
    first_div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    list_item, _ = html._process_list_item(first_div)
    assert list_item is None
453
def test_process_list_item_returns_none_if_next_has_no_text():
    """When the following `<div>` is empty, no list item is expected."""
    markup = """
<div>
<p>●</p>
<p>●</p>
</div>
<div>
</div>
"""
    first_div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    # -- the bullets-only div must still be recognized as a list-item tag --
    assert html.is_list_item_tag(first_div) is True
    list_item, _ = html._process_list_item(first_div)
    assert list_item is None
469
def test_process_list_item_ignores_deep_divs():
    """Five bullet paragraphs with `max_predecessor_len=2` should yield no list item."""
    markup = """
<div>
<p>●</p>
<p>●</p>
<p>●</p>
<p>●</p>
<p>●</p>
</div>
<div>
<p>An excellent point!</p>
</div>

"""
    first_div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    list_item, _ = html._process_list_item(first_div, max_predecessor_len=2)
    assert list_item is None
489
def test_read_html_doc(tmpdir, monkeypatch):
    """Read an HTML file and check the pages left after skipping headers, footers and tables.

    The sample document has a header, two titled sections, a table, an `<hr>`, a third titled
    section, a footer, and trailing content. After cleaning with
    `skip_headers_and_footers=True` and `skip_table=True`, two pages are expected: the first
    with the first two title/section pairs, the second with the third pair.

    `monkeypatch` is not used by this test; it is kept so the signature is unchanged.
    """
    TITLE1 = "A Great and Glorious Section"
    SECTION1 = "Dear Leader is the best. He is such a wonderful engineer!"
    TITLE2 = "Another Magnificent Title"
    SECTION2 = "The last section is a title because of its capitalization patterns!"
    TABLE_SECTION = "Skip me because I'm in a table"
    TITLE3 = "A New Beginning"
    SECTION3 = "Here is the start of a new page."

    doc = f"""<html>
<body>
<header>
<p>Here is a header. We want to ignore anything that is in this section.</p>
</header>
<h1>{TITLE1}</h1>
<p>{SECTION1}</p>
<p></p>
<p>{TITLE2}</p>
<p><b>{SECTION2}</b></p>
<table>
<tbody>
<tr>
<td><p>{TABLE_SECTION}</p></td>
</tr>
</tbody>
</table>
<hr>
<h2>{TITLE3}</h2>
<div>{SECTION3}</div>
<footer>
<p>Here is a footer. We want to ignore anything that is in this section</p>
</footer>
<div>
<p>Let's ignore anything after the footer too since it's probably garbage.</p>
</div>
</body>
</html>"""
    # -- write inside the per-test temp directory itself; `tmpdir.dirname` is its parent,
    # -- so the original location was outside the directory pytest created for this test
    filename = os.path.join(str(tmpdir), "sample-doc.html")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(doc)

    html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
        skip_headers_and_footers=True,
        skip_table=True,
    )

    assert len(html_document.pages) == 2

    page_one = html_document.pages[0]
    assert len(page_one.elements) == 4
    assert page_one.elements == [
        Title(text=TITLE1),
        NarrativeText(text=SECTION1),
        Title(text=TITLE2),
        NarrativeText(text=SECTION2),
    ]

    page_two = html_document.pages[1]
    assert len(page_two.elements) == 2
    assert page_two.elements == [
        Title(text=TITLE3),
        NarrativeText(text=SECTION3),
    ]

    pages = html_document.pages
    assert all(isinstance(page, Page) for page in pages)
560
def test_find_main():
    """`_find_main()` should return the `<main>` element when the document has one."""
    markup = """<header></header>
<body>
<p>Lots preamble stuff yada yada yada</p>
<main>
<article>
<section>
<h2>A Wonderful Section!</h2>
<p>Look at this amazing section!</p>
</section>
<section>
<h2>Another Wonderful Section!</h2>
<p>Look at this other amazing section!</p>
</section>
</article>
</main>
</body>"""
    tree = HTMLDocument.from_string(markup).document_tree
    found = html._find_main(tree)
    assert found.tag == "main"
583
def test_find_main_returns_doc_when_main_not_present():
    """Without a `<main>` element, `_find_main()` should return the `html` root."""
    markup = """<header></header>
<body>
<p>Lots preamble stuff yada yada yada</p>
<article>
<section>
<h2>A Wonderful Section!</h2>
<p>Look at this amazing section!</p>
</section>
<section>
<h2>Another Wonderful Section!</h2>
<p>Look at this other amazing section!</p>
</section>
</article>
</body>"""
    tree = HTMLDocument.from_string(markup).document_tree
    found = html._find_main(tree)
    assert found.tag == "html"
604
def test_find_articles():
    """`_find_articles()` should return both `<article>` elements of the document."""
    markup = """<header></header>
<body>
<p>Lots preamble stuff yada yada yada</p>
<article>
<h2>A Wonderful Section!</h2>
<p>Look at this amazing section!</p>
</article>
<article>
<h2>Another Wonderful Section!</h2>
<p>Look at this other amazing section!</p>
</article>
</body>"""
    tree = HTMLDocument.from_string(markup).document_tree
    found = html._find_articles(tree)
    assert len(found) == 2
623
def test_find_articles_returns_doc_when_none_present():
    """Without any `<article>`, `_find_articles()` should return a single item."""
    markup = """<header></header>
<body>
<p>Lots preamble stuff yada yada yada</p>
<section>
<h2>A Wonderful Section!</h2>
<p>Look at this amazing section!</p>
</section>
<section>
<h2>Another Wonderful Section!</h2>
<p>Look at this other amazing section!</p>
</section>
</body>"""
    tree = HTMLDocument.from_string(markup).document_tree
    found = html._find_articles(tree)
    assert len(found) == 1
642
def test_include_headers_and_footers(sample_doc):
    """With `skip_headers_and_footers=False` the second page keeps all three elements."""
    cleaned = sample_doc.doc_after_cleaners(skip_headers_and_footers=False)
    second_page = cleaned.pages[1]
    assert len(second_page.elements) == 3
647
def test_include_table_text(sample_doc):
    """With `skip_table=False` the first page keeps both of its elements."""
    cleaned = sample_doc.doc_after_cleaners(skip_table=False)
    first_page = cleaned.pages[0]
    assert len(first_page.elements) == 2
652
@pytest.mark.parametrize("tag", [tag for tag in TEXT_TAGS if tag not in TABLE_TAGS])
def test_tag_types(tag):
    """Text wrapped in a non-table text tag should produce exactly one element."""
    markup = f"""
<body>
<{tag}>
There is some text here.
</{tag}>
</body>
"""
    document = HTMLDocument.from_string(markup)
    first_page = document.pages[0]
    assert len(first_page.elements) == 1
665
@pytest.mark.parametrize("tag", EXCLUDED_TAGS)
def test_exclude_tag_types(tag):
    """Text wrapped in an excluded tag should leave the document with no pages."""
    markup = f"""
<body>
<{tag}>
There is some text here.
</{tag}>
</body>
"""
    document = HTMLDocument.from_string(markup)
    assert len(document.pages) == 0
678
def test_tag_types_table(sample_doc):
    """Checks the first page's element count after cleaning with `skip_table=True`."""
    cleaned = sample_doc.doc_after_cleaners(skip_table=True)
    first_page = cleaned.pages[0]
    assert len(first_page.elements) == 2
683
def test_nested_text_tags():
    """Text nested in two non-table text tags should still produce a single element."""
    non_table_text_tags = [tag for tag in TEXT_TAGS if tag not in TABLE_TAGS]
    # -- unpacking requires exactly two items, as the original two-item slice did --
    tag1, tag2 = non_table_text_tags[:2]
    markup = f"""
<body>
<{tag1}>
<{tag2}>
There is some text here.
</{tag2}>
</{tag1}>
</body>
"""
    cleaned = HTMLDocument.from_string(markup).doc_after_cleaners(skip_table=False)
    assert len(cleaned.pages[0].elements) == 1
698
def test_containers_with_text_are_processed():
    """Text placed directly in nested `<div>` containers is extracted as separate elements."""
    markup = """<div dir=3D"ltr">Hi All,<div><br></div>
<div>Get excited for our first annual family day!</div>
<div>Best.<br clear=3D"all">
<div><br></div>
-- <br>
<div dir=3D"ltr">
<div dir=3D"ltr">Dino the Datasaur<div>Unstructured Technologies<br><div>Data Scientist
</div>
<div>Doylestown, PA 18901</div>
<div><br></div>
</div>
</div>
</div>
</div>
</div>"""
    expected_elements = [
        Text(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Title(text="Dino the Datasaur"),
        Title(text="Unstructured Technologies"),
        Title(text="Data Scientist"),
        Address(text="Doylestown, PA 18901"),
    ]

    document = HTMLDocument.from_string(markup)

    assert document.elements == expected_elements
727
def test_html_grabs_bulleted_text_in_tags():
    """Each `<li>` of an ordered list should become a `ListItem`."""
    markup = """<html>
<body>
<ol>
<li>Happy Groundhog's day!</li>
<li>Looks like six more weeks of winter ...</li>
</ol>
</body>
</html>"""
    expected_items = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]

    document = HTMLDocument.from_string(markup)

    assert document.elements == expected_items
744
def test_html_grabs_bulleted_text_in_paras():
    """Bullet-prefixed text inside paragraphs should become `ListItem`s without the bullet."""
    markup = """<html>
<body>
<p>
<span>• Happy Groundhog's day!</span>
</p>
<p>
<span>• Looks like six more weeks of winter ...</span>
</p>
</body>
</html>"""
    expected_items = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]

    document = HTMLDocument.from_string(markup)

    assert document.elements == expected_items
763
def test_bulletized_bulleted_text_from_table():
    """`_bulleted_text_from_table()` should return one `ListItem` per bulleted table row."""
    markup = """<html>
<body>
<table>
<tbody>
<tr>
<td>•</td>
<td><p>Happy Groundhog's day!</p></td>
</tr>
<tr>
<td>•</td>
<td><p>Looks like six more weeks of winter ...</p></td>
</tr>
</tbody>
</table>
</body>
</html>"""
    table_elem = etree.fromstring(markup, etree.HTMLParser()).find(".//table")

    list_items = html._bulleted_text_from_table(table_elem)

    assert list_items == [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
789
def test_html_grabs_bulleted_text_in_tables():
    """A table whose rows pair a bullet cell with text should parse into `ListItem`s."""
    markup = """<html>
<body>
<table>
<tbody>
<tr>
<td>•</td>
<td><p>Happy Groundhog's day!</p></td>
</tr>
<tr>
<td>•</td>
<td><p>Looks like six more weeks of winter ...</p></td>
</tr>
</tbody>
</table>
</body>
</html>"""
    expected_items = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]

    document = HTMLDocument.from_string(markup)

    assert document.elements == expected_items
814
def test_raises_error_no_tag():
    """`TagsMixin` should raise `TypeError` when `tag` is None or omitted."""
    # -- first with an explicit `tag=None`, then with no arguments at all --
    for kwargs in ({"tag": None}, {}):
        with pytest.raises(TypeError):
            TagsMixin(**kwargs)
821
def test_raises_error_wrong_elements(monkeypatch, sample_doc):
    """`doc_after_cleaners()` should raise `ValueError` when a page holds a plain string."""
    bad_page = Page(0)
    bad_page.elements = ["this should def not be a string"]
    # -- swap the document's pages for the single malformed page --
    monkeypatch.setattr(sample_doc, "_pages", [bad_page])
    with pytest.raises(ValueError):
        sample_doc.doc_after_cleaners()
829
def test_filter_in_place():
    """Cleaning with `inplace=True` should change the elements of the document itself."""
    markup = """
<table><tbody><tr><td>A table thing.</td></tr></tbody></table>
<p>A non-table thing</p>
"""
    document = HTMLDocument.from_string(markup)
    # -- before cleaning: the table element and the paragraph --
    assert len(document.elements) == 2

    document.doc_after_cleaners(skip_table=True, inplace=True)

    # -- after cleaning: one element remains --
    assert len(document.elements) == 1
840
def test_joins_tag_text_correctly():
    """Text split by an inline `<i>` mid-word should be joined without added spaces."""
    document = HTMLDocument.from_string("<p>Hello again peet mag<i>ic</i>al</p>")
    first = document.elements[0]
    assert first.text == "Hello again peet magical"
847
def test_sample_doc_with_emoji():
    """The emoji in the paragraph should survive parsing in one of two accepted forms."""
    markup = """
<html charset="unicode">
<p>Hello again 😀</p>
</html>
"""
    document = HTMLDocument.from_string(markup)
    # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
    # and the byte string representation when running locally on mac
    accepted = ["Hello again ð\x9f\x98\x80", "Hello again 😀"]
    assert document.elements[0].text in accepted
859
def test_only_plain_text_in_body():
    """Bare text directly inside `<body>` should become the first element."""
    document = HTMLDocument.from_string("<body>Hello</body>")
    first = document.elements[0]
    assert first.text == "Hello"
865
def test_plain_text_before_anything_in_body():
    """Bare body text before a `<p>` should come first, followed by the paragraph text."""
    document = HTMLDocument.from_string("<body>Hello<p>World</p></body>")
    elements = document.elements
    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
872
def test_line_break_in_container():
    """A `<br/>` inside a `<div>` should split its text into two elements."""
    document = HTMLDocument.from_string("<div>Hello<br/>World</div>")
    elements = document.elements
    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
879
@pytest.mark.parametrize("tag", TEXT_TAGS)
def test_line_break_in_text_tag(tag):
    """A `<br/>` inside any text tag should split its text into two elements."""
    markup = f"<{tag}>Hello<br/>World</{tag}>"
    elements = HTMLDocument.from_string(markup).elements
    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
887
888# -- unit-level tests ----------------------------------------------------------------------------
889
890
class Describe_parse_HTMLTable_from_table_elem:
    """Unit-test suite for `unstructured.documents.html._parse_HTMLTable_from_table_elem`."""

    def it_produces_one_cell_for_each_original_table_cell(self):
        """A two-cell row yields two cells in `.text_as_html` and joined text in `.text`."""
        table_html = (
            # -- include formatting whitespace to make sure it is removed --
            "<table>\n"
            " <tr>\n"
            " <td>foo</td>\n"
            " <td>bar</td>\n"
            " </tr>\n"
            "</table>"
        )
        table_elem = lxml_html.fromstring(table_html)  # pyright: ignore[reportUnknownMemberType]

        html_table = _parse_HTMLTable_from_table_elem(table_elem)

        assert isinstance(html_table, HTMLTable)
        assert html_table.text == "foo bar"
        assert html_table.text_as_html == "<table><tr><td>foo</td><td>bar</td></tr></table>"

    def it_accommodates_tds_with_child_elements(self):
        """Like this example from an SEC 10k filing."""
        table_html = (
            "<table>\n"
            " <tr>\n"
            " <td></td>\n"
            " <td></td>\n"
            " </tr>\n"
            " <tr>\n"
            " <td>\n"
            " <p>\n"
            " <span>\n"
            ' <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a"'
            ' contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport"'
            ' format="ixt-sec:boolballotbox">\n'
            " <span>☒</span>\n"
            " </ix:nonNumeric>\n"
            " </span>\n"
            " </p>\n"
            " </td>\n"
            " <td>\n"
            " <p>\n"
            " <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
            " ACT OF 1934</span>\n"
            " </p>\n"
            " </td>\n"
            " </tr>\n"
            "</table>\n"
        )
        table_elem = lxml_html.fromstring(table_html)  # pyright: ignore[reportUnknownMemberType]

        html_table = _parse_HTMLTable_from_table_elem(table_elem)

        assert isinstance(html_table, HTMLTable)
        assert html_table.text == (
            "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
        )
        # -- the leftover debug `print` of `.text_as_html` was removed; on failure the assert
        # -- below already shows the actual value
        assert html_table.text_as_html == (
            "<table>"
            "<tr><td></td><td></td></tr>"
            "<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
            " EXCHANGE ACT OF 1934</td></tr>"
            "</table>"
        )

    def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nested_table(self):
        """Recursively ..."""
        nested_table_html = (
            "<table>\n"
            " <tr>\n"
            " <td>\n"
            " <table>\n"
            " <tr><td>foo</td><td>bar</td></tr>\n"
            " <tr><td>baz</td><td>bng</td></tr>\n"
            " </table>\n"
            " </td>\n"
            " <td>\n"
            " <table>\n"
            " <tr><td>fizz</td><td>bang</td></tr>\n"
            " </table>\n"
            " </td>\n"
            " </tr>\n"
            "</table>"
        )
        table_elem = lxml_html.fromstring(  # pyright: ignore[reportUnknownMemberType]
            nested_table_html
        )

        html_table = _parse_HTMLTable_from_table_elem(table_elem)

        assert isinstance(html_table, HTMLTable)
        assert html_table.text == "foo bar baz bng fizz bang"
        assert html_table.text_as_html == (
            "<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>"
        )
989
990# -- module-level fixtures -----------------------------------------------------------------------
991
992
@pytest.fixture()
def sample_doc():
    """Provide a two-page `HTMLDocument` assembled from hand-built elements.

    Page 0 holds a title whose ancestor tags are table tags, followed by narrative text.
    Page 1 holds a `header`-tagged title, body narrative text, and a `footer`-tagged title.
    """
    first_page = Page(0)
    first_page.elements = [
        HTMLTitle(
            "I'm a title in a table.",
            tag="p",
            ancestortags=("table", "tbody", "tr", "td"),
        ),
        HTMLNarrativeText("I'm some narrative text", tag="p", ancestortags=()),
    ]

    second_page = Page(1)
    second_page.elements = [
        HTMLTitle("I'm a header", tag="header", ancestortags=()),
        HTMLNarrativeText("Body text", tag="p", ancestortags=()),
        HTMLTitle("I'm a footer", tag="footer", ancestortags=()),
    ]

    return HTMLDocument.from_pages([first_page, second_page])