unstructured

Форк
0
1009 строк · 30.7 Кб
1
# pyright: reportPrivateUsage=false
2

3
import os
4
import pathlib
5
from typing import Dict, List
6

7
import pytest
8
from lxml import etree
9
from lxml import html as lxml_html
10

11
from unstructured.documents import html
12
from unstructured.documents.base import Page
13
from unstructured.documents.elements import (
14
    Address,
15
    ListItem,
16
    NarrativeText,
17
    Table,
18
    Text,
19
    Title,
20
)
21
from unstructured.documents.html import (
22
    HEADING_TAGS,
23
    LIST_ITEM_TAGS,
24
    SECTION_TAGS,
25
    TABLE_TAGS,
26
    TEXT_TAGS,
27
    HTMLDocument,
28
    HTMLNarrativeText,
29
    HTMLTable,
30
    HTMLTitle,
31
    TagsMixin,
32
    _parse_HTMLTable_from_table_elem,
33
)
34

35
# -- absolute path of the directory that contains this test module --
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# -- Tag names written as "<tag>" markup and reduced to bare names ("a", "abbr", ...). Splitting
# -- on "<" leaves an empty string before the first tag, hence the `[1:]`; each remaining chunk
# -- still carries its closing ">", which is stripped.
TAGS = [
    chunk.rstrip(">")
    for chunk in (
        "<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
        "<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
        "<colgroup><data><datalist><dd><del><details><dfn><dialog><dir><div><dl><dt><em><embed>"
        "<fieldset><figcaption><figure><font><footer><form><frame><frameset><h1><h2><h3><h4><h5>"
        "<h6><head><header><hr><html><i><iframe><img><input><ins><kbd><label><legend><li><link>"
        "<main><map><mark><meta><meter><nav><noframes><noscript><object><ol><optgroup><option>"
        "<output><p><param><picture><pre><progress><q><rp><rt><ruby><s><samp><script><section>"
        "<select><small><source><span><strike><strong><style><sub><summary><sup><table><tbody><td>"
        "<template><textarea><tfoot><th><thead><time><title><tr><track><tt><u><ul><var><video><wbr>"
    ).split("<")[1:]
]

# -- the tags this module lists as void elements, parsed the same way as `TAGS` --
VOID_TAGS = [
    chunk.rstrip(">")
    for chunk in (
        "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
    ).split("<")[1:]
]
58

59
# -- concatenation of the text, heading, list-item and section tag lists --
INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS
# -- tags kept out of `EXCLUDED_TAGS`: the included tags, table tags, void tags and the
# -- html/head/body skeleton
_NOT_EXCLUDED_TAGS = INCLUDED_TAGS + TABLE_TAGS + VOID_TAGS + ["html", "head", "body"]
# -- every remaining entry of `TAGS`, in `TAGS` order --
EXCLUDED_TAGS = [tag for tag in TAGS if tag not in _NOT_EXCLUDED_TAGS]
65

66

67
# -- table-extraction behaviors ------------------------------------------------------------------
68

69

70
def test_it_can_parse_a_bare_bones_table_to_an_HTMLTable_element():
    """A table with only `<tr>`/`<td>` children (no `<thead>`, `<tbody>`, `<tfoot>`) parses."""
    markup = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
        "    <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )

    elements = HTMLDocument.from_string(markup).elements

    # -- unpacking fails unless the document produced exactly one element --
    [table] = elements
    assert isinstance(table, HTMLTable)
    # -- `.text` is one flat string: cell texts joined by spaces, in-cell newline retained --
    assert table.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
    # -- `.text_as_html` keeps the row/cell structure; the in-cell newline appears as `<br/>` --
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "</table>"
    )
    assert table.text_as_html == expected_html
98

99

100
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements():
    """Rows inside `<thead>`, `<tbody>` and `<tfoot>` all show up in `.text_as_html`.

    The expected HTML below contains only `<tr>`/`<td>` elements: the section wrappers are gone
    and every `<th>` cell of the input is rendered as `<td>`.
    """
    markup = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <thead>\n"
        "      <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        "    </thead>\n"
        "    <tbody>\n"
        "      <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        "      <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        "    </tbody>\n"
        "    <tfoot>\n"
        "      <tr><th>Dolor</th><td>Equis</td></tr>\n"
        "    </tfoot>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )

    elements = HTMLDocument.from_string(markup).elements

    # -- exactly one element is expected --
    [table] = elements
    assert isinstance(table, HTMLTable)
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )
    assert table.text_as_html == expected_html
137

138

139
def test_it_does_not_emit_an_HTMLTable_element_for_a_table_with_no_text():
    """A table whose cells hold only a space yields an empty element list."""
    markup = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <tr><td> </td><td> </td></tr>\n"
        "    <tr><td> </td><td> </td></tr>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )

    parsed = HTMLDocument.from_string(markup)

    assert parsed.elements == []
154

155

156
def test_it_does_not_consider_an_empty_table_a_bulleted_text_table():
    """`_is_bulleted_table()` is False for a table whose cells hold only a space."""
    markup = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <tr><td> </td><td> </td></tr>\n"
        "    <tr><td> </td><td> </td></tr>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )
    tree = HTMLDocument.from_string(markup).document_tree
    assert tree is not None
    table_elem = tree.find(".//table")
    assert table_elem is not None

    is_bulleted = html._is_bulleted_table(table_elem)

    assert is_bulleted is False
174

175

176
def test_it_provides_parseable_HTML_in_text_as_html():
    """`.text_as_html` can be fed to lxml's HTML parser and serialises back as expected."""
    html_str = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <thead>\n"
        "      <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        "    </thead>\n"
        "    <tbody>\n"
        "      <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        "      <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        "    </tbody>\n"
        "    <tfoot>\n"
        "      <tr><th>Dolor</th><td>Equis</td></tr>\n"
        "    </tfoot>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )
    html_document = HTMLDocument.from_string(html_str)
    # -- exactly one element is expected --
    (element,) = html_document.elements
    assert isinstance(element, HTMLTable)
    text_as_html = element.text_as_html
    assert text_as_html is not None

    # -- named `root` rather than `html` so the imported `html` module is not shadowed --
    root = etree.fromstring(text_as_html, etree.HTMLParser())

    assert root is not None
    # -- lxml adds the <html><body> container, that's not present in `.text_as_html` --
    assert etree.tostring(root, encoding=str) == (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )
215

216

217
# -- element-suppression behaviors ---------------------------------------------------------------
218

219

220
def test_it_does_not_extract_text_in_script_tags():
    """No element of the `example-with-scripts.html` document contains the text "function ("."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
    parsed = HTMLDocument.from_file(filename=path)
    assert not any("function (" in element.text for element in parsed.elements)
224

225

226
def test_it_does_not_extract_text_in_style_tags():
    """CSS inside a `<style>` child of a paragraph is left out of the paragraph's text."""
    markup = (
        "<html>\n"
        "<body>\n"
        "  <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
        "</body>\n"
        "</html>"
    )

    elements = HTMLDocument.from_string(markup).elements

    # -- exactly one element is expected --
    [paragraph] = elements
    assert isinstance(paragraph, Text)
    assert paragraph.text == "Lorem ipsum dolor"
240

241

242
# ------------------------------------------------------------------------------------------------
243

244

245
def test_parses_tags_correctly():
    """The first element's ancestor tags plus its own tag read `html`, `body`, `table`."""
    markup = """<html>
    <body>
        <table>
            <tbody>
                <tr>
                    <td><p>Hi there!</p></td>
                </tr>
            </tbody>
        </table>
    </body>
</html>"""
    first = HTMLDocument.from_string(markup).elements[0]
    tag_path = first.ancestortags + (first.tag,)
    assert tag_path == ("html", "body", "table")
260

261

262
def test_has_table_ancestor():
    """`has_table_ancestor()` is truthy when `table` is among the element's ancestor tags."""
    ancestors = ["html", "body", "table", "tr"]
    cell_title = HTMLTitle("I am a Title", tag="td", ancestortags=ancestors)
    assert html.has_table_ancestor(cell_title)
269

270

271
def test_has_no_table_ancestor():
    """`has_table_ancestor()` is falsy when the ancestor tags are only `html` and `body`."""
    ancestors = ["html", "body"]
    paragraph_title = HTMLTitle("I am a Title", tag="p", ancestortags=ancestors)
    assert not html.has_table_ancestor(paragraph_title)
278

279

280
def test_read_without_skipping_table(monkeypatch):
    """With `skip_table=False` the table survives cleaning as the first element of page 0.

    `is_possible_narrative_text` is patched on the `html` module to always answer True.
    """
    monkeypatch.setattr(html, "is_possible_narrative_text", lambda *args: True)
    markup = """<html>
    <body>
        <table>
            <tbody>
                <tr>
                    <td><p>Hi there! I am Matt!</p></td>
                </tr>
            </tbody>
        </table>
    </body>
</html>"""
    cleaned = HTMLDocument.from_string(markup).doc_after_cleaners(skip_table=False)
    first_element = cleaned.pages[0].elements[0]
    assert first_element == Table(text="Hi there! I am Matt!")
295

296

297
@pytest.mark.parametrize(
    ("doc", "expected"),
    [
        (
            "<p>Hi there <span>my name is</span> <b><i>Matt</i></i></p>",
            "Hi there my name is Matt",
        ),
        ("<p>I have a</p> tail", "I have a tail"),
    ],
)
def test_construct_text(doc, expected):
    """`_construct_text()` on the first `<p>` of `doc` produces `expected`."""
    tree = etree.fromstring(doc, etree.HTMLParser())
    paragraph = tree.find(".//p")
    assert html._construct_text(paragraph) == expected
312

313

314
@pytest.mark.parametrize(
    ("doc", "root", "expected"),
    [
        (
            "<p>Hello <strong>there</strong> I <em>am</em> a <b>very</b> <i>important</i> text</p>",
            "p",
            [
                {"text": "there", "tag": "strong"},
                {"text": "am", "tag": "em"},
                {"text": "very", "tag": "b"},
                {"text": "important", "tag": "i"},
            ],
        ),
        (
            "<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>",
            "p",
            [
                {"text": "list", "tag": "span"},
                {"text": "my favorite things", "tag": "b"},
                {"text": "favorite", "tag": "i"},
            ],
        ),
        (
            "<strong>A lone strong text!</strong>",
            "strong",
            [{"text": "A lone strong text!", "tag": "strong"}],
        ),
        ("<span>I have a</span> tail", "span", [{"text": "I have a", "tag": "span"}]),
        ("<p>Text with no emphasized runs</p> ", "p", []),
    ],
)
def test_get_emphasized_texts_from_tag(doc: str, root: str, expected: List[Dict[str, str]]):
    """`_get_emphasized_texts_from_tag()` on the first `root` element returns `expected`.

    Each expected entry is a dict with a "text" and a "tag" key.
    """
    tree = etree.fromstring(doc, etree.HTMLParser())
    root_elem = tree.find(f".//{root}")
    assert root_elem is not None

    actual = html._get_emphasized_texts_from_tag(root_elem)

    assert actual == expected
353

354

355
def test_parse_nothing():
    """`_parse_tag()` returns None for an empty paragraph."""
    markup = """<p></p>"""
    paragraph = etree.fromstring(markup, etree.HTMLParser()).find(".//p")
    assert html._parse_tag(paragraph) is None
361

362

363
def test_read_with_existing_pages():
    """A document built with `from_pages()` reports the pages it was given."""
    only_page = Page(number=0)
    document = HTMLDocument.from_pages([only_page])
    assert document.pages == [only_page]
367

368

369
def test_parse_not_anything(monkeypatch):
    """`_parse_tag()` gives a plain `Text` when the narrative and title checks both say no.

    `is_narrative_tag` and `is_possible_title` are patched on the `html` module to return False.
    """
    for predicate_name in ("is_narrative_tag", "is_possible_title"):
        monkeypatch.setattr(html, predicate_name, lambda *args: False)
    markup = """<p>This is nothing</p>"""
    paragraph = etree.fromstring(markup, etree.HTMLParser()).find(".//p")
    assert html._parse_tag(paragraph) == Text(text="This is nothing")
377

378

379
def test_parse_bullets():
    """`_parse_tag()` turns a paragraph starting with a bullet character into a `ListItem`.

    The expected item text has the leading "●" and the space after it removed.
    """
    # -- the `monkeypatch` fixture was requested here but never used, so it is dropped --
    doc = """<p>● An excellent point!</p>"""
    document_tree = etree.fromstring(doc, etree.HTMLParser())
    el = document_tree.find(".//p")
    parsed_el = html._parse_tag(el)
    assert parsed_el == ListItem("An excellent point!")
385

386

387
def test_parse_tag_ignores_lonely_bullets():
    """`_parse_tag()` returns None for a paragraph holding only a bullet character."""
    markup = """<p>●</p>"""
    paragraph = etree.fromstring(markup, etree.HTMLParser()).find(".//p")
    assert html._parse_tag(paragraph) is None
393

394

395
def test_parse_tag_ignores_stubs():
    """`_parse_tag()` returns None for a paragraph whose whole text is "$"."""
    markup = """<p>$</p>"""
    paragraph = etree.fromstring(markup, etree.HTMLParser()).find(".//p")
    assert html._parse_tag(paragraph) is None
401

402

403
def test_adjacent_spans_are_text_tags():
    """`_is_text_tag()` is True for a `<div>` whose children are two adjacent `<span>`s."""
    markup = """<div><span>&#8226;</span><span>A bullet!</span></div>"""
    div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    assert html._is_text_tag(div) is True
408

409

410
def test_process_list_item_gets_next_section():
    """A bullets-only `<div>` takes its list-item text from the `<div>` that follows it."""
    markup = """
    <div>
        <p>●</p>
        <p>●</p>
    </div>
    <div>
        <p>An excellent point!</p>
    </div>

    """
    first_div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    # -- the second item of the returned pair is not checked by this test --
    list_item, _ = html._process_list_item(first_div, max_predecessor_len=10)
    assert list_item == ListItem(text="An excellent point!")
425

426

427
def test_get_bullet_descendants():
    """`_get_bullet_descendants()` returns one item for the two `<div>`s built below."""
    parser_input_1 = "<div><p>●</p><p>●</p></div>"
    bullets_div = etree.fromstring(parser_input_1, etree.HTMLParser()).find(".//div")

    parser_input_2 = "<div><p>An excellent point!</p></div>"
    text_div = etree.fromstring(parser_input_2, etree.HTMLParser()).find(".//div")

    found = html._get_bullet_descendants(bullets_div, text_div)
    assert len(found) == 1
438

439

440
def test_process_list_item_returns_none_if_next_blank():
    """A bullets-only `<div>` with nothing after it produces no list item."""
    markup = """
    <div>
        <p>●</p>
        <p>●</p>
    </div>

    """
    only_div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    # -- the second item of the returned pair is not checked by this test --
    list_item, _ = html._process_list_item(only_div)
    assert list_item is None
452

453

454
def test_process_list_item_returns_none_if_next_has_no_text():
    """A bullets-only `<div>` followed by a whitespace-only `<div>` produces no list item."""
    markup = """
    <div>
        <p>●</p>
        <p>●</p>
    </div>
    <div>
    </div>
    """
    first_div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    # -- the bullets-only div is itself recognised as a list-item tag --
    assert html.is_list_item_tag(first_div) is True
    list_item, _ = html._process_list_item(first_div)
    assert list_item is None
468

469

470
def test_process_list_item_ignores_deep_divs():
    """With `max_predecessor_len=2`, a `<div>` of five bullet paragraphs yields no list item."""
    markup = """
    <div>
        <p>●</p>
        <p>●</p>
        <p>●</p>
        <p>●</p>
        <p>●</p>
    </div>
    <div>
        <p>An excellent point!</p>
    </div>

    """
    first_div = etree.fromstring(markup, etree.HTMLParser()).find(".//div")
    # -- the second item of the returned pair is not checked by this test --
    list_item, _ = html._process_list_item(first_div, max_predecessor_len=2)
    assert list_item is None
488

489

490
def test_read_html_doc(tmpdir):
    """A document read from file and cleaned yields the expected two pages of elements.

    The document is written into the per-test temporary directory, read back with
    `HTMLDocument.from_file()`, and cleaned with `skip_headers_and_footers=True` and
    `skip_table=True`. The assertions pin the exact elements of each of the two resulting pages.
    """
    TITLE1 = "A Great and Glorious Section"
    SECTION1 = "Dear Leader is the best. He is such a wonderful engineer!"
    TITLE2 = "Another Magnificent Title"
    SECTION2 = "The last section is a title because of its capitalization patterns!"
    TABLE_SECTION = "Skip me because I'm in a table"
    TITLE3 = "A New Beginning"
    SECTION3 = "Here is the start of a new page."

    doc = f"""<html>
    <body>
        <header>
            <p>Here is a header. We want to ignore anything that is in this section.</p>
        </header>
        <h1>{TITLE1}</h1>
        <p>{SECTION1}</p>
        <p></p>
        <p>{TITLE2}</p>
        <p><b>{SECTION2}</b></p>
        <table>
            <tbody>
                <tr>
                    <td><p>{TABLE_SECTION}</p></td>
                </tr>
            </tbody>
        </table>
        <hr>
        <h2>{TITLE3}</h2>
        <div>{SECTION3}</div>
        <footer>
            <p>Here is a footer. We want to ignore anything that is in this section</p>
        </footer>
        <div>
            <p>Let's ignore anything after the footer too since it's probably garbage.</p>
        </div>
    </body>
</html>"""
    # -- write inside the per-test directory itself; `tmpdir.dirname` is its parent, which other
    # -- tests of the same session may also write to
    filename = os.path.join(str(tmpdir), "sample-doc.html")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(doc)

    html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
        skip_headers_and_footers=True,
        skip_table=True,
    )

    assert len(html_document.pages) == 2

    # -- page one: header and table content are absent --
    page_one = html_document.pages[0]
    assert len(page_one.elements) == 4
    assert page_one.elements == [
        Title(text=TITLE1),
        NarrativeText(text=SECTION1),
        Title(text=TITLE2),
        NarrativeText(text=SECTION2),
    ]

    # -- page two: the footer and the div after it are absent --
    page_two = html_document.pages[1]
    assert len(page_two.elements) == 2
    assert page_two.elements == [
        Title(text=TITLE3),
        NarrativeText(text=SECTION3),
    ]

    pages = html_document.pages
    assert all(isinstance(page, Page) for page in pages)
559

560

561
def test_find_main():
    """`_find_main()` returns the `<main>` element when the document has one."""
    markup = """<header></header>
    <body>
        <p>Lots preamble stuff yada yada yada</p>
        <main>
            <article>
                <section>
                    <h2>A Wonderful Section!</h2>
                    <p>Look at this amazing section!</p>
                </section>
                <section>
                    <h2>Another Wonderful Section!</h2>
                    <p>Look at this other amazing section!</p>
                </section>
            </article>
        </main>
    </body>"""
    tree = HTMLDocument.from_string(markup).document_tree
    found = html._find_main(tree)
    assert found.tag == "main"
582

583

584
def test_find_main_returns_doc_when_main_not_present():
    """Without a `<main>` element, `_find_main()` returns an element tagged `html`."""
    markup = """<header></header>
    <body>
    <p>Lots preamble stuff yada yada yada</p>
        <article>
            <section>
                <h2>A Wonderful Section!</h2>
                <p>Look at this amazing section!</p>
            </section>
            <section>
                <h2>Another Wonderful Section!</h2>
                <p>Look at this other amazing section!</p>
            </section>
        </article>
    </body>"""
    tree = HTMLDocument.from_string(markup).document_tree
    found = html._find_main(tree)
    assert found.tag == "html"
603

604

605
def test_find_articles():
    """`_find_articles()` returns two items for a document with two `<article>` elements."""
    markup = """<header></header>
    <body>
    <p>Lots preamble stuff yada yada yada</p>
        <article>
            <h2>A Wonderful Section!</h2>
            <p>Look at this amazing section!</p>
        </article>
        <article>
            <h2>Another Wonderful Section!</h2>
            <p>Look at this other amazing section!</p>
        </article>
    </body>"""
    tree = HTMLDocument.from_string(markup).document_tree
    found = html._find_articles(tree)
    assert len(found) == 2
622

623

624
def test_find_articles_returns_doc_when_none_present():
    """`_find_articles()` returns a single item when the document has no `<article>`."""
    markup = """<header></header>
    <body>
    <p>Lots preamble stuff yada yada yada</p>
        <section>
            <h2>A Wonderful Section!</h2>
            <p>Look at this amazing section!</p>
        </section>
        <section>
            <h2>Another Wonderful Section!</h2>
            <p>Look at this other amazing section!</p>
        </section>
    </body>"""
    tree = HTMLDocument.from_string(markup).document_tree
    found = html._find_articles(tree)
    assert len(found) == 1
641

642

643
def test_include_headers_and_footers(sample_doc):
    """With `skip_headers_and_footers=False`, page 1 of the sample document has 3 elements."""
    cleaned = sample_doc.doc_after_cleaners(skip_headers_and_footers=False)
    second_page = cleaned.pages[1]
    assert len(second_page.elements) == 3
646

647

648
def test_include_table_text(sample_doc):
    """With `skip_table=False`, page 0 of the sample document has 2 elements."""
    cleaned = sample_doc.doc_after_cleaners(skip_table=False)
    first_page = cleaned.pages[0]
    assert len(first_page.elements) == 2
651

652

653
@pytest.mark.parametrize("tag", [tag for tag in TEXT_TAGS if tag not in TABLE_TAGS])
def test_tag_types(tag):
    """Text inside each non-table text tag yields exactly one element on the first page."""
    markup = f"""
    <body>
        <{tag}>
            There is some text here.
        </{tag}>
    </body>
    """
    first_page = HTMLDocument.from_string(markup).pages[0]
    assert len(first_page.elements) == 1
664

665

666
@pytest.mark.parametrize("tag", EXCLUDED_TAGS)
def test_exclude_tag_types(tag):
    """Text inside each tag of `EXCLUDED_TAGS` produces a document with no pages."""
    markup = f"""
    <body>
        <{tag}>
            There is some text here.
        </{tag}>
    </body>
    """
    parsed = HTMLDocument.from_string(markup)
    assert len(parsed.pages) == 0
677

678

679
def test_tag_types_table(sample_doc):
    """With `skip_table=True`, page 0 of the sample document is asserted to have 2 elements."""
    cleaned = sample_doc.doc_after_cleaners(skip_table=True)
    first_page = cleaned.pages[0]
    assert len(first_page.elements) == 2
682

683

684
def test_nested_text_tags():
    """Text nested in two text tags yields one element, not one per nesting level.

    The outer and inner tags are the first two entries of `TEXT_TAGS` that are not table tags.
    """
    non_table_text_tags = [tag for tag in TEXT_TAGS if tag not in TABLE_TAGS]
    tag1, tag2 = non_table_text_tags[:2]
    markup = f"""
    <body>
        <{tag1}>
            <{tag2}>
                There is some text here.
            </{tag2}>
        </{tag1}>
    </body>
    """
    cleaned = HTMLDocument.from_string(markup).doc_after_cleaners(skip_table=False)
    first_page = cleaned.pages[0]
    assert len(first_page.elements) == 1
697

698

699
def test_containers_with_text_are_processed():
    """Text placed directly inside nested `<div>` containers is emitted as elements."""
    markup = """<div dir=3D"ltr">Hi All,<div><br></div>
   <div>Get excited for our first annual family day!</div>
   <div>Best.<br clear=3D"all">
      <div><br></div>
      -- <br>
      <div dir=3D"ltr">
         <div dir=3D"ltr">Dino the Datasaur<div>Unstructured Technologies<br><div>Data Scientist
                </div>
                <div>Doylestown, PA 18901</div>
               <div><br></div>
            </div>
         </div>
      </div>
   </div>
</div>"""
    parsed = HTMLDocument.from_string(markup)

    # -- expected elements in document order; note the "-- " line has no counterpart here --
    expected_elements = [
        Text(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Title(text="Dino the Datasaur"),
        Title(text="Unstructured Technologies"),
        Title(text="Data Scientist"),
        Address(text="Doylestown, PA 18901"),
    ]
    assert parsed.elements == expected_elements
726

727

728
def test_html_grabs_bulleted_text_in_tags():
    """Each `<li>` of an `<ol>` becomes a `ListItem`."""
    markup = """<html>
    <body>
        <ol>
            <li>Happy Groundhog's day!</li>
            <li>Looks like six more weeks of winter ...</li>
        </ol>
    </body>
</html>"""
    parsed = HTMLDocument.from_string(markup)

    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    assert parsed.elements == expected_elements
743

744

745
def test_html_grabs_bulleted_text_in_paras():
    """Paragraphs whose text starts with a bullet entity become `ListItem`s without the bullet."""
    markup = """<html>
    <body>
        <p>
            <span>&#8226; Happy Groundhog's day!</span>
        </p>
        <p>
            <span>&#8226; Looks like six more weeks of winter ...</span>
        </p>
    </body>
</html>"""
    parsed = HTMLDocument.from_string(markup)

    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    assert parsed.elements == expected_elements
762

763

764
def test_bulletized_bulleted_text_from_table():
    """`_bulleted_text_from_table()` returns one `ListItem` per bullet-plus-text table row."""
    markup = """<html>
    <body>
        <table>
            <tbody>
                <tr>
                    <td>•</td>
                    <td><p>Happy Groundhog's day!</p></td>
                </tr>
                <tr>
                    <td>•</td>
                    <td><p>Looks like six more weeks of winter ...</p></td>
                </tr>
            </tbody>
        </table>
    </body>
</html>"""
    table_elem = etree.fromstring(markup, etree.HTMLParser()).find(".//table")
    list_items = html._bulleted_text_from_table(table_elem)
    expected_items = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    assert list_items == expected_items
788

789

790
def test_html_grabs_bulleted_text_in_tables():
    """A table of bullet-plus-text rows parses to `ListItem`s rather than a table element."""
    markup = """<html>
    <body>
        <table>
            <tbody>
                <tr>
                    <td>&#8226;</td>
                    <td><p>Happy Groundhog's day!</p></td>
                </tr>
                <tr>
                    <td>&#8226;</td>
                    <td><p>Looks like six more weeks of winter ...</p></td>
                </tr>
            </tbody>
        </table>
    </body>
</html>"""
    parsed = HTMLDocument.from_string(markup)

    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    assert parsed.elements == expected_elements
813

814

815
def test_raises_error_no_tag():
    """`TagsMixin` raises `TypeError` both for `tag=None` and when `tag` is omitted."""
    # -- first call passes `tag=None`, second passes no arguments at all --
    for kwargs in ({"tag": None}, {}):
        with pytest.raises(TypeError):
            TagsMixin(**kwargs)
820

821

822
def test_raises_error_wrong_elements(monkeypatch, sample_doc):
    """`doc_after_cleaners()` raises `ValueError` when a page holds a plain string."""
    bad_page = Page(0)
    bad_page.elements = ["this should def not be a string"]
    # -- swap the document's pages for the single malformed page --
    monkeypatch.setattr(sample_doc, "_pages", [bad_page])
    with pytest.raises(ValueError):
        sample_doc.doc_after_cleaners()
828

829

830
def test_filter_in_place():
    """`doc_after_cleaners(inplace=True)` changes the element count of the document itself."""
    markup = """
    <table><tbody><tr><td>A table thing.</td></tr></tbody></table>
    <p>A non-table thing</p>
    """
    document = HTMLDocument.from_string(markup)
    # -- before cleaning: two elements --
    assert len(document.elements) == 2
    document.doc_after_cleaners(skip_table=True, inplace=True)
    # -- after cleaning with `skip_table=True`: one element --
    assert len(document.elements) == 1
839

840

841
def test_joins_tag_text_correctly():
    """An inline tag in the middle of a word does not introduce a space into the text."""
    markup = "<p>Hello again peet mag<i>ic</i>al</p>"
    first = HTMLDocument.from_string(markup).elements[0]
    assert first.text == "Hello again peet magical"
846

847

848
def test_sample_doc_with_emoji():
    """Text containing an emoji is extracted in one of the two accepted forms."""
    markup = """
    <html charset="unicode">
        <p>Hello again 😀</p>
    </html>
    """
    parsed = HTMLDocument.from_string(markup)
    # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
    # and the byte string representation when running locally on mac
    accepted_texts = ["Hello again ð\x9f\x98\x80", "Hello again 😀"]
    assert parsed.elements[0].text in accepted_texts
858

859

860
def test_only_plain_text_in_body():
    """Bare text directly inside `<body>` becomes the first element."""
    markup = "<body>Hello</body>"
    elements = HTMLDocument.from_string(markup).elements
    assert elements[0].text == "Hello"
864

865

866
def test_plain_text_before_anything_in_body():
    """Bare body text comes first, followed by the text of the paragraph after it."""
    markup = "<body>Hello<p>World</p></body>"
    parsed = HTMLDocument.from_string(markup)
    first_text = parsed.elements[0].text
    assert first_text == "Hello"
    second_text = parsed.elements[1].text
    assert second_text == "World"
871

872

873
def test_line_break_in_container():
    """A `<br/>` inside a `<div>` splits its text into two elements."""
    markup = "<div>Hello<br/>World</div>"
    parsed = HTMLDocument.from_string(markup)
    first_text = parsed.elements[0].text
    assert first_text == "Hello"
    second_text = parsed.elements[1].text
    assert second_text == "World"
878

879

880
@pytest.mark.parametrize("tag", TEXT_TAGS)
def test_line_break_in_text_tag(tag):
    """A `<br/>` inside any tag of `TEXT_TAGS` splits its text into two elements."""
    markup = f"<{tag}>Hello<br/>World</{tag}>"
    parsed = HTMLDocument.from_string(markup)
    first_text = parsed.elements[0].text
    assert first_text == "Hello"
    second_text = parsed.elements[1].text
    assert second_text == "World"
886

887

888
# -- unit-level tests ----------------------------------------------------------------------------
889

890

891
class Describe_parse_HTMLTable_from_table_elem:
    """Unit-test suite for `unstructured.documents.html._parse_HTMLTable_from_table_elem`."""

    def it_produces_one_cell_for_each_original_table_cell(self):
        """A one-row, two-cell table maps to two `<td>`s and space-joined text."""
        table_html = (
            # -- include formatting whitespace to make sure it is removed --
            "<table>\n"
            "  <tr>\n"
            "    <td>foo</td>\n"
            "    <td>bar</td>\n"
            "  </tr>\n"
            "</table>"
        )
        table_elem = lxml_html.fromstring(table_html)  # pyright: ignore[reportUnknownMemberType]

        html_table = _parse_HTMLTable_from_table_elem(table_elem)

        assert isinstance(html_table, HTMLTable)
        assert html_table.text == "foo bar"
        assert html_table.text_as_html == "<table><tr><td>foo</td><td>bar</td></tr></table>"

    def it_accommodates_tds_with_child_elements(self):
        """Like this example from an SEC 10k filing.

        Cells whose text sits inside nested `<p>`/`<span>`/`<ix:nonNumeric>` children are reduced
        to that text; empty cells stay as empty `<td>`s.
        """
        table_html = (
            "<table>\n"
            " <tr>\n"
            "  <td></td>\n"
            "  <td></td>\n"
            " </tr>\n"
            " <tr>\n"
            "  <td>\n"
            "   <p>\n"
            "    <span>\n"
            '     <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a"'
            ' contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport"'
            ' format="ixt-sec:boolballotbox">\n'
            "     <span>&#9746;</span>\n"
            "     </ix:nonNumeric>\n"
            "    </span>\n"
            "   </p>\n"
            "  </td>\n"
            "  <td>\n"
            "   <p>\n"
            "    <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
            " ACT OF 1934</span>\n"
            "   </p>\n"
            "  </td>\n"
            " </tr>\n"
            "</table>\n"
        )
        table_elem = lxml_html.fromstring(table_html)  # pyright: ignore[reportUnknownMemberType]

        html_table = _parse_HTMLTable_from_table_elem(table_elem)

        assert isinstance(html_table, HTMLTable)
        assert html_table.text == (
            "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
        )
        # -- a leftover debug `print` of `.text_as_html` used to sit here; on failure pytest
        # -- already shows both sides of the comparison below
        assert html_table.text_as_html == (
            "<table>"
            "<tr><td></td><td></td></tr>"
            "<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
            " EXCHANGE ACT OF 1934</td></tr>"
            "</table>"
        )

    def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nested_table(self):
        """Each nested table collapses to its space-joined text inside the enclosing cell."""
        nested_table_html = (
            "<table>\n"
            " <tr>\n"
            "  <td>\n"
            "   <table>\n"
            "     <tr><td>foo</td><td>bar</td></tr>\n"
            "     <tr><td>baz</td><td>bng</td></tr>\n"
            "   </table>\n"
            "  </td>\n"
            "  <td>\n"
            "   <table>\n"
            "     <tr><td>fizz</td><td>bang</td></tr>\n"
            "   </table>\n"
            "  </td>\n"
            " </tr>\n"
            "</table>"
        )
        table_elem = lxml_html.fromstring(  # pyright: ignore[reportUnknownMemberType]
            nested_table_html
        )

        html_table = _parse_HTMLTable_from_table_elem(table_elem)

        assert isinstance(html_table, HTMLTable)
        assert html_table.text == "foo bar baz bng fizz bang"
        assert html_table.text_as_html == (
            "<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>"
        )
988

989

990
# -- module-level fixtures -----------------------------------------------------------------------
991

992

993
@pytest.fixture()
def sample_doc():
    """Two-page `HTMLDocument` assembled from hand-built elements via `from_pages()`.

    Page 0 holds a title whose ancestor tags include `table`, followed by a narrative-text
    element. Page 1 holds a `header`-tagged title, a narrative-text element and a
    `footer`-tagged title, in that order.
    """
    first_page = Page(0)
    first_page.elements = [
        HTMLTitle(
            "I'm a title in a table.",
            tag="p",
            ancestortags=("table", "tbody", "tr", "td"),
        ),
        HTMLNarrativeText("I'm some narrative text", tag="p", ancestortags=()),
    ]

    second_page = Page(1)
    second_page.elements = [
        HTMLTitle("I'm a header", tag="header", ancestortags=()),
        HTMLNarrativeText("Body text", tag="p", ancestortags=()),
        HTMLTitle("I'm a footer", tag="footer", ancestortags=()),
    ]

    return HTMLDocument.from_pages([first_page, second_page])
1010

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.