unstructured

Форк
0
/
test_html_partition.py 
693 строки · 23.9 Кб
1
import os
2
import pathlib
3
from unittest.mock import patch
4

5
import pytest
6
import requests
7
from requests.models import Response
8

9
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
10
from unstructured.chunking.title import chunk_by_title
11
from unstructured.cleaners.core import clean_extra_whitespace
12
from unstructured.documents.elements import EmailAddress, ListItem, NarrativeText, Table, Title
13
from unstructured.documents.html import HTMLTitle
14
from unstructured.partition.html import partition_html
15

16
# Resolved path of the directory containing this test module; the tests below join it with
# "..", "..", "example-docs" to locate fixture documents.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
17

18
# Elements the default-encoding tests compare against when partitioning "fake-html-lang-de.html".
EXPECTED_OUTPUT_LANGUAGE_DE = [
    Title("Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
]
21

22

23
def test_partition_html_from_filename():
    """Partitioning by filename yields elements, no page breaks, and file metadata."""
    docs_dir = os.path.join(DIRECTORY, "..", "..", "example-docs")
    elements = partition_html(filename=os.path.join(docs_dir, "example-10k.html"))

    assert len(elements) > 0
    assert all(elem.category != "PageBreak" for elem in elements)

    first_metadata = elements[0].metadata
    assert first_metadata.filename == "example-10k.html"
    assert first_metadata.file_directory == docs_dir
31

32

33
def test_partition_html_from_filename_returns_html_elements():
    """The first element produced for the 10-K example is an `HTMLTitle` instance."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    leading_element = elements[0]
    assert isinstance(leading_element, HTMLTitle)
39

40

41
def test_partition_html_from_filename_with_metadata_filename():
    """`metadata_filename` overrides the filename recorded on every element."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    elements = partition_html(filename=html_path, metadata_filename="test")

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "test"
47

48

49
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_filename_raises_encoding_error(filename, encoding, error):
    """Partitioning a UTF-16 document with a mismatched `encoding` raises `error`.

    NOTE(review): despite the name, the document is handed over as an open text-mode
    file object (`file=`), not as `filename=` -- confirm whether that is intended.
    """
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with pytest.raises(error):
        with open(html_path) as html_file:
            partition_html(file=html_file, encoding=encoding)
61

62

63
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_filename_default_encoding(filename):
    """Without an explicit encoding, each fixture partitions and records its own filename."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    assert all(element.metadata.filename == filename for element in elements)
    # Only the German fixture has a pinned expected output.
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
79

80

81
def test_partition_html_from_filename_metadata_false():
    """With `include_metadata=False`, no element's metadata dict is truthy."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    elements = partition_html(filename=html_path, include_metadata=False)
    assert not any(element.metadata.to_dict() for element in elements)
87

88

89
def test_partition_html_with_page_breaks():
    """`include_page_breaks=True` produces at least one "PageBreak" element."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    elements = partition_html(filename=html_path, include_page_breaks=True)

    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories
    assert len(elements) > 0
    assert all(element.metadata.filename == "example-10k.html" for element in elements)
96

97

98
def test_partition_html_from_file():
    """Partitioning an open file object leaves `metadata.filename` unset."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        elements = partition_html(file=html_file)

    assert len(elements) > 0
    assert all(element.metadata.filename is None for element in elements)
105

106

107
def test_partition_html_from_file_with_metadata_filename():
    """`metadata_filename` is applied to every element when partitioning a file object."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        elements = partition_html(file=html_file, metadata_filename="test")

    assert len(elements) > 0
    assert all(element.metadata.filename == "test" for element in elements)
114

115

116
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_file_raises_encoding_error(filename, encoding, error):
    """Partitioning a text-mode file object with a mismatched `encoding` raises `error`.

    Args:
        filename: Fixture name inside `example-docs`.
        encoding: Encoding passed to `partition_html`.
        error: Exception type the call is expected to raise.
    """
    # A single `pytest.raises` states the expectation. The previous nested
    # `pytest.raises(UnicodeEncodeError)` named the wrong exception type and only let the
    # expected error pass through to the outer check.
    with pytest.raises(error):
        filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
        with open(filename) as f:
            partition_html(file=f, encoding=encoding)
128

129

130
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_file_default_encoding(filename):
    """A text-mode file object partitions without an explicit encoding.

    Args:
        filename: Fixture name inside `example-docs`.
    """
    # Keep the bare fixture name intact: overwriting `filename` with the joined path made
    # the comparison below always false, so the German-output assertion never ran.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path) as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
145

146

147
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, error):
    """Partitioning a binary-mode file object with a mismatched `encoding` raises `error`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with pytest.raises(error):
        with open(html_path, "rb") as binary_file:
            partition_html(file=binary_file, encoding=encoding)
159

160

161
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_file_rb_default_encoding(filename):
    """A binary-mode file object partitions without an explicit encoding.

    Args:
        filename: Fixture name inside `example-docs`.
    """
    # Keep the bare fixture name intact: overwriting `filename` with the joined path made
    # the comparison below always false, so the German-output assertion never ran.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path, "rb") as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
176

177

178
def test_partition_html_from_text():
    """HTML supplied as a string via `text=` produces at least one element."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        html_source = html_file.read()

    elements = partition_html(text=html_source)
    assert len(elements) > 0
184

185

186
def test_partition_html_from_text_works_with_empty_string():
    """An empty HTML string partitions to an empty list."""
    elements = partition_html(text="")
    assert elements == []
188

189

190
class MockResponse:
    """Minimal stand-in for a response object returned by a patched `requests.get`.

    Exposes only the attributes set here: `text`, `status_code`, `ok` and `headers`.
    """

    def __init__(self, text, status_code, headers=None):
        """Store the response fields.

        Args:
            text: Response body.
            status_code: Numeric HTTP status code.
            headers: Optional mapping of response headers; a new empty dict is used
                when omitted.
        """
        self.text = text
        self.status_code = status_code
        # Any status below 300 is treated as success.
        self.ok = status_code < 300
        # A `{}` default in the signature would be one dict shared by every instance,
        # so mutating one mock's headers would leak into the others.
        self.headers = {} if headers is None else headers
196

197

198
def test_partition_html_from_url():
    """A 200 "text/html" response from the patched `requests.get` is partitioned."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body, status_code=200, headers={"Content-Type": "text/html"}
    )
    with patch.object(requests, "get", return_value=fake_response):
        elements = partition_html(url="https://fake.url")

    assert len(elements) > 0
212

213

214
def test_partition_html_from_url_raises_with_bad_status_code():
    """A 500 response from the patched `requests.get` makes `partition_html` raise ValueError."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body, status_code=500, headers={"Content-Type": "text/html"}
    )
    with patch.object(requests, "get", return_value=fake_response):
        with pytest.raises(ValueError):
            partition_html(url="https://fake.url")
226

227

228
def test_partition_html_from_url_raises_with_bad_content_type():
    """An "application/json" response makes `partition_html` raise ValueError."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body, status_code=200, headers={"Content-Type": "application/json"}
    )
    with patch.object(requests, "get", return_value=fake_response):
        with pytest.raises(ValueError):
            partition_html(url="https://fake.url")
240

241

242
def test_partition_from_url_uses_headers(mocker):
    """`partition_html(url=..., headers=...)` forwards the headers to `requests.get`."""
    url = "https://example.com"
    request_headers = {"User-Agent": "test"}

    fake_response = Response()
    fake_response.status_code = 200
    fake_response._content = (
        b"<html><head></head><body><p>What do i know? Who needs to know it?</p></body></html>"
    )
    fake_response.headers = {"Content-Type": "text/html"}

    patched_get = mocker.patch("requests.get", return_value=fake_response)

    partition_html(url=url, headers=request_headers)

    # Exactly one request, carrying the caller's headers and verify=True.
    patched_get.assert_called_once_with(url, headers=request_headers, verify=True)
259

260

261
def test_partition_html_raises_with_none_specified():
    """Calling `partition_html` with no arguments raises ValueError."""
    with pytest.raises(ValueError):
        partition_html()
264

265

266
def test_partition_html_raises_with_too_many_specified():
    """Supplying both `filename` and `text` raises ValueError."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        html_source = html_file.read()

    with pytest.raises(ValueError):
        partition_html(filename=html_path, text=html_source)
273

274

275
def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
    """The ideas page partitions to a single `Table` with the expected text and metadata."""
    expected_text = (
        "January 2023 ( Someone fed my essays into GPT to make something "
        "that could answer\nquestions based on them, then asked it where good "
        "ideas come from.  The\nanswer was ok, but not what I would have said. "
        "This is what I would have said.) The way to get new ideas is to notice "
        "anomalies: what seems strange,\nor missing, or broken? You can see anomalies"
        " in everyday life (much\nof standup comedy is based on this), but the best "
        "place to look for\nthem is at the frontiers of knowledge. Knowledge grows "
        "fractally.\nFrom a distance its edges look smooth, but when you learn "
        "enough\nto get close to one, you'll notice it's full of gaps. These "
        "gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx "
        "or wondered about y. In the best case, exploring such gaps yields\nwhole "
        "new fractal buds."
    )

    elements = partition_html(filename=filename)
    assert len(elements) == 1
    assert elements[0] == Table(text=expected_text)

    table_metadata = elements[0].metadata
    assert table_metadata.emphasized_text_contents is None
    assert table_metadata.link_urls is None
    assert table_metadata.text_as_html is not None
296

297

298
def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch):
    """A file whose mode is set to 0o444 can still be partitioned."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    # Create the target file inside pytest's temporary directory.
    read_only_file = tmp_path / "example-10k-readonly.html"
    read_only_file.touch()

    # Fill it with the example document's content.
    with open(source_path) as source:
        read_only_file.write_text(source.read())

    # Drop write permission before handing it to the partitioner.
    read_only_file.chmod(0o444)

    elements = partition_html(filename=read_only_file.resolve())
    assert len(elements) > 0
321

322

323
def test_partition_html_processes_chinese_chracters():
    """Chinese text inside a paragraph is preserved as the element text."""
    elements = partition_html(text="<html><div><p>每日新闻</p></div></html>")
    first_element = elements[0]
    assert first_element.text == "每日新闻"
327

328

329
def test_emoji_appears_with_emoji_utf8_code():
    """The numeric character reference `&#128512;` comes out as the emoji character."""
    markup = """\n<html charset="utf-8"><p>Hello &#128512;</p></html>"""
    elements = partition_html(text=markup)
    expected = Title("Hello 😀")
    assert elements[0] == expected
333

334

335
def test_partition_html_can_turn_off_assemble_articles():
    """With `html_assemble_articles=False`, the heading after the articles is the last element."""
    markup = """<html>
    <article>
        <h1>Some important stuff is going on!</h1>
        <p>Here is a description of that stuff</p>
    </article>
    <article>
        <h1>Some other important stuff is going on!</h1>
        <p>Here is a description of that stuff</p>
    </article>
    <h4>This is outside of the article.</h4>
</html>
"""
    elements = partition_html(text=markup, html_assemble_articles=False)
    last_element = elements[-1]
    assert last_element == Title("This is outside of the article.")
350

351

352
def test_partition_html_with_pre_tag():
    """The `<pre>` fixture partitions to NarrativeText carrying HTML file metadata."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-html-pre.htm")
    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    assert all(elem.category != "PageBreak" for elem in elements)

    first_element = elements[0]
    normalized_text = clean_extra_whitespace(first_element.text)
    assert normalized_text.startswith("[107th Congress Public Law 56]")
    assert isinstance(first_element, NarrativeText)
    assert first_element.metadata.filetype == "text/html"
    assert first_element.metadata.filename == "fake-html-pre.htm"
362

363

364
def test_partition_html_from_filename_exclude_metadata():
    """`include_metadata=False` leaves filename and file_directory unset on the first element."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    elements = partition_html(filename=html_path, include_metadata=False)

    assert len(elements) > 0
    assert all(elem.category != "PageBreak" for elem in elements)

    first_metadata = elements[0].metadata
    assert first_metadata.filename is None
    assert first_metadata.file_directory is None
372

373

374
def test_partition_html_metadata_date(mocker, filename="example-docs/fake-html.html"):
    """`last_modified` takes the value returned by the patched `get_last_modified_date`."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date", return_value=patched_date
    )

    elements = partition_html(filename=filename)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == patched_date
385

386

387
def test_partition_html_from_file_metadata_date(mocker, filename="example-docs/fake-html.html"):
    """For a file object, `last_modified` comes from the patched `get_last_modified_date_from_file`."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename) as html_file:
        elements = partition_html(file=html_file)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == patched_date
403

404

405
def test_partition_html_custom_metadata_date(mocker, filename="example-docs/fake-html.html"):
    """An explicit `metadata_last_modified` wins over the patched `get_last_modified_date`."""
    patched_date = "2029-07-05T09:24:28"
    explicit_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date", return_value=patched_date
    )

    elements = partition_html(filename=filename, metadata_last_modified=explicit_date)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == explicit_date
424

425

426
def test_partition_html_from_file_custom_metadata_date(
    mocker, filename="example-docs/fake-html.html"
):
    """For a file object, an explicit `metadata_last_modified` wins over the patched helper."""
    patched_date = "2029-07-05T09:24:28"
    explicit_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename) as html_file:
        elements = partition_html(file=html_file, metadata_last_modified=explicit_date)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == explicit_date
446

447

448
def test_partition_html_from_text_metadata_date(filename="example-docs/fake-html.html"):
    """HTML passed as text has no `last_modified` value (`filename` is not used here)."""
    markup = "<html><div><p>TEST</p></div></html>"
    elements = partition_html(text=markup)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified is None
453

454

455
def test_partition_html_from_text_custom_metadata_date(filename="example-docs/fake-html.html"):
    """An explicit `metadata_last_modified` is recorded for HTML passed as text."""
    explicit_date = "2020-07-05T09:24:28"
    markup = "<html><div><p>TEST</p></div></html>"

    elements = partition_html(text=markup, metadata_last_modified=explicit_date)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == explicit_date
467

468

469
def test_partition_html_grabs_links():
    """Anchor hrefs and anchor texts are captured in each element's link metadata."""
    markup = """<html>
        <p>Hello there I am a <a href="/link">very important link!</a></p>
        <p>Here is a list of my favorite things</p>
        <ul>
            <li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>
            <li>Dogs</li>
        </ul>
        <a href="/loner">A lone link!</a>
    </html>"""
    elements = partition_html(text=markup)
    intro, lead_in, parrots, dogs, loner = (elements[i] for i in range(5))

    # Paragraph containing one link.
    assert intro == NarrativeText("Hello there I am a very important link!")
    assert intro.metadata.link_urls == ["/link"]
    assert intro.metadata.link_texts == ["very important link!"]

    # Paragraph without links.
    assert lead_in == NarrativeText("Here is a list of my favorite things")
    assert lead_in.metadata.link_urls is None
    assert lead_in.metadata.link_texts is None

    # List item wrapping a link.
    assert parrots == ListItem("Parrots")
    assert parrots.metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"]
    assert parrots.metadata.link_texts == ["Parrots"]

    # Plain list item.
    assert dogs == ListItem("Dogs")
    assert dogs.metadata.link_urls is None
    assert dogs.metadata.link_texts is None

    # Anchor that is not inside a paragraph or list.
    assert loner == Title("A lone link!")
    assert loner.metadata.link_urls == ["/loner"]
    assert loner.metadata.link_texts == ["A lone link!"]
500

501

502
def test_partition_html_from_filename_with_skip_headers_and_footers(
    filename="example-docs/fake-html-with-footer-and-header.html",
):
    """With `skip_headers_and_footers=True`, no element has a header/footer ancestor tag."""
    elements = partition_html(filename=filename, skip_headers_and_footers=True)

    for element in elements:
        for skipped_tag in ("footer", "header"):
            assert skipped_tag not in element.ancestortags
510

511

512
def test_partition_html_from_file_with_skip_headers_and_footers(
    filename="example-docs/fake-html-with-footer-and-header.html",
):
    """File-object variant: no element has a header/footer ancestor tag when skipping is on."""
    with open(filename) as html_file:
        elements = partition_html(file=html_file, skip_headers_and_footers=True)

    for element in elements:
        for skipped_tag in ("footer", "header"):
            assert skipped_tag not in element.ancestortags
521

522

523
def test_partition_html_from_text_with_skip_headers_and_footers():
    """Text variant: no element has a header/footer ancestor tag when skipping is on."""
    markup = """
    <!DOCTYPE html>
    <html>
        <header>
            <p>Header</p>
        </header>
        <body>
            <h1>My First Heading</h1>
            <p>My first paragraph.</p>
        </body>
        <footer>
            <p>Footer</p>
        </footer>
    </html>"""
    elements = partition_html(text=markup, skip_headers_and_footers=True)

    for element in elements:
        for skipped_tag in ("footer", "header"):
            assert skipped_tag not in element.ancestortags
543

544

545
def test_partition_html_from_url_with_skip_headers_and_footers(mocker):
    """URL variant: no element has a header/footer ancestor tag when skipping is on."""
    url = "https://example.com"
    request_headers = {"User-Agent": "test"}

    fake_response = Response()
    fake_response.status_code = 200
    fake_response._content = b"""<html>
        <header>
            <p>Header</p>
        </header>
        <body>
            <h1>My First Heading</h1>
            <p>My first paragraph.</p>
        </body>
        <footer>
            <p>Footer</p>
        </footer>
    </html>"""
    fake_response.headers = {"Content-Type": "text/html"}

    mocker.patch("requests.get", return_value=fake_response)

    elements = partition_html(url=url, headers=request_headers, skip_headers_and_footers=True)

    for element in elements:
        for skipped_tag in ("footer", "header"):
            assert skipped_tag not in element.ancestortags
572

573

574
def test_partition_html_grabs_emphasized_texts():
    """Emphasized text and its tags are captured in each element's metadata."""
    markup = """<html>
        <p>Hello there I am a very <strong>important</strong> text!</p>
        <p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>
        <ul>
            <li><em>Parrots</em></li>
            <li>Dogs</li>
        </ul>
        <span>A lone span text!</span>
    </html>"""
    elements = partition_html(text=markup)
    intro, lead_in, parrots, dogs, loner = (elements[i] for i in range(5))

    # Single <strong> inside a paragraph.
    assert intro == NarrativeText("Hello there I am a very important text!")
    assert intro.metadata.emphasized_text_contents == ["important"]
    assert intro.metadata.emphasized_text_tags == ["strong"]

    # <span>, <b> and an <i> nested inside the <b>.
    assert lead_in == NarrativeText("Here is a list of my favorite things")
    assert lead_in.metadata.emphasized_text_contents == ["list", "my favorite things", "favorite"]
    assert lead_in.metadata.emphasized_text_tags == ["span", "b", "i"]

    # <em> inside a list item.
    assert parrots == ListItem("Parrots")
    assert parrots.metadata.emphasized_text_contents == ["Parrots"]
    assert parrots.metadata.emphasized_text_tags == ["em"]

    # List item without emphasis.
    assert dogs == ListItem("Dogs")
    assert dogs.metadata.emphasized_text_contents is None
    assert dogs.metadata.emphasized_text_tags is None

    # <span> that is not inside a paragraph or list.
    assert loner == Title("A lone span text!")
    assert loner.metadata.emphasized_text_contents == ["A lone span text!"]
    assert loner.metadata.emphasized_text_tags == ["span"]
609

610

611
def test_partition_html_with_json():
    """Elements from the 10-K example are passed to `assert_round_trips_through_JSON`."""
    html_path = example_doc_path("example-10k.html")
    assert_round_trips_through_JSON(partition_html(html_path))
614

615

616
def test_pre_tag_parsing_respects_order():
    """`<pre>` and `<div>` blocks come out in document order."""
    markup = """
    <pre>The Big Brown Bear</pre>
    <div>The big brown bear is growling.</div>
    <pre>The big brown bear is sleeping.</pre>
    <div>The Big Blue Bear</div>
    """
    expected = [
        Title("The Big Brown Bear"),
        NarrativeText("The big brown bear is growling."),
        NarrativeText("The big brown bear is sleeping."),
        Title("The Big Blue Bear"),
    ]
    elements = partition_html(text=markup)
    assert elements == expected
630

631

632
def test_add_chunking_strategy_on_partition_html(filename="example-docs/example-10k.html"):
    """`chunking_strategy="by_title"` gives the same result as `chunk_by_title` on plain output."""
    plain_elements = partition_html(filename=filename)
    chunked_by_partition = partition_html(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(plain_elements)

    assert chunked_by_partition != plain_elements
    assert chunked_by_partition == chunked_manually
640

641

642
def test_html_heading_title_detection():
    """Pins the element type produced for a paragraph and for headings with various content."""
    markup = """
    <p>This is a section of narrative text, it's long, flows and has meaning</p>
    <h1>This is a section of narrative text, it's long, flows and has meaning</h1>
    <h2>A heading that is at the second level</h2>
    <h3>Finally, the third heading</h3>
    <h2>December 1-17, 2017</h2>
    <h3>email@example.com</h3>
    <h3><li>- bulleted item</li></h3>
    """
    expected = [
        NarrativeText("This is a section of narrative text, it's long, flows and has meaning"),
        Title("This is a section of narrative text, it's long, flows and has meaning"),
        Title("A heading that is at the second level"),
        Title("Finally, the third heading"),
        Title("December 1-17, 2017"),
        EmailAddress("email@example.com"),
        ListItem("- bulleted item"),
    ]
    elements = partition_html(text=markup)
    assert elements == expected
662

663

664
def test_partition_html_element_metadata_has_languages():
    """The first element of the 10-K example reports `["eng"]` as its languages."""
    elements = partition_html(filename="example-docs/example-10k.html")
    first_metadata = elements[0].metadata
    assert first_metadata.languages == ["eng"]
668

669

670
def test_partition_html_respects_detect_language_per_element():
    """`detect_language_per_element=True` yields the expected per-element language lists."""
    elements = partition_html(
        filename="example-docs/language-docs/eng_spa_mult.html",
        detect_language_per_element=True,
    )
    expected_languages = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
    detected_languages = [element.metadata.languages for element in elements]
    assert detected_languages == expected_languages
675

676

677
@pytest.mark.parametrize(
    ("tag", "expected"),
    [
        ("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
        ("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
    ],
)
def test_partition_html_with_table_without_tbody(tag: str, expected: str):
    """A table whose only section is `<thead>` or `<tfoot>` yields the expected `text_as_html`."""
    markup_lines = [
        "<table>",
        f"  <{tag}>",
        "    <tr><th>Header 1</th><th>Header 2</th></tr>",
        f"  </{tag}>",
        "</table>",
    ]
    table_html = "\n".join(markup_lines)

    partitions = partition_html(text=table_html)
    first_metadata = partitions[0].metadata
    assert first_metadata.text_as_html == expected
694

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.