unstructured
1291 строка · 46.1 Кб
1import json
2import os
3import pathlib
4import tempfile
5import warnings
6from importlib import import_module
7from unittest.mock import Mock, patch
8
9import docx
10import pytest
11from PIL import Image
12
13from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
14from test_unstructured.partition.test_constants import (
15EXPECTED_TABLE,
16EXPECTED_TABLE_XLSX,
17EXPECTED_TEXT,
18EXPECTED_TEXT_XLSX,
19EXPECTED_TITLE,
20)
21from unstructured.chunking.title import chunk_by_title
22from unstructured.cleaners.core import clean_extra_whitespace
23from unstructured.documents.elements import (
24Address,
25ElementMetadata,
26ListItem,
27NarrativeText,
28Table,
29TableChunk,
30Text,
31Title,
32)
33from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
34from unstructured.partition import auto
35from unstructured.partition.auto import _get_partition_with_extras, partition
36from unstructured.partition.common import convert_office_doc
37from unstructured.partition.utils.constants import PartitionStrategy
38from unstructured.staging.base import elements_to_json
39
# Resolved path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# The sample documents are reached by going two levels up, then into "example-docs".
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs")
42
# Elements the email tests in this module expect from partitioning EML_TEST_FILE.
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
49
# Location of the sample email, relative to EXAMPLE_DOCS_DIRECTORY.
EML_TEST_FILE = "eml/fake-email.eml"

# True when the "/.dockerenv" marker file exists; used below to skip some tests.
is_in_docker = os.path.exists("/.dockerenv")
53
54
def test_auto_partition_email_from_filename():
    """Partition the sample .eml by path and check the elements and file metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    expected_directory, expected_name = os.path.split(eml_path)

    elements = partition(filename=eml_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    assert elements[0].metadata.filename == expected_name
    assert elements[0].metadata.file_directory == expected_directory
62
63
def test_auto_partition_email_from_file():
    """Partition the sample .eml from a file handle opened in text mode."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    with open(eml_path) as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
70
71
def test_auto_partition_email_from_file_rb():
    """Partition the sample .eml from a file handle opened in binary mode."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    with open(eml_path, "rb") as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
78
79
@pytest.fixture()
def mock_docx_document():
    """Return a new docx document populated with a fixed series of styled paragraphs."""
    styled_paragraphs = [
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
    ]

    doc = docx.Document()
    for paragraph_text, paragraph_style in styled_paragraphs:
        doc.add_paragraph(paragraph_text, style=paragraph_style)
    # NOTE(robinson) - this should just be regular text
    doc.add_paragraph("2023")

    return doc
99
100
@pytest.fixture()
def expected_docx_elements():
    """Return the elements the docx/doc tests expect from the mock document."""
    return [
        Title(text="These are a few of my favorite things:"),
        ListItem(text="Parrots"),
        ListItem(text="Hockey"),
        Title(text="Analysis"),
        NarrativeText(text="This is my first thought. This is my second thought."),
        NarrativeText(text="This is my third thought."),
        Text(text="2023"),
    ]
112
113
def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    """Save the mock document as .docx and partition it by path."""
    docx_path = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_docx_document.save(docx_path)

    elements = partition(filename=docx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == os.path.basename(docx_path)
121
122
def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Save the mock document as .docx and partition it from a binary file handle."""
    docx_path = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_docx_document.save(docx_path)

    with open(docx_path, "rb") as docx_file:
        elements = partition(file=docx_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
130
131
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/msword"),
        (True, "application/msword"),
        (True, None),
    ],
)
def test_auto_partition_doc_with_filename(
    mock_docx_document,
    expected_docx_elements,
    tmpdir,
    pass_metadata_filename,
    content_type,
):
    """Convert the mock .docx to .doc and partition the .doc by path.

    Parametrized over whether `metadata_filename` is supplied and whether an explicit
    content type is passed.
    """
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")
    mock_docx_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    if pass_metadata_filename:
        metadata_filename = doc_path
    else:
        metadata_filename = None

    elements = partition(
        filename=doc_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements == expected_docx_elements
    first_metadata = elements[0].metadata
    assert first_metadata.filename == "mock_document.doc"
    assert first_metadata.file_directory == work_dir
157
158
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
# determine that the file is a .doc document
@pytest.mark.xfail()
def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Convert the mock .docx to .doc and partition the .doc from a binary file handle."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")
    mock_docx_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
171
172
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "text/html"),
        (True, "text/html"),
        (True, None),
    ],
)
def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
    """Partition the example 10-K HTML file by path and check its file metadata."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
    expected_directory, expected_name = os.path.split(html_path)

    elements = partition(
        filename=html_path,
        metadata_filename=html_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert len(elements) > 0
    assert elements[0].metadata.filename == expected_name
    assert elements[0].metadata.file_directory == expected_directory
189
190
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "text/html"),
        (True, "text/html"),
        (True, None),
    ],
)
def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
    """Partition the fake HTML file from a text-mode handle and expect some elements."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    if pass_metadata_filename:
        metadata_filename = html_path
    else:
        metadata_filename = None

    with open(html_path) as html_file:
        elements = partition(
            file=html_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    assert len(elements) > 0
206
207
def test_auto_partition_html_from_file_rb():
    """Partition the fake HTML file from a binary-mode handle and expect some elements."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    with open(html_path, "rb") as html_file:
        elements = partition(file=html_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
213
214
def test_auto_partition_json_from_filename():
    """Test auto-processing an unstructured json output file by filename."""
    path_parts = (
        EXAMPLE_DOCS_DIRECTORY,
        "..",
        "test_unstructured_ingest",
        "expected-structured-output",
        "azure",
        "spring-weather.html.json",
    )
    json_path = os.path.join(*path_parts)

    with open(json_path) as source_file:
        expected_records = json.load(source_file)

    partitioned = partition(filename=json_path, strategy=PartitionStrategy.HI_RES)
    actual_records = json.loads(elements_to_json(partitioned))

    # Metadata is removed from both sides before the comparison.
    for record in actual_records:
        del record["metadata"]
    for record in expected_records:
        del record["metadata"]

    assert expected_records == actual_records
235
236
def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
    """Expect ValueError when partitioning a JSON file that holds a single object."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    payload = '{"hi": "there"}'
    json_path = os.path.join(tmpdir, "unprocessable.json")

    with open(json_path, "w") as json_file:
        json_file.write(payload)

    with pytest.raises(ValueError):
        partition(filename=json_path)
248
249
@pytest.mark.xfail(
    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
)
def test_auto_partition_json_from_file():
    """Test auto-processing an unstructured json output file by file handle."""
    path_parts = (
        EXAMPLE_DOCS_DIRECTORY,
        "..",
        "test_unstructured_ingest",
        "expected-structured-output",
        "azure-blob-storage",
        "spring-weather.html.json",
    )
    json_path = os.path.join(*path_parts)

    with open(json_path) as source_file:
        expected_records = json.load(source_file)

    with open(json_path, encoding="utf-8") as partition_file:
        partitioned = partition(file=partition_file, strategy=PartitionStrategy.HI_RES)
        actual_records = json.loads(elements_to_json(partitioned))

    for record in actual_records:
        # coordinates are always in the element data structures, even if None
        for key in ("coordinates", "coordinate_system"):
            record.pop(key)

    assert expected_records == actual_records
274
275
# Elements the plain-text tests in this module expect from "fake-text.txt".
EXPECTED_TEXT_OUTPUT = [
    NarrativeText("This is a test document to use for unit tests."),
    Address("Doylestown, PA 18901"),
    Title("Important points:"),
    ListItem("Hamburgers are delicious"),
    ListItem("Dogs are the best"),
    ListItem("I love fuzzy blankets"),
]
284
285
def test_auto_partition_text_from_filename():
    """Partition the fake text file by path and check the elements and file metadata."""
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    expected_directory, expected_name = os.path.split(text_path)

    elements = partition(filename=text_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
    assert elements[0].metadata.filename == expected_name
    assert elements[0].metadata.file_directory == expected_directory
293
294
def test_auto_partition_text_from_file():
    """Partition the fake text file from a text-mode handle and check the elements."""
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(text_path) as text_file:
        elements = partition(file=text_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
301
302
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/pdf"),
        (True, "application/pdf"),
        (True, None),
    ],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
    """Partition the layout-parser PDF by path; check the title element and file metadata.

    The final assertions on the author line run only after an xfail marker has been applied.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    expected_directory, expected_name = os.path.split(pdf_path)

    elements = partition(
        filename=pdf_path,
        metadata_filename=pdf_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    title_idx = 3
    title_element = elements[title_idx]
    assert isinstance(title_element, Title)
    assert title_element.text.startswith("LayoutParser")

    assert title_element.metadata.filename == expected_name
    assert title_element.metadata.file_directory == expected_directory

    # NOTE(alan): Xfail since new model skips the word Zejiang
    request.applymarker(pytest.mark.xfail)

    author_element = elements[title_idx + 1]
    assert isinstance(author_element, NarrativeText)
    assert author_element.text.startswith("Zejiang Shen")
331
332
def test_auto_partition_pdf_uses_table_extraction():
    """Check that `pdf_infer_table_structure=True` reaches the patched OCR call."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as mock_ocr:
        partition(pdf_path, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
        _, call_kwargs = mock_ocr.call_args
        assert call_kwargs["infer_table_structure"]
340
341
def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
    """Check the keyword arguments forwarded to `partition_pdf` for the FAST strategy."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    stub_elements = [NarrativeText("Hello there!")]

    with patch.object(auto, "partition_pdf", return_value=stub_elements) as mock_partition:
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": mock_partition})
        partition(filename=pdf_path, strategy=PartitionStrategy.FAST)

    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "include_page_breaks": False,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
    }
    mock_partition.assert_called_once_with(**expected_kwargs)
366
367
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/pdf"),
        (True, "application/pdf"),
        (True, None),
    ],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
    """Partition the layout-parser PDF from a binary handle and check the title element.

    The final assertions on the author line run only after an xfail marker has been applied.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    if pass_metadata_filename:
        metadata_filename = pdf_path
    else:
        metadata_filename = None

    with open(pdf_path, "rb") as pdf_file:
        elements = partition(
            file=pdf_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    title_idx = 3
    title_element = elements[title_idx]
    assert isinstance(title_element, Title)
    assert title_element.text.startswith("LayoutParser")

    # NOTE(alan): Xfail since new model misses the first word Zejiang
    request.applymarker(pytest.mark.xfail)

    author_element = elements[title_idx + 1]
    assert isinstance(author_element, NarrativeText)
    assert author_element.text.startswith("Zejiang Shen")
394
395
def test_auto_partition_formats_languages_for_tesseract():
    """Check the `ocr_languages` string passed to the patched OCR call for `["zh"]`."""
    image_path = "example-docs/chi_sim_image.jpeg"
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as mock_ocr:
        partition(image_path, strategy=PartitionStrategy.HI_RES, languages=["zh"])
        first_call = mock_ocr.call_args_list[0]
        _, call_kwargs = first_call
        assert "ocr_languages" in call_kwargs
        assert call_kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
405
406
def test_auto_partition_element_metadata_user_provided_languages():
    """Check that user-supplied languages show up in the first element's metadata."""
    pdf_path = "example-docs/chevron-page.pdf"
    requested_languages = ["eng"]

    elements = partition(
        filename=pdf_path,
        strategy=PartitionStrategy.OCR_ONLY,
        languages=requested_languages,
    )

    assert elements[0].metadata.languages == ["eng"]
411
412
@pytest.mark.parametrize(
    ("languages", "ocr_languages"),
    [
        (["auto"], ""),
        (["eng"], ""),
    ],
)
def test_auto_partition_ignores_empty_string_for_ocr_languages(languages, ocr_languages):
    """Partition a text file with an empty `ocr_languages` and expect `["eng"]` metadata."""
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
    partition_kwargs = {
        "filename": text_path,
        "strategy": PartitionStrategy.OCR_ONLY,
        "ocr_languages": ocr_languages,
        "languages": languages,
    }

    elements = partition(**partition_kwargs)

    assert elements[0].metadata.languages == ["eng"]
426
427
def test_auto_partition_warns_with_ocr_languages(caplog):
    """Check that passing `ocr_languages` produces the deprecation message in the log."""
    pdf_path = "example-docs/chevron-page.pdf"

    partition(filename=pdf_path, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")

    expected_message = "The ocr_languages kwarg will be deprecated"
    assert expected_message in caplog.text
432
433
def test_partition_pdf_doesnt_raise_warning():
    """Partition the layout-parser PDF with warnings escalated to errors."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(filename=pdf_path, strategy=PartitionStrategy.HI_RES)
443
444
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "image/jpeg"),
        (True, "image/jpeg"),
        (True, None),
    ],
)
def test_auto_partition_image(pass_metadata_filename, content_type):
    """Partition the layout-parser JPG with the AUTO strategy and check the title element."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")

    elements = partition(
        filename=image_path,
        metadata_filename=image_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    title_element = elements[2]
    assert title_element.text == expected_title
    assert title_element.metadata.coordinates is not None
464
465
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(
    extract_image_block_to_payload,
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.jpg"),
):
    """Partition an image while extracting Image and Table blocks, then verify the result."""
    block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            filename=filename,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # Verified inside the context manager, while the temporary directory still exists.
        assert_element_extraction(
            elements,
            block_types,
            extract_image_block_to_payload,
            output_dir,
        )
484
485
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "image/jpeg"),
        (True, "image/jpeg"),
        (True, None),
    ],
)
def test_auto_partition_jpg(pass_metadata_filename, content_type):
    """Partition the layout-parser JPG by path and expect at least one element."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    if pass_metadata_filename:
        metadata_filename = image_path
    else:
        metadata_filename = None

    elements = partition(
        filename=image_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    assert len(elements) > 0
500
501
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "image/jpeg"),
        (True, "image/jpeg"),
        (True, None),
    ],
)
def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
    """Partition the layout-parser JPG from a binary handle and expect at least one element."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    if pass_metadata_filename:
        metadata_filename = image_path
    else:
        metadata_filename = None

    with open(image_path, "rb") as image_file:
        elements = partition(
            file=image_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.AUTO,
        )

    assert len(elements) > 0
517
518
def test_auto_partition_raises_with_bad_type(monkeypatch):
    """Expect ValueError when file-type detection is stubbed to return None."""

    def _detect_nothing(*args, **kwargs):
        return None

    monkeypatch.setattr(auto, "detect_filetype", _detect_nothing)

    with pytest.raises(ValueError):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
523
524
# Elements the PowerPoint tests in this module expect from the fake slide deck.
EXPECTED_PPTX_OUTPUT = [
    Title("Adding a Bullet Slide"),
    ListItem("Find the bullet slide layout"),
    ListItem("Use _TextFrame.text for first bullet"),
    ListItem("Use _TextFrame.add_paragraph() for subsequent bullets"),
    NarrativeText("Here is a lot of text!"),
    NarrativeText("Here is some text in a text box!"),
]
533
534
def test_auto_partition_pptx_from_filename():
    """Partition the fake .pptx by path and check the elements and file metadata."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    expected_directory, expected_name = os.path.split(pptx_path)

    elements = partition(filename=pptx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == EXPECTED_PPTX_OUTPUT
    assert elements[0].metadata.filename == expected_name
    assert elements[0].metadata.file_directory == expected_directory
541
542
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_ppt_from_filename():
    """Partition the fake .ppt by path and check the elements and file metadata."""
    ppt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
    expected_directory, expected_name = os.path.split(ppt_path)

    elements = partition(filename=ppt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == EXPECTED_PPTX_OUTPUT
    assert elements[0].metadata.filename == expected_name
    assert elements[0].metadata.file_directory == expected_directory
550
551
def test_auto_with_page_breaks():
    """Check that `include_page_breaks=True` yields at least one "PageBreak" element."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

    elements = partition(
        filename=pdf_path,
        include_page_breaks=True,
        strategy=PartitionStrategy.HI_RES,
    )

    categories = [element.category for element in elements]
    assert "PageBreak" in categories
558
559
def test_auto_partition_epub_from_filename():
    """Partition the winter-sports EPUB by path and check how the first element starts."""
    epub_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")

    elements = partition(filename=epub_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    expected_prefix = "The Project Gutenberg eBook of Winter Sports"
    assert elements[0].text.startswith(expected_prefix)
565
566
def test_auto_partition_epub_from_file():
    """Partition the winter-sports EPUB from a binary handle and check the first element."""
    epub_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
    with open(epub_path, "rb") as epub_file:
        elements = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    expected_prefix = "The Project Gutenberg eBook of Winter Sports"
    assert elements[0].text.startswith(expected_prefix)
573
574
# Elements the .msg test in this module expects from "fake-email.msg".
EXPECTED_MSG_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
581
582
def test_auto_partition_msg_from_filename():
    """Partition the fake .msg email by path and compare against the expected elements."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")

    elements = partition(filename=msg_path, strategy=PartitionStrategy.HI_RES)

    assert elements == EXPECTED_MSG_OUTPUT
587
588
def test_auto_partition_rtf_from_filename():
    """Partition the fake RTF document by path and check its first element."""
    rtf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")

    elements = partition(filename=rtf_path, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("My First Heading")
593
594
def test_auto_partition_from_url():
    """Partition the project's LICENSE.md fetched by URL as plain text."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(
        url=license_url,
        content_type="text/plain",
        strategy=PartitionStrategy.HI_RES,
    )

    first_element = elements[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == license_url
600
601
def test_partition_md_works_with_embedded_html():
    """Partition the project's README.md by URL and expect "unstructured" in some element.

    The previous loop tested `elements[0].text` on every iteration, so only the first
    element was ever inspected; every element's text is now searched.
    """
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
    elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)

    assert any("unstructured" in element.text for element in elements)
612
613
def test_auto_partition_warns_if_header_set_and_not_url(caplog):
    """Check that the first log record is a WARNING when headers are passed with a filename."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    request_headers = {"Accept": "application/pdf"}

    partition(filename=eml_path, headers=request_headers, strategy=PartitionStrategy.HI_RES)

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
620
621
def test_auto_partition_works_with_unstructured_jsons():
    """Partition the spring-weather JSON by path and check the first element's text."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")

    elements = partition(filename=json_path, strategy=PartitionStrategy.HI_RES)

    first_text = elements[0].text
    assert first_text == "News Around NOAA"
626
627
def test_auto_partition_works_with_unstructured_jsons_from_file():
    """Partition the spring-weather JSON from a binary handle and check the first text."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    with open(json_path, "rb") as json_file:
        elements = partition(file=json_file, strategy=PartitionStrategy.HI_RES)

    first_text = elements[0].text
    assert first_text == "News Around NOAA"
633
634
def test_auto_partition_odt_from_filename():
    """Partition the fake ODT document by path and check its first element."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")

    elements = partition(filename=odt_path, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("Lorem ipsum dolor sit amet.")
639
640
def test_auto_partition_odt_from_file():
    """Partition the fake ODT document from a binary handle and check its first element."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as odt_file:
        elements = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("Lorem ipsum dolor sit amet.")
647
648
@pytest.mark.parametrize(
    ("content_type", "routing_func", "expected"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch):
    """Check the `filetype` metadata set on elements returned by a stubbed partitioner."""

    def _fake_partition(*args, **kwargs):
        return [Text("text 1"), Text("text 2")]

    patch_target = f"unstructured.partition.auto.partition_{routing_func}"
    with patch(patch_target, _fake_partition) as mock_partition:
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {routing_func: mock_partition})
        elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)

    assert len(elements) == 2
    assert all(element.metadata.filetype == expected for element in elements)
667
668
@pytest.mark.parametrize(
    ("content_type", "expected"),
    [
        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
    ],
)
def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch):
    """Check that the final `filetype` metadata replaces the one set by the stubbed partitioner."""
    pdf_metadata = ElementMetadata(filetype="imapdf")

    def _fake_partition_pdf(*args, **kwargs):
        # Both elements carry the same metadata object.
        return [
            Text("text 1", metadata=pdf_metadata),
            Text("text 2", metadata=pdf_metadata),
        ]

    with patch("unstructured.partition.auto.partition_pdf", _fake_partition_pdf) as mock_partition:
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": mock_partition})
        elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)

    assert len(elements) == 2
    assert all(element.metadata.filetype == expected for element in elements)
690
691
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(
    extract_image_block_to_payload,
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.pdf"),
):
    """Partition a PDF while extracting Image and Table blocks, then verify the result."""
    block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            filename=filename,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # Verified inside the context manager, while the temporary directory still exists.
        assert_element_extraction(
            elements,
            block_types,
            extract_image_block_to_payload,
            output_dir,
        )
710
711
# Every FileType member except UNK, ZIP and XLS.
supported_filetypes = [
    filetype
    for filetype in FileType
    if filetype not in (FileType.UNK, FileType.ZIP, FileType.XLS)
]
722
723
# File types whose partition module name differs from the lowercased FileType name.
FILETYPE_TO_MODULE = {
    **dict.fromkeys((FileType.JPG, FileType.PNG, FileType.HEIC), "image"),
    FileType.TXT: "text",
    FileType.EML: "email",
}
731
732
@pytest.mark.parametrize("filetype", supported_filetypes)
def test_file_specific_produces_correct_filetype(filetype: FileType):
    """Check that a file-type-specific partitioner tags elements with the matching MIME type.

    Image file types, WAV and EMPTY are skipped. For the rest, the partition module is
    looked up via FILETYPE_TO_MODULE (falling back to the lowercased FileType name) and
    run on the first file in "example-docs" that has the matching extension.
    """
    if filetype in auto.IMAGE_FILETYPES or filetype in (FileType.WAV, FileType.EMPTY):
        pytest.skip()
    extension = filetype.name.lower()
    filetype_module = FILETYPE_TO_MODULE.get(filetype, extension)
    fun_name = "partition_" + filetype_module
    module = import_module(f"unstructured.partition.{filetype_module}")
    # Plain attribute lookup; no need to evaluate a dynamically built expression.
    fun = getattr(module, fun_name)
    for file in pathlib.Path("example-docs").iterdir():
        if file.is_file() and file.suffix == f".{extension}":
            elements = fun(str(file))
            # Elements without a filetype are ignored by the check.
            assert all(
                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
                for el in elements
                if el.metadata.filetype is not None
            )
            # Only the first matching example file is exercised.
            break
751
752
def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
    """Partition the factbook XML by path with tags dropped; check text and filename."""
    elements = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)

    first_element = elements[0]
    assert first_element.text == "United States"
    assert first_element.metadata.filename == "factbook.xml"
758
759
def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
    """Partition the factbook XML from a binary handle with tags dropped; check the text."""
    with open(filename, "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=False)

    first_text = elements[0].text
    assert first_text == "United States"
765
766
def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
    """Partition the factbook XML by path with tags kept; check for a tagged leader entry."""
    elements = partition(filename=filename, xml_keep_tags=True)

    first_element = elements[0]
    assert "<leader>Joe Biden</leader>" in first_element.text
    assert first_element.metadata.filename == "factbook.xml"
772
773
def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
    """Partition the factbook XML from a binary handle with tags kept; check a tagged entry."""
    with open(filename, "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=True)

    first_text = elements[0].text
    assert "<leader>Joe Biden</leader>" in first_text
779
780
# Filetype value the XLSX tests expect in element metadata.
EXPECTED_XLSX_FILETYPE = (
    "application/vnd.openxmlformats-officedocument"
    ".spreadsheetml.sheet"
)
782
783
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
    """Partition the stanley-cups workbook by path and check its titles and tables."""
    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])

    table_count = sum(isinstance(element, Table) for element in elements)
    title_count = sum(isinstance(element, Title) for element in elements)
    assert table_count == 2
    assert title_count == 2
    assert len(elements) == 4

    title_element, table_element = elements[0], elements[1]
    assert clean_extra_whitespace(title_element.text) == EXPECTED_TITLE
    assert clean_extra_whitespace(table_element.text) == EXPECTED_TEXT_XLSX
    assert table_element.metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert table_element.metadata.page_number == 1
    assert table_element.metadata.filetype == EXPECTED_XLSX_FILETYPE
796
797
@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html_field"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types,
    filename,
    has_text_as_html_field,
):
    """Check whether Table elements carry `text_as_html` for each `skip_infer_table_types`."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file, skip_infer_table_types=skip_infer_table_types)
        tables = [element for element in elements if isinstance(element, Table)]

    for table in tables:
        # A missing attribute and an attribute set to None both count as "no HTML".
        html_present = getattr(table.metadata, "text_as_html", None) is not None
        assert html_present == has_text_as_html_field
825
826
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    """Partition the stanley-cups workbook from a binary handle; check titles and tables."""
    with open(filename, "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    table_count = sum(isinstance(element, Table) for element in elements)
    title_count = sum(isinstance(element, Title) for element in elements)
    assert table_count == 2
    assert title_count == 2
    assert len(elements) == 4

    title_element, table_element = elements[0], elements[1]
    assert clean_extra_whitespace(title_element.text) == EXPECTED_TITLE
    assert clean_extra_whitespace(table_element.text) == EXPECTED_TEXT_XLSX
    assert table_element.metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert table_element.metadata.page_number == 1
    assert table_element.metadata.filetype == EXPECTED_XLSX_FILETYPE
840
841
# Expected length of the first element's raw text in the XLS test below.
EXPECTED_XLS_TEXT_LEN = 550


# Expected first 45 characters of the first element's text after whitespace cleaning.
EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"

# Expected `text_as_html` of the first element in the XLS test below.
# NOTE(review): the leading whitespace of each line inside this literal could not be
# recovered from the flattened source this was read from; the literal is reproduced as
# seen. Confirm the per-line indentation against the real HTML output before relying on
# an exact comparison - TODO confirm.
EXPECTED_XLS_TABLE = (
    """<table border="1" class="dataframe">
<tbody>
<tr>
<td>MC</td>
<td>What is 2+2?</td>
<td>4</td>
<td>correct</td>
<td>3</td>
<td>incorrect</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MA</td>
<td>What C datatypes are 8 bits? (assume i386)</td>
<td>int</td>
<td></td>
<td>float</td>
<td></td>
<td>double</td>
<td></td>
<td>char</td>
</tr>
<tr>
<td>TF</td>
<td>Bagpipes are awesome.</td>
<td>true</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>ESS</td>
<td>How have the original Henry Hornbostel buildings """
    """influenced campus architecture and design in the last 30 years?</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>ORD</td>
<td>Rank the following in their order of operation.</td>
<td>Parentheses</td>
<td>Exponents</td>
<td>Division</td>
<td>Addition</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>FIB</td>
<td>The student activities fee is</td>
<td>95</td>
<td>dollars for students enrolled in</td>
<td>19</td>
<td>units or more,</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MAT</td>
<td>Match the lower-case greek letter with its capital form.</td>
<td>λ</td>
<td>Λ</td>
<td>α</td>
<td>γ</td>
<td>Γ</td>
<td>φ</td>
<td>Φ</td>
</tr>
</tbody>
</table>"""
)
931
932
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
    """Auto-partition a legacy XLS workbook by path and check counts and the first element."""
    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])

    tables = [el for el in elements if isinstance(el, Table)]
    assert len(tables) == 2
    assert len(elements) == 14

    first = elements[0]
    cleaned_prefix = clean_extra_whitespace(first.text)[:45]
    assert cleaned_prefix == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
    # NOTE(crag): the raw text length depends on whether the beautifulsoup4 package is
    # installed. With it, some (but not all) additional whitespace is removed, so the text is
    # shorter than when beautifulsoup4 is *not* installed, e.g.
    #   "\n\n\nMA\nWhat C datatypes are 8 bits" vs.
    #   "\n \n \n MA\n What C datatypes are 8 bits?..."
    assert len(first.text) == EXPECTED_XLS_TEXT_LEN
    assert first.metadata.text_as_html == EXPECTED_XLS_TABLE
948
949
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """Auto-partition a CSV file by path and check the first element's text, HTML and type."""
    first = partition(filename=filename)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    metadata = first.metadata
    assert metadata.text_as_html == EXPECTED_TABLE
    assert metadata.filetype == "text/csv"
957
958
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    """Auto-partition a TSV file by path and check the first element's text, HTML and type."""
    first = partition(filename=filename)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    metadata = first.metadata
    assert metadata.text_as_html == EXPECTED_TABLE
    assert metadata.filetype == "text/tsv"
966
967
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """Auto-partition a CSV passed as an open binary file and check the first element."""
    with open(filename, "rb") as csv_file:
        elements = partition(file=csv_file)

    first = elements[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == "text/csv"
977
978
def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
    """Auto-partition an `.htm` file by path and check the first element and its metadata."""
    elements = partition(filename=filename)

    assert len(elements) > 0
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories

    first = elements[0]
    cleaned_text = clean_extra_whitespace(first.text)
    assert cleaned_text.startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)
    assert first.metadata.filetype == "text/html"
    assert first.metadata.filename == "fake-html-pre.htm"
988
989
def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
    """Partitioning an empty text file by path yields an empty list."""
    elements = partition(filename=filename)
    assert elements == []
992
993
def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
    """Partitioning an empty text file passed as a binary file object yields an empty list."""
    with open(filename, "rb") as empty_file:
        elements = partition(file=empty_file)
    assert elements == []
997
998
def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
    """Auto-partition an Org-mode file by path and check the first element and filetype."""
    first = partition(filename=filename)[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
1004
1005
def test_auto_partition_org_from_file(filename="example-docs/README.org"):
    """Auto-partition an Org-mode file object, passing its content type explicitly."""
    with open(filename, "rb") as org_file:
        elements = partition(file=org_file, content_type="text/org")

    first = elements[0]
    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
1012
1013
def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
    """Auto-partition a reStructuredText file by path and check the first element and type."""
    first = partition(filename=filename)[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"
1019
1020
def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
    """Auto-partition a reStructuredText file object, passing its content type explicitly."""
    with open(filename, "rb") as rst_file:
        elements = partition(file=rst_file, content_type="text/x-rst")

    first = elements[0]
    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"
1027
1028
def test_auto_partition_metadata_filename():
    """`metadata_filename` supplies the filename recorded in metadata for a file object."""
    source_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(source_path) as text_file:
        elements = partition(file=text_file, metadata_filename=source_path)

    expected_name = os.path.basename(source_path)
    assert elements[0].metadata.filename == expected_name
1034
1035
def test_auto_partition_warns_about_file_filename_deprecation(caplog):
    """Passing `file_filename` still sets the metadata filename but logs a warning."""
    source_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(source_path) as text_file:
        elements = partition(file=text_file, file_filename=source_path)

    expected_name = os.path.basename(source_path)
    assert elements[0].metadata.filename == expected_name

    logged = caplog.text
    assert "WARNING" in logged
    assert "The file_filename kwarg will be deprecated" in logged
1043
1044
def test_auto_partition_raises_with_file_and_metadata_filename():
    """Supplying both `file_filename` and `metadata_filename` raises `ValueError`."""
    source_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(source_path) as text_file:
        with pytest.raises(ValueError):
            partition(
                file=text_file,
                file_filename=source_path,
                metadata_filename=source_path,
            )
1049
1050
def test_get_partition_with_extras_prompts_for_install_if_missing():
    """An empty extras map makes the "pdf" lookup raise `ImportError` with an install hint."""
    empty_extras_map = {}
    expected_hint = 'Install the pdf dependencies with pip install "unstructured[pdf]"'

    with pytest.raises(ImportError) as raised:
        _get_partition_with_extras("pdf", empty_extras_map)

    assert expected_hint in str(raised.value)
1058
1059
def test_add_chunking_strategy_on_partition_auto():
    """`partition(..., chunking_strategy="by_title")` matches `chunk_by_title` on its output."""
    html_path = "example-docs/example-10k-1p.html"

    unchunked = partition(html_path)
    chunked_by_partition = partition(html_path, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(unchunked)

    assert chunked_by_partition != unchunked
    assert chunked_by_partition == chunked_manually
1067
1068
def test_add_chunking_strategy_title_on_partition_auto_respects_multipage():
    """`multipage_sections` passed to `partition` is honoured by `by_title` chunking.

    For both values of `multipage_sections`, chunking through `partition` must equal calling
    `chunk_by_title` directly on the unchunked elements, and the two settings must produce a
    different number of chunks.
    """
    html_path = "example-docs/example-10k-1p.html"
    # Size options shared by every chunking call below.
    size_options = {
        "combine_text_under_n_chars": 0,
        "new_after_n_chars": 300,
        "max_characters": 400,
    }

    via_partition_single_page = partition(
        html_path,
        chunking_strategy="by_title",
        multipage_sections=False,
        **size_options,
    )
    via_partition_multipage = partition(
        html_path,
        chunking_strategy="by_title",
        multipage_sections=True,
        **size_options,
    )

    unchunked = partition(html_path)
    via_chunker_single_page = chunk_by_title(
        unchunked,
        multipage_sections=False,
        **size_options,
    )
    via_chunker_multipage = chunk_by_title(
        unchunked,
        multipage_sections=True,
        **size_options,
    )

    assert via_partition_single_page == via_chunker_single_page
    assert via_partition_multipage == via_chunker_multipage
    assert len(via_partition_multipage) != len(via_partition_single_page)
1113
1114
def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
    """`max_characters` passed to `partition` bounds the size of chunked table elements."""
    html_path = "example-docs/example-10k-1p.html"

    def chunked_tables(max_characters):
        # Chunk by title with the given cap and keep only Table / TableChunk elements.
        chunked = partition(
            html_path,
            chunking_strategy="by_title",
            max_characters=max_characters,
            combine_text_under_n_chars=5,
        )
        return [el for el in chunked if isinstance(el, (Table, TableChunk))]

    # chunk size capped at 200 characters
    tables_at_200 = chunked_tables(200)
    # chunk size capped at 5 characters
    tables_at_5 = chunked_tables(5)

    unchunked_tables = [el for el in partition(html_path) if isinstance(el, Table)]
    unchunked_count = len(unchunked_tables)

    assert len(tables_at_5) != unchunked_count
    assert len(tables_at_200) != unchunked_count

    first_at_5, second_at_5 = tables_at_5[0], tables_at_5[1]
    # trailing whitespace is stripped from the first chunk, leaving only a checkbox character
    assert len(first_at_5.text) == 1
    # but the second chunk is the full 5 characters
    assert len(second_at_5.text) == 5
    assert len(first_at_5.metadata.text_as_html) == 5

    first_at_200, second_at_200 = tables_at_200[0], tables_at_200[1]
    # the first table element is under 200 chars so doesn't get chunked!
    assert unchunked_tables[0] == first_at_200
    assert len(first_at_200.text) < 200
    assert len(second_at_200.text) == 198
    assert len(second_at_200.metadata.text_as_html) == 200
1159
1160
def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
    """Check `is_continuation` metadata on table chunks produced by `by_title` chunking.

    Partitions the same HTML file with and without `chunking_strategy="by_title"`, keeps only
    the elements that are `Table` instances, and expects the two lists to differ.
    """
    filename = "example-docs/example-10k-1p.html"

    table_elements = [e for e in partition(filename) if isinstance(e, Table)]
    chunked_table_elements = [
        e
        for e in partition(
            filename,
            chunking_strategy="by_title",
        )
        if isinstance(e, Table)
    ]

    assert table_elements != chunked_table_elements

    i = 0
    for table in chunked_table_elements:
        # have to reset the counter to 0 here when we encounter a Table element
        # NOTE(review): every item in `chunked_table_elements` already passed the
        # `isinstance(e, Table)` filter above, so this branch runs on every iteration and `i`
        # is always 0 at the check below -- the `is_continuation` assertion is never reached
        # as written. Presumably the reset was meant for non-`TableChunk` elements only;
        # confirm the chunker's expected output before tightening this.
        if isinstance(table, Table):
            i = 0
        if i > 0 and isinstance(table, TableChunk):
            assert table.metadata.is_continuation is True
            i += 1
1184
1185
# Path prefix (ending in ".") of the language example documents; a test appends a file
# extension to it to form the full path, e.g. EXAMPLE_LANG_DOCS + "docx".
EXAMPLE_LANG_DOCS = "example-docs/language-docs/eng_spa_mult."
1187
1188
@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_partition_respects_language_arg(file_extension):
    """An explicit `languages` argument is recorded on every element, for each file type."""
    requested_languages = ["deu"]
    elements = partition(
        filename=EXAMPLE_LANG_DOCS + file_extension,
        languages=requested_languages,
    )
    for element in elements:
        assert element.metadata.languages == ["deu"]
1212
1213
def test_partition_respects_detect_language_per_element_arg():
    """With `detect_language_per_element=True` each element carries its own languages."""
    expected_languages = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition(
        filename="example-docs/language-docs/eng_spa_mult.txt",
        detect_language_per_element=True,
    )
    detected_languages = [element.metadata.languages for element in elements]

    assert detected_languages == expected_languages
1219
1220
def test_partition_default_does_not_overwrite_other_defaults():
    """The `languages` default in `partition` must not override the delegate's own default.

    Checks that the ["eng"] default in `partition` does not overwrite the ["auto"] default in
    other `partition_` functions.
    """
    # the default for `languages` is ["auto"] in partition_text
    from unstructured.partition.text import partition_text

    # Use a document that is primarily in a language other than English
    non_english_path = "example-docs/language-docs/UDHR_first_article_all.txt"

    languages_from_text = partition_text(non_english_path)[0].metadata.languages
    assert languages_from_text != ["eng"]

    languages_from_auto = partition(non_english_path)[0].metadata.languages
    assert languages_from_auto != ["eng"]
    assert languages_from_auto == languages_from_text
1235
1236
def test_partition_languages_default_to_None():
    """Elements whose `languages` metadata is `None` are expected to have empty text."""
    elements = partition(
        filename="example-docs/handbook-1p.docx",
        detect_language_per_element=True,
    )
    # PageBreak and other elements with no text will have `None` for `languages`
    without_languages = [el for el in elements if el.metadata.languages is None]
    first_without_languages = without_languages[0]
    assert first_without_languages.text == ""
1243
1244
def test_partition_languages_incorrectly_defaults_to_English(tmpdir):
    """A very short German sentence is (knowingly) labelled as English."""
    # We don't totally rely on langdetect for short text, so text like the following that is
    # in German will be labeled as English.
    short_german_text = "Ein kurzer Satz."
    text_path = os.path.join(tmpdir, "short-german.txt")
    with open(text_path, "w") as text_file:
        text_file.write(short_german_text)

    first = partition(text_path)[0]
    assert first.metadata.languages == ["eng"]
1254
1255
def test_partition_timeout_gets_routed():
    """`request_timeout` given to `partition(url=...)` reaches `file_and_type_from_url`."""

    class CallException(Exception):
        pass

    # Stand-in for the URL fetcher: raising on call stops `partition` right after routing.
    fetch_stub = Mock(side_effect=CallException("Function called!"))
    with patch("unstructured.partition.auto.file_and_type_from_url", fetch_stub):
        with pytest.raises(CallException):
            auto.partition(url="fake_url", request_timeout=326)

    routed_kwargs = fetch_stub.call_args.kwargs
    assert "request_timeout" in routed_kwargs
    assert routed_kwargs["request_timeout"] == 326
1268
1269
def test_partition_image_with_bmp_with_auto(
    tmpdir,
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """Check that a BMP image is auto-partitioned with the hi_res strategy.

    The image at ``filename`` is re-saved as a BMP, and that BMP is partitioned with
    ``PartitionStrategy.HI_RES``. Exactly one element is expected to carry ``text_as_html``,
    and that HTML must contain a table header.

    Args:
        tmpdir: pytest's per-test temporary directory fixture.
        filename: Path of the source image to convert to BMP.
    """
    # Write inside this test's own temp directory rather than its parent (`tmpdir.dirname`),
    # so the generated file stays inside the per-test sandbox.
    bmp_filename = os.path.join(tmpdir, "example.bmp")
    # Open the source image as a context manager so its file handle is released as soon as
    # the conversion is done instead of being left open.
    with Image.open(filename) as img:
        img.save(bmp_filename)

    elements = partition(
        filename=bmp_filename,
        strategy=PartitionStrategy.HI_RES,
    )
    table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
    assert len(table) == 1
    assert "<table><thead><th>" in table[0]
1285
1286
def test_auto_partition_eml_add_signature_to_metadata():
    """Partitioning a signed `.p7s` email yields one element with the signature in metadata."""
    signed_email_path = "example-docs/eml/signed-doc.p7s"
    elements = partition(filename=signed_email_path)

    assert len(elements) == 1
    only_element = elements[0]
    assert only_element.text == "This is a test"
    assert only_element.metadata.signature == "<SIGNATURE>\n"
1292