# unstructured
# 642 lines · 22.4 KB
1import datetime
2import email
3import os
4import pathlib
5
6import pytest
7
8from test_unstructured.unit_utils import (
9assert_round_trips_through_JSON,
10example_doc_path,
11parse_optional_datetime,
12)
13from unstructured.chunking.title import chunk_by_title
14from unstructured.documents.elements import (
15ElementMetadata,
16Image,
17ListItem,
18NarrativeText,
19Text,
20Title,
21)
22from unstructured.documents.email_elements import (
23MetaData,
24ReceivedInfo,
25Recipient,
26Sender,
27Subject,
28)
29from unstructured.partition.email import (
30convert_to_iso_8601,
31extract_attachment_info,
32partition_email,
33partition_email_header,
34)
35from unstructured.partition.text import partition_text
36
# Directory containing this test module, resolved to an absolute path.
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
# Location of the .eml/.txt fixtures, two levels above this module.
EXAMPLE_DOCS_DIRECTORY = os.path.join(
    FILE_DIRECTORY,
    os.pardir,
    os.pardir,
    "example-docs",
    "eml",
)
39
40
# Body elements the tests below expect from the plain "fake-email" fixtures.
EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

# Body elements expected from the fixture that carries an embedded image.
IMAGE_EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    NarrativeText(text="hello this is our logo."),
    Image(text="unstructured_logo.png"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]
56
# Header elements expected from a message that has a "Received" header: the
# received-info entries come first, followed by the regular header elements.
RECEIVED_HEADER_OUTPUT = [
    ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
    ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
    ReceivedInfo(
        name="received_datetimetz",
        text="2023-02-20 10:03:18+12:00",
        # 12 hours == 43200 seconds, i.e. the +12:00 offset shown in ``text``.
        datestamp=datetime.datetime(
            2023, 2, 20, 10, 3, 18,
            tzinfo=datetime.timezone(datetime.timedelta(hours=12)),
        ),
    ),
    MetaData(name="MIME-Version", text="1.0"),
    MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
    MetaData(
        name="Message-ID",
        text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
    ),
    Subject(text="Test Email"),
    Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    MetaData(
        name="Content-Type",
        text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
    ),
]
87
# Header elements expected when ``include_headers=True`` is requested.
HEADER_EXPECTED_OUTPUT = [
    MetaData(name="MIME-Version", text="1.0"),
    MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
    MetaData(
        name="Message-ID",
        text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
    ),
    Subject(text="Test Email"),
    Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    MetaData(
        name="Content-Type",
        text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
    ),
]

# Headers first, then the body elements.
ALL_EXPECTED_OUTPUT = HEADER_EXPECTED_OUTPUT + EXPECTED_OUTPUT

# Attachment descriptors expected from the attachment fixture.
ATTACH_EXPECTED_OUTPUT = [
    {"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"},
]
109
110
def test_partition_email_from_filename():
    """Partition fake-email.eml by path and check the elements and filename metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    partitioned = partition_email(filename=eml_path)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
    # The metadata is expected to hold the base name, not the joined path.
    assert all(el.metadata.filename == "fake-email.eml" for el in partitioned)
118
119
def test_partition_email_from_filename_with_metadata_filename():
    """``metadata_filename`` is expected to replace the filename on every element."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    partitioned = partition_email(filename=eml_path, metadata_filename="test")
    assert len(partitioned) > 0
    for el in partitioned:
        assert el.metadata.filename == "test"
125
126
def test_partition_email_from_filename_malformed_encoding():
    """The malformed-encoding fixture must still produce the standard body elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-malformed-encoding.eml")
    partitioned = partition_email(filename=eml_path)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
132
133
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("fake-email-b64.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_filename_default_encoding(filename, expected_output):
    """Partition fixtures in assorted encodings by path.

    Cases whose ``expected_output`` is ``None`` only require a non-empty result;
    the others must match the expected elements exactly.
    """
    fixture_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    partitioned = partition_email(filename=fixture_path)
    assert len(partitioned) > 0
    assert not expected_output or partitioned == expected_output
    assert all(el.metadata.filename == filename for el in partitioned)
158
159
def test_partition_email_from_file():
    """Partition fake-email.eml from a text-mode file object; no filename is recorded."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as handle:
        partitioned = partition_email(file=handle)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
    assert all(el.metadata.filename is None for el in partitioned)
168
169
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("fake-email-b64.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_file_default_encoding(filename, expected_output):
    """Partition fixtures in assorted encodings from a text-mode file object.

    Cases whose ``expected_output`` is ``None`` only require a non-empty result.
    """
    fixture_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(fixture_path) as handle:
        partitioned = partition_email(file=handle)
    assert len(partitioned) > 0
    assert not expected_output or partitioned == expected_output
    assert all(el.metadata.filename is None for el in partitioned)
195
196
def test_partition_email_from_file_rb():
    """Partition fake-email.eml from a binary-mode file object."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path, "rb") as handle:
        partitioned = partition_email(file=handle)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
    assert all(el.metadata.filename is None for el in partitioned)
205
206
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_file_rb_default_encoding(filename, expected_output):
    """Partition fixtures in assorted encodings from a binary-mode file object.

    Cases whose ``expected_output`` is ``None`` only require a non-empty result.
    """
    fixture_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(fixture_path, "rb") as handle:
        partitioned = partition_email(file=handle)
    assert len(partitioned) > 0
    assert not expected_output or partitioned == expected_output
    assert all(el.metadata.filename is None for el in partitioned)
231
232
def test_partition_email_from_text_file():
    """Partition fake-email.txt from a file object using the text/plain content source."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with open(txt_path) as handle:
        partitioned = partition_email(file=handle, content_source="text/plain")
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
    assert all(el.metadata.filename is None for el in partitioned)
241
242
def test_partition_email_from_text_file_with_headers():
    """With ``include_headers=True`` the header elements precede the body elements."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    options = {"content_source": "text/plain", "include_headers": True}
    with open(txt_path) as handle:
        partitioned = partition_email(file=handle, **options)
    assert len(partitioned) > 0
    assert partitioned == ALL_EXPECTED_OUTPUT
    assert all(el.metadata.filename is None for el in partitioned)
255
256
def test_partition_email_from_text_file_max():
    """With ``max_partition=20`` the text fixture is expected to yield six elements."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with open(txt_path) as handle:
        partitioned = partition_email(file=handle, content_source="text/plain", max_partition=20)
    assert len(partitioned) == 6
266
267
def test_partition_email_from_text_file_raises_value_error():
    """A ``min_partition`` of 1000 on the text fixture is expected to raise ValueError."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with pytest.raises(ValueError):
        with open(txt_path) as handle:
            partition_email(file=handle, content_source="text/plain", min_partition=1000)
272
273
def test_partition_email_from_text():
    """Partition the raw text of fake-email.eml passed through the ``text`` argument."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as handle:
        raw_message = handle.read()
    partitioned = partition_email(text=raw_message)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
    assert all(el.metadata.filename is None for el in partitioned)
283
284
def test_partition_email_from_text_work_with_empty_string():
    """An empty ``text`` argument is expected to yield an empty element list."""
    partitioned = partition_email(text="")
    assert partitioned == []
287
288
def test_partition_email_from_filename_with_embedded_image():
    """The embedded-image fixture yields an ``Image`` element among the body elements."""
    fixture_name = "fake-email-image-embedded.eml"
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, fixture_name)
    partitioned = partition_email(filename=eml_path, content_source="text/plain")
    assert len(partitioned) > 0
    assert partitioned == IMAGE_EXPECTED_OUTPUT
    assert all(el.metadata.filename == fixture_name for el in partitioned)
296
297
def test_partition_email_from_file_with_header():
    """``partition_email_header`` on a parsed message yields the received + header elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
    with open(eml_path) as handle:
        message = email.message_from_file(handle)
    header_elements = partition_email_header(message)
    assert len(header_elements) > 0
    assert header_elements == RECEIVED_HEADER_OUTPUT
    assert all(el.metadata.filename is None for el in header_elements)
307
308
def test_partition_email_from_filename_has_metadata():
    """Check the complete metadata of the first element partitioned from fake-email.eml.

    The expected metadata is rebuilt with ``ElementMetadata`` and compared through
    ``to_dict()``. ``parent_id`` is copied from the actual element so it can never
    be the cause of a mismatch.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    elements = partition_email(filename=filename)

    # Guard before indexing: on an empty result ``elements[0]`` would raise
    # IndexError and this assertion would never get the chance to report it.
    assert len(elements) > 0
    first_metadata = elements[0].metadata
    parent_id = first_metadata.parent_id

    expected_metadata = ElementMetadata(
        coordinates=None,
        filename=filename,
        last_modified="2022-12-16T17:04:16-05:00",
        page_number=None,
        url=None,
        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
        sent_to=["NotMatthew <NotMatthew@notunstructured.com>"],
        subject="Test Email",
        filetype="message/rfc822",
        parent_id=parent_id,
        languages=["eng"],
    )
    assert first_metadata.to_dict() == expected_metadata.to_dict()

    expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
    assert parse_optional_datetime(first_metadata.last_modified) == expected_dt
    for element in elements:
        assert element.metadata.filename == "fake-email.eml"
335
336
def test_extract_email_text_matches_html():
    """Elements from the text/plain part must match those from the text/html part.

    Both partitions of fake-email-attachment.eml must have the same length and be
    element-wise equal.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")
    assert len(elements_from_text) == len(elements_from_html)
    # NOTE(robinson) - checking each individually is necessary because the text/html returns
    # HTMLTitle, HTMLNarrativeText, etc
    # Compare against the HTML element: the previous version compared each text
    # element with itself, which could never fail.
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        assert text_element == html_element
        assert text_element.metadata.filename == "fake-email-attachment.eml"
347
348
def test_extract_base64_email_text_matches_html():
    """Text/plain and text/html partitions of the base64 fixture must agree element-wise."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-b64.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")
    assert len(elements_from_text) == len(elements_from_html)
    # Compare against the HTML element: the previous version compared each text
    # element with itself, which could never fail.
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        assert text_element == html_element
        assert text_element.metadata.filename == "fake-email-b64.eml"
357
358
def test_extract_attachment_info():
    """``extract_attachment_info`` returns the filename and payload of the attachment."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
    with open(eml_path) as handle:
        message = email.message_from_file(handle)
    found = extract_attachment_info(message)
    assert len(found) > 0
    assert found == ATTACH_EXPECTED_OUTPUT
366
367
def test_partition_email_raises_with_none_specified():
    """Calling ``partition_email`` with no input source must raise ValueError."""
    with pytest.raises(ValueError):
        partition_email()
371
372
def test_partition_email_raises_with_too_many_specified():
    """Supplying both ``filename`` and ``text`` must raise ValueError."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as handle:
        raw_message = handle.read()
    with pytest.raises(ValueError):
        partition_email(filename=eml_path, text=raw_message)
379
380
def test_partition_email_raises_with_invalid_content_type():
    """A ``content_source`` of application/json must be rejected with ValueError."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with pytest.raises(ValueError):
        partition_email(filename=eml_path, content_source="application/json")
385
386
def test_partition_email_processes_fake_email_with_header():
    """The fixture with a received header partitions to a non-empty, filename-tagged list."""
    fixture_name = "fake-email-header.eml"
    partitioned = partition_email(filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, fixture_name))
    assert len(partitioned) > 0
    assert all(el.metadata.filename == fixture_name for el in partitioned)
393
394
@pytest.mark.parametrize(
    ("time", "expected"),
    [
        ("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        ("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        ("Thu, 4 May 2023 02:32:49 +0000 (UTC)", "2023-05-04T02:32:49+00:00"),
        ("Thursday 5/3/2023 02:32:49", None),
    ],
)
def test_convert_to_iso_8601(time, expected):
    """``convert_to_iso_8601`` returns an ISO-8601 string, or ``None`` in the last case."""
    assert convert_to_iso_8601(time) == expected
407
408
def test_partition_email_still_works_with_no_content(caplog):
    """The no-HTML fixture yields one element and logs the fall-back message."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")
    partitioned = partition_email(filename=eml_path)
    assert len(partitioned) == 1
    (only_element,) = partitioned
    assert only_element.text.startswith("Hey there")
    assert "text/html was not found. Falling back to text/plain" in caplog.text
415
416
def test_partition_email_from_filename_exclude_metadata():
    """With ``include_metadata=False`` the first element carries no file metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
    partitioned = partition_email(filename=eml_path, include_metadata=False)
    first_metadata = partitioned[0].metadata
    assert parse_optional_datetime(first_metadata.last_modified) is None
    assert first_metadata.filetype is None
    assert first_metadata.page_name is None
    assert first_metadata.filename is None
424
425
def test_partition_email_from_text_file_exclude_metadata():
    """Same as the filename variant, but fed from a text-file object as text/plain."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with open(txt_path) as handle:
        partitioned = partition_email(
            file=handle, content_source="text/plain", include_metadata=False
        )
    first_metadata = partitioned[0].metadata
    assert parse_optional_datetime(first_metadata.last_modified) is None
    assert first_metadata.filetype is None
    assert first_metadata.page_name is None
    assert first_metadata.filename is None
438
439
def test_partition_email_from_file_exclude_metadata():
    """With ``include_metadata=False`` a file-object partition carries no file metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as handle:
        partitioned = partition_email(file=handle, include_metadata=False)
    first_metadata = partitioned[0].metadata
    assert parse_optional_datetime(first_metadata.last_modified) is None
    assert first_metadata.filetype is None
    assert first_metadata.page_name is None
    assert first_metadata.filename is None
448
449
def test_partition_email_can_process_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """Attachments are partitioned and appended after the email body elements.

    The attachment is first extracted and partitioned on its own with
    ``partition_text`` to build the metadata expected on the last element returned
    by ``partition_email(..., process_attachments=True)``.
    """
    # ``tmpdir.dirname`` is the parent of the per-test directory, so files written
    # there leak outside the directory pytest created for this test. Use the
    # per-test directory itself.
    output_dir = str(tmpdir)

    with open(filename) as f:
        msg = email.message_from_file(f)
    extract_attachment_info(msg, output_dir=output_dir)
    attachment_filename = os.path.join(
        output_dir,
        ATTACH_EXPECTED_OUTPUT[0]["filename"],
    )

    mocked_last_modification_date = "0000-00-05T09:24:28"

    attachment_elements = partition_text(
        filename=attachment_filename,
        metadata_filename=attachment_filename,
        metadata_last_modified=mocked_last_modification_date,
    )
    expected_metadata = attachment_elements[0].metadata
    expected_metadata.file_directory = None
    expected_metadata.attached_to_filename = filename

    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        metadata_last_modified=mocked_last_modification_date,
    )

    # This test does not need to validate if hierarchy is working
    # Patch to nullify parent_id
    expected_metadata.parent_id = None
    elements[-1].metadata.parent_id = None

    assert elements[0].text.startswith("Hello!")

    # Every element except the last one comes from the email body.
    for element in elements[:-1]:
        assert element.metadata.filename == "fake-email-attachment.eml"
        assert element.metadata.subject == "Fake email with attachment"

    assert elements[-1].text == "Hey this is a fake attachment!"
    assert elements[-1].metadata == expected_metadata
493
494
def test_partition_email_can_process_min_max_with_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """``min_partition``/``max_partition`` are applied to attachment elements too.

    The attachment is partitioned directly with ``partition_text`` using the same
    limits, and the last two elements of the email partition must match it.
    """
    # ``tmpdir.dirname`` is the parent of the per-test directory, so files written
    # there leak outside the directory pytest created for this test. Use the
    # per-test directory itself.
    output_dir = str(tmpdir)

    with open(filename) as f:
        msg = email.message_from_file(f)
    extract_attachment_info(msg, output_dir=output_dir)
    attachment_filename = os.path.join(
        output_dir,
        ATTACH_EXPECTED_OUTPUT[0]["filename"],
    )

    attachment_elements = partition_text(
        filename=attachment_filename,
        metadata_filename=attachment_filename,
        min_partition=6,
        max_partition=12,
    )

    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        min_partition=6,
        max_partition=12,
    )

    assert elements[0].text.startswith("Hello!")
    assert elements[-1].text == attachment_elements[-1].text
    assert elements[-2].text == attachment_elements[-2].text
    for element in elements:
        # Only elements that came from an attachment are size-checked.
        if element.metadata.attached_to_filename is not None:
            assert len(element.text) <= 12
            assert len(element.text) >= 6
529
530
def test_partition_msg_raises_with_no_partitioner(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """``process_attachments=True`` without an attachment partitioner raises ValueError."""
    options = {"filename": filename, "process_attachments": True}
    with pytest.raises(ValueError):
        partition_email(**options)
536
537
def test_partition_email_from_file_custom_metadata_date(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """``metadata_last_modified`` is reported as ``last_modified`` for file-object input."""
    custom_date = "2020-07-05T09:24:28"

    with open(filename) as handle:
        partitioned = partition_email(file=handle, metadata_last_modified=custom_date)

    assert partitioned[0].metadata.last_modified == custom_date
550
551
def test_partition_email_custom_metadata_date(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """``metadata_last_modified`` is reported as ``last_modified`` for filename input."""
    custom_date = "2020-07-05T09:24:28"
    partitioned = partition_email(filename=filename, metadata_last_modified=custom_date)
    first_element = partitioned[0]
    assert first_element.metadata.last_modified == custom_date
563
564
def test_partition_email_inline_content_disposition(
    filename="example-docs/eml/email-inline-content-disposition.eml",
):
    """The first two elements of the inline-disposition fixture are ``Text`` instances."""
    partitioned = partition_email(
        filename=filename,
        process_attachments=True,
        attachment_partitioner=partition_text,
    )

    first, second = partitioned[0], partitioned[1]
    assert isinstance(first, Text)
    assert isinstance(second, Text)
576
577
def test_partition_email_odd_attachment_filename(
    filename="example-docs/eml/email-equals-attachment-filename.eml",
):
    """An attachment filename containing ``=`` characters is preserved in the metadata."""
    partitioned = partition_email(
        filename=filename,
        process_attachments=True,
        attachment_partitioner=partition_text,
    )

    second_element = partitioned[1]
    assert second_element.metadata.filename == "odd=file=name.txt"
588
589
def test_partition_email_with_json():
    """Elements partitioned from fake-email.eml must round-trip through JSON."""
    eml_path = example_doc_path("eml/fake-email.eml")
    assert_round_trips_through_JSON(partition_email(eml_path))
593
594
def test_partition_email_with_pgp_encrypted_message(
    caplog,
    filename="example-docs/eml/fake-encrypted.eml",
):
    """The encrypted fixture yields no elements and logs an encrypted-email warning."""
    partitioned = partition_email(filename=filename)
    logged = caplog.text

    assert partitioned == []
    assert "WARNING" in logged
    assert "Encrypted email detected" in logged
604
605
def test_add_chunking_strategy_on_partition_email(
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt"),
):
    """``chunking_strategy="by_title"`` must equal ``chunk_by_title`` applied afterwards."""
    unchunked = partition_email(filename=filename)
    chunked_by_partitioner = partition_email(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(unchunked)
    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_manually
614
615
def test_partition_email_element_metadata_has_languages():
    """The first element of fake-email.eml is expected to be tagged as English."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    first_element = partition_email(filename=eml_path)[0]
    assert first_element.metadata.languages == ["eng"]
620
621
def test_partition_email_respects_languages_arg():
    """An explicit ``languages`` argument is applied to every element's metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    partitioned = partition_email(filename=eml_path, languages=["deu"])
    for el in partitioned:
        assert el.metadata.languages == ["deu"]
626
627
def test_partition_eml_respects_detect_language_per_element():
    """Per-element detection must find both English and Spanish in the mixed fixture."""
    eml_path = "example-docs/language-docs/eng_spa_mult.eml"
    partitioned = partition_email(filename=eml_path, detect_language_per_element=True)
    # languages other than English and Spanish are detected by this partitioner,
    # so this test is slightly different from the other partition tests
    detected = {el.metadata.languages[0] for el in partitioned}
    assert "eng" in detected
    assert "spa" in detected
636
637
def test_partition_eml_add_signature_to_metadata():
    """The signed fixture yields one element whose metadata carries the signature."""
    partitioned = partition_email(filename="example-docs/eml/signed-doc.p7s")
    assert len(partitioned) == 1
    (signed_element,) = partitioned
    assert signed_element.text == "This is a test"
    assert signed_element.metadata.signature == "<SIGNATURE>\n"
643