unstructured

Форк
0
642 строки · 22.4 Кб
1
import datetime
2
import email
3
import os
4
import pathlib
5

6
import pytest
7

8
from test_unstructured.unit_utils import (
9
    assert_round_trips_through_JSON,
10
    example_doc_path,
11
    parse_optional_datetime,
12
)
13
from unstructured.chunking.title import chunk_by_title
14
from unstructured.documents.elements import (
15
    ElementMetadata,
16
    Image,
17
    ListItem,
18
    NarrativeText,
19
    Text,
20
    Title,
21
)
22
from unstructured.documents.email_elements import (
23
    MetaData,
24
    ReceivedInfo,
25
    Recipient,
26
    Sender,
27
    Subject,
28
)
29
from unstructured.partition.email import (
30
    convert_to_iso_8601,
31
    extract_attachment_info,
32
    partition_email,
33
    partition_email_header,
34
)
35
from unstructured.partition.text import partition_text
36

37
# Directory holding this test module, and the example-docs "eml" folder located relative to it.
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml")

# Body elements the tests expect for the plain fake email.
EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

# Body elements the tests expect for the fake email with an embedded image.
IMAGE_EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    NarrativeText(text="hello this is our logo."),
    Image(text="unstructured_logo.png"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

# Header elements expected when the message carries a "Received" header.
RECEIVED_HEADER_OUTPUT = [
    ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
    ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
    ReceivedInfo(
        name="received_datetimetz",
        text="2023-02-20 10:03:18+12:00",
        # 12 hours == 43200 seconds, i.e. the +12:00 offset shown in ``text``.
        datestamp=datetime.datetime(
            2023, 2, 20, 10, 3, 18,
            tzinfo=datetime.timezone(datetime.timedelta(hours=12)),
        ),
    ),
    MetaData(name="MIME-Version", text="1.0"),
    MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
    MetaData(
        name="Message-ID",
        text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
    ),
    Subject(text="Test Email"),
    Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    MetaData(
        name="Content-Type",
        text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
    ),
]

# Header elements expected for the plain fake email (no "Received" entries).
HEADER_EXPECTED_OUTPUT = [
    MetaData(name="MIME-Version", text="1.0"),
    MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
    MetaData(
        name="Message-ID",
        text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
    ),
    Subject(text="Test Email"),
    Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    MetaData(
        name="Content-Type",
        text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
    ),
]

# Headers followed by the body, as expected when headers are included.
ALL_EXPECTED_OUTPUT = [*HEADER_EXPECTED_OUTPUT, *EXPECTED_OUTPUT]

# Attachment info expected from the fake email with an attachment.
ATTACH_EXPECTED_OUTPUT = [
    {"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"},
]
109

110

111
def test_partition_email_from_filename():
    """Partitioning by path yields the expected body, tagged with the bare file name."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    parsed = partition_email(filename=eml_path)
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
    assert all(item.metadata.filename == "fake-email.eml" for item in parsed)
118

119

120
def test_partition_email_from_filename_with_metadata_filename():
    """An explicit ``metadata_filename`` is what every element reports as its filename."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    parsed = partition_email(filename=eml_path, metadata_filename="test")
    assert len(parsed) > 0
    for item in parsed:
        assert item.metadata.filename == "test"
125

126

127
def test_partition_email_from_filename_malformed_encoding():
    """The malformed-encoding fixture still partitions into the expected body elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-malformed-encoding.eml")
    parsed = partition_email(filename=eml_path)
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
132

133

134
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("fake-email-b64.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_filename_default_encoding(filename, expected_output):
    """Each fixture partitions by path; contents are compared only when an expectation is given."""
    parsed = partition_email(filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, filename))
    assert len(parsed) > 0
    if expected_output:
        assert parsed == expected_output
    assert all(item.metadata.filename == filename for item in parsed)
158

159

160
def test_partition_email_from_file():
    """Partitioning a text-mode file object gives the expected body and no filename metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as handle:
        parsed = partition_email(file=handle)
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
    assert all(item.metadata.filename is None for item in parsed)
168

169

170
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("fake-email-b64.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_file_default_encoding(filename, expected_output):
    """Each fixture partitions from a text-mode file object opened with default settings."""
    with open(os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)) as handle:
        parsed = partition_email(file=handle)
    assert len(parsed) > 0
    if expected_output:
        assert parsed == expected_output
    assert all(item.metadata.filename is None for item in parsed)
195

196

197
def test_partition_email_from_file_rb():
    """Partitioning a binary-mode file object gives the expected body and no filename metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path, "rb") as handle:
        parsed = partition_email(file=handle)
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
    assert all(item.metadata.filename is None for item in parsed)
205

206

207
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_file_rb_default_encoding(filename, expected_output):
    """Each fixture partitions from a binary-mode file object."""
    with open(os.path.join(EXAMPLE_DOCS_DIRECTORY, filename), "rb") as handle:
        parsed = partition_email(file=handle)
    assert len(parsed) > 0
    if expected_output:
        assert parsed == expected_output
    assert all(item.metadata.filename is None for item in parsed)
231

232

233
def test_partition_email_from_text_file():
    """A ``.txt`` copy of the email partitions via the text/plain content source."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with open(txt_path) as handle:
        parsed = partition_email(file=handle, content_source="text/plain")
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
    assert all(item.metadata.filename is None for item in parsed)
241

242

243
def test_partition_email_from_text_file_with_headers():
    """With ``include_headers=True`` the result is the header elements followed by the body."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    options = {"content_source": "text/plain", "include_headers": True}
    with open(txt_path) as handle:
        parsed = partition_email(file=handle, **options)
    assert len(parsed) > 0
    assert parsed == ALL_EXPECTED_OUTPUT
    assert all(item.metadata.filename is None for item in parsed)
255

256

257
def test_partition_email_from_text_file_max():
    """With ``max_partition=20`` the text email is split into six elements."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with open(txt_path) as handle:
        parsed = partition_email(file=handle, content_source="text/plain", max_partition=20)
    assert len(parsed) == 6
266

267

268
def test_partition_email_from_text_file_raises_value_error():
    """Passing ``min_partition=1000`` for the text email raises ``ValueError``."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with pytest.raises(ValueError), open(txt_path) as handle:
        partition_email(
            file=handle,
            content_source="text/plain",
            min_partition=1000,
        )
272

273

274
def test_partition_email_from_text():
    """Partitioning the raw message text gives the expected body and no filename metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as handle:
        raw_message = handle.read()
    parsed = partition_email(text=raw_message)
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
    assert all(item.metadata.filename is None for item in parsed)
283

284

285
def test_partition_email_from_text_work_with_empty_string():
    """An empty text input partitions to an empty list."""
    parsed = partition_email(text="")
    assert parsed == []
287

288

289
def test_partition_email_from_filename_with_embedded_image():
    """The embedded-image fixture yields the expected elements, including the Image entry."""
    fixture_name = "fake-email-image-embedded.eml"
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, fixture_name)
    parsed = partition_email(filename=eml_path, content_source="text/plain")
    assert len(parsed) > 0
    assert parsed == IMAGE_EXPECTED_OUTPUT
    assert all(item.metadata.filename == fixture_name for item in parsed)
296

297

298
def test_partition_email_from_file_with_header():
    """``partition_email_header`` on a parsed message returns the expected header elements."""
    header_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
    with open(header_path) as handle:
        message = email.message_from_file(handle)
    header_elements = partition_email_header(message)
    assert len(header_elements) > 0
    assert header_elements == RECEIVED_HEADER_OUTPUT
    assert all(item.metadata.filename is None for item in header_elements)
307

308

309
def test_partition_email_from_filename_has_metadata():
    """Check the metadata of the first element produced from ``fake-email.eml``.

    The first element's metadata dict must equal that of an ``ElementMetadata``
    built with the expected sender, recipient, subject, date, file type and
    languages, and every element must report the bare file name.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    elements = partition_email(filename=filename)

    # Guard against an empty result *before* indexing, so an empty list fails
    # with a clear assertion rather than an IndexError.
    assert len(elements) > 0

    # parent_id is not under test here; reuse the actual value in the expectation.
    parent_id = elements[0].metadata.parent_id

    assert (
        elements[0].metadata.to_dict()
        == ElementMetadata(
            coordinates=None,
            filename=filename,
            last_modified="2022-12-16T17:04:16-05:00",
            page_number=None,
            url=None,
            sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
            sent_to=["NotMatthew <NotMatthew@notunstructured.com>"],
            subject="Test Email",
            filetype="message/rfc822",
            parent_id=parent_id,
            languages=["eng"],
        ).to_dict()
    )
    expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
    assert parse_optional_datetime(elements[0].metadata.last_modified) == expected_dt
    for element in elements:
        assert element.metadata.filename == "fake-email.eml"
335

336

337
def test_extract_email_text_matches_html():
    """The text/plain and text/html parts of the attachment email partition equivalently.

    Both content sources must produce the same number of elements, each pair
    must compare equal, and every text/plain element must carry the fixture's
    file name.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")
    assert len(elements_from_text) == len(elements_from_html)
    # NOTE(robinson) - checking each individually is necessary because the text/html returns
    # HTMLTitle, HTMLNarrativeText, etc
    # Compare against the HTML-derived element; the previous code compared each
    # text/plain element with itself, which could never fail.
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        assert text_element == html_element
        assert text_element.metadata.filename == "fake-email-attachment.eml"
347

348

349
def test_extract_base64_email_text_matches_html():
    """The text/plain and text/html parts of the base64 email partition equivalently.

    Both content sources must produce the same number of elements, each pair
    must compare equal, and every text/plain element must carry the fixture's
    file name.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-b64.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")
    assert len(elements_from_text) == len(elements_from_html)
    # Compare against the HTML-derived element; the previous code compared each
    # text/plain element with itself, which could never fail.
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        assert text_element == html_element
        assert text_element.metadata.filename == "fake-email-b64.eml"
357

358

359
def test_extract_attachment_info():
    """``extract_attachment_info`` returns the expected filename/payload records."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
    with open(eml_path) as handle:
        message = email.message_from_file(handle)
    found = extract_attachment_info(message)
    assert len(found) > 0
    assert found == ATTACH_EXPECTED_OUTPUT
366

367

368
def test_partition_email_raises_with_none_specified():
    """Calling ``partition_email`` with no input at all raises ``ValueError``."""
    with pytest.raises(ValueError):
        partition_email()
371

372

373
def test_partition_email_raises_with_too_many_specified():
    """Supplying both ``filename`` and ``text`` raises ``ValueError``."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as handle:
        raw_message = handle.read()
    with pytest.raises(ValueError):
        partition_email(filename=eml_path, text=raw_message)
379

380

381
def test_partition_email_raises_with_invalid_content_type():
    """A ``content_source`` of ``application/json`` raises ``ValueError``."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with pytest.raises(ValueError):
        partition_email(
            filename=eml_path,
            content_source="application/json",
        )
385

386

387
def test_partition_email_processes_fake_email_with_header():
    """The header fixture partitions to a non-empty result tagged with its file name."""
    fixture_name = "fake-email-header.eml"
    parsed = partition_email(filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, fixture_name))
    assert len(parsed) > 0
    assert all(item.metadata.filename == fixture_name for item in parsed)
393

394

395
@pytest.mark.parametrize(
    ("time", "expected"),
    [
        ("Thu,  4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        ("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        ("Thu, 4 May 2023 02:32:49 +0000 (UTC)", "2023-05-04T02:32:49+00:00"),
        ("Thursday 5/3/2023 02:32:49", None),
    ],
)
def test_convert_to_iso_8601(time, expected):
    """``convert_to_iso_8601`` returns the expected ISO string, or ``None`` for the last case."""
    assert convert_to_iso_8601(time) == expected
407

408

409
def test_partition_email_still_works_with_no_content(caplog):
    """The no-HTML fixture yields one element and logs the text/plain fallback message."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")
    parsed = partition_email(filename=eml_path)
    assert len(parsed) == 1
    only_element = parsed[0]
    assert only_element.text.startswith("Hey there")
    assert "text/html was not found. Falling back to text/plain" in caplog.text
415

416

417
def test_partition_email_from_filename_exclude_metadata():
    """With ``include_metadata=False`` the first element's checked metadata fields are unset."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
    parsed = partition_email(filename=eml_path, include_metadata=False)
    first_metadata = parsed[0].metadata
    assert parse_optional_datetime(first_metadata.last_modified) is None
    assert first_metadata.filetype is None
    assert first_metadata.page_name is None
    assert first_metadata.filename is None
424

425

426
def test_partition_email_from_text_file_exclude_metadata():
    """Text-file input with ``include_metadata=False`` leaves the checked fields unset."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with open(txt_path) as handle:
        parsed = partition_email(file=handle, content_source="text/plain", include_metadata=False)
    first_metadata = parsed[0].metadata
    assert parse_optional_datetime(first_metadata.last_modified) is None
    assert first_metadata.filetype is None
    assert first_metadata.page_name is None
    assert first_metadata.filename is None
438

439

440
def test_partition_email_from_file_exclude_metadata():
    """File-object input with ``include_metadata=False`` leaves the checked fields unset."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as handle:
        parsed = partition_email(file=handle, include_metadata=False)
    first_metadata = parsed[0].metadata
    assert parse_optional_datetime(first_metadata.last_modified) is None
    assert first_metadata.filetype is None
    assert first_metadata.page_name is None
    assert first_metadata.filename is None
448

449

450
def test_partition_email_can_process_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """Attachments are partitioned and appended after the email body elements.

    The attachment is first extracted and partitioned on its own to build the
    expected metadata; the email is then partitioned with
    ``process_attachments=True`` and its last element is compared against it.

    Args:
        tmpdir: pytest temporary-directory fixture.
        filename: path of the email fixture, relative to the working directory.
    """
    # Extract into this test's own temporary directory. ``tmpdir.dirname`` is
    # the *parent* directory, which would leak files outside the per-test dir.
    output_dir = str(tmpdir)
    with open(filename) as f:
        msg = email.message_from_file(f)
    extract_attachment_info(msg, output_dir=output_dir)
    attachment_filename = os.path.join(
        output_dir,
        ATTACH_EXPECTED_OUTPUT[0]["filename"],
    )

    mocked_last_modification_date = "0000-00-05T09:24:28"

    attachment_elements = partition_text(
        filename=attachment_filename,
        metadata_filename=attachment_filename,
        metadata_last_modified=mocked_last_modification_date,
    )
    expected_metadata = attachment_elements[0].metadata
    expected_metadata.file_directory = None
    expected_metadata.attached_to_filename = filename

    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        metadata_last_modified=mocked_last_modification_date,
    )

    # This test does not need to validate if hierarchy is working
    # Patch to nullify parent_id
    expected_metadata.parent_id = None
    elements[-1].metadata.parent_id = None

    assert elements[0].text.startswith("Hello!")

    # Every element except the trailing attachment element belongs to the email itself.
    for element in elements[:-1]:
        assert element.metadata.filename == "fake-email-attachment.eml"
        assert element.metadata.subject == "Fake email with attachment"

    assert elements[-1].text == "Hey this is a fake attachment!"
    assert elements[-1].metadata == expected_metadata
493

494

495
def test_partition_email_can_process_min_max_with_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """``min_partition``/``max_partition`` are honoured for attachment elements.

    The attachment is partitioned on its own with the same limits to obtain the
    expected trailing elements; every element attached to the email must then
    have a text length between 6 and 12 characters inclusive.

    Args:
        tmpdir: pytest temporary-directory fixture.
        filename: path of the email fixture, relative to the working directory.
    """
    # Extract into this test's own temporary directory. ``tmpdir.dirname`` is
    # the *parent* directory, which would leak files outside the per-test dir.
    output_dir = str(tmpdir)
    with open(filename) as f:
        msg = email.message_from_file(f)
    extract_attachment_info(msg, output_dir=output_dir)
    attachment_filename = os.path.join(
        output_dir,
        ATTACH_EXPECTED_OUTPUT[0]["filename"],
    )

    attachment_elements = partition_text(
        filename=attachment_filename,
        metadata_filename=attachment_filename,
        min_partition=6,
        max_partition=12,
    )

    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        min_partition=6,
        max_partition=12,
    )

    assert elements[0].text.startswith("Hello!")
    assert elements[-1].text == attachment_elements[-1].text
    assert elements[-2].text == attachment_elements[-2].text
    for element in elements:
        # Only elements that came from an attachment are subject to the limits here.
        if element.metadata.attached_to_filename is not None:
            assert len(element.text) <= 12
            assert len(element.text) >= 6
529

530

531
def test_partition_msg_raises_with_no_partitioner(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """``process_attachments=True`` without an attachment partitioner raises ``ValueError``."""
    request = {"filename": filename, "process_attachments": True}
    with pytest.raises(ValueError):
        partition_email(**request)
536

537

538
def test_partition_email_from_file_custom_metadata_date(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """A ``metadata_last_modified`` passed with a file object is reported on the first element."""
    custom_date = "2020-07-05T09:24:28"
    with open(filename) as handle:
        parsed = partition_email(file=handle, metadata_last_modified=custom_date)
    assert parsed[0].metadata.last_modified == custom_date
550

551

552
def test_partition_email_custom_metadata_date(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """A ``metadata_last_modified`` passed with a path is reported on the first element."""
    custom_date = "2020-07-05T09:24:28"
    parsed = partition_email(filename=filename, metadata_last_modified=custom_date)
    assert parsed[0].metadata.last_modified == custom_date
563

564

565
def test_partition_email_inline_content_disposition(
    filename="example-docs/eml/email-inline-content-disposition.eml",
):
    """The first two elements of the inline-disposition fixture are ``Text`` instances."""
    parsed = partition_email(
        filename=filename,
        process_attachments=True,
        attachment_partitioner=partition_text,
    )
    for position in (0, 1):
        assert isinstance(parsed[position], Text)
576

577

578
def test_partition_email_odd_attachment_filename(
    filename="example-docs/eml/email-equals-attachment-filename.eml",
):
    """An attachment name containing ``=`` characters is preserved in the element metadata."""
    parsed = partition_email(
        filename=filename,
        process_attachments=True,
        attachment_partitioner=partition_text,
    )
    second_element = parsed[1]
    assert second_element.metadata.filename == "odd=file=name.txt"
588

589

590
def test_partition_email_with_json():
    """Elements from the fake email are passed to the JSON round-trip assertion helper."""
    eml_path = example_doc_path("eml/fake-email.eml")
    assert_round_trips_through_JSON(partition_email(eml_path))
593

594

595
def test_partition_email_with_pgp_encrypted_message(
    caplog,
    filename="example-docs/eml/fake-encrypted.eml",
):
    """The encrypted fixture partitions to nothing and the expected warning text is logged."""
    parsed = partition_email(filename=filename)
    assert parsed == []

    captured_log = caplog.text
    assert "WARNING" in captured_log
    assert "Encrypted email detected" in captured_log
604

605

606
def test_add_chunking_strategy_on_partition_email(
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt"),
):
    """``chunking_strategy="by_title"`` matches calling ``chunk_by_title`` on the plain result."""
    plain_elements = partition_email(filename=filename)
    chunked_by_partitioner = partition_email(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(plain_elements)
    assert chunked_by_partitioner != plain_elements
    assert chunked_by_partitioner == chunked_manually
614

615

616
def test_partition_email_element_metadata_has_languages():
    """The first element of the fake email reports ``["eng"]`` as its languages."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    first_element = partition_email(filename=eml_path)[0]
    assert first_element.metadata.languages == ["eng"]
620

621

622
def test_partition_email_respects_languages_arg():
    """An explicit ``languages`` argument is reported on every element."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    parsed = partition_email(filename=eml_path, languages=["deu"])
    # The loop is vacuously satisfied for an empty result, exactly like all().
    for item in parsed:
        assert item.metadata.languages == ["deu"]
626

627

628
def test_partition_eml_respects_detect_language_per_element():
    """Per-element language detection finds both English and Spanish in the mixed fixture."""
    parsed = partition_email(
        filename="example-docs/language-docs/eng_spa_mult.eml",
        detect_language_per_element=True,
    )
    # languages other than English and Spanish are detected by this partitioner,
    # so this test is slightly different from the other partition tests
    primary_languages = set()
    for item in parsed:
        primary_languages.add(item.metadata.languages[0])
    assert "eng" in primary_languages
    assert "spa" in primary_languages
636

637

638
def test_partition_eml_add_signature_to_metadata():
    """The signed fixture yields one element whose metadata carries the signature text."""
    parsed = partition_email(filename="example-docs/eml/signed-doc.p7s")
    assert len(parsed) == 1
    only_element = parsed[0]
    assert only_element.text == "This is a test"
    assert only_element.metadata.signature == "<SIGNATURE>\n"
643

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.