# unstructured
# 385 lines · 14.0 KB
"""Test-suite for `unstructured.partition.json` module."""
2
3from __future__ import annotations4
5import os6import pathlib7import tempfile8
9import pytest10from pytest_mock import MockFixture11
12from unstructured.documents.elements import CompositeElement13from unstructured.file_utils.filetype import FileType, detect_filetype14from unstructured.partition.email import partition_email15from unstructured.partition.html import partition_html16from unstructured.partition.json import partition_json17from unstructured.partition.text import partition_text18from unstructured.partition.xml import partition_xml19from unstructured.staging.base import elements_to_json20
# -- absolute path of the directory containing this test module; example documents are
# -- located relative to it
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# -- the presence of `/.dockerenv` is used as the marker for "running inside a container";
# -- assigned once (the original assigned the same expression twice)
is_in_docker = os.path.exists("/.dockerenv")

# -- documents, relative to the `example-docs` directory, used by the parametrized tests --
test_files = [
    "fake-text.txt",
    "fake-html.html",
    "eml/fake-email.eml",
]
33
def test_it_chunks_elements_when_a_chunking_strategy_is_specified():
    """`partition_json()` returns `CompositeElement` chunks when `chunking_strategy` is given."""
    # -- anchor the document to this module's location so the test does not depend on the
    # -- directory pytest happens to be launched from
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "spring-weather.html.json")

    chunks = partition_json(path, chunking_strategy="basic", max_characters=1500)

    assert len(chunks) == 10
    assert all(isinstance(ch, CompositeElement) for ch in chunks)
42
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename(filename: str):
    """Elements written to a JSON file are read back unchanged via `filename=`."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # -- pick the partitioner for the detected file-type; an unlisted type yields no elements --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partition = partitioners.get(detect_filetype(filename=source_path))
    elements = partition(filename=source_path) if partition is not None else []

    # -- serialize to a temporary JSON file, then partition that file --
    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        test_elements = partition_json(filename=json_path)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    assert len(elements) == len(test_elements)
    for expected, actual in zip(elements, test_elements):
        assert expected == actual
        assert expected.metadata.filename == filename.split("/")[-1]
71
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename_with_metadata_filename(filename: str):
    """With `metadata_filename="test"`, every element partitioned from a path reports it."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # -- pick the partitioner for the detected file-type; an unlisted type yields no elements --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partition = partitioners.get(detect_filetype(filename=source_path))
    elements = partition(filename=source_path) if partition is not None else []

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        test_elements = partition_json(filename=json_path, metadata_filename="test")

    assert len(test_elements) > 0
    assert len(str(test_elements[0])) > 0
    for element in test_elements:
        assert element.metadata.filename == "test"
96
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file(filename: str):
    """Elements written to a JSON file are read back unchanged via a binary `file=` handle."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # -- pick the partitioner for the detected file-type; an unlisted type yields no elements --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partition = partitioners.get(detect_filetype(filename=source_path))
    elements = partition(filename=source_path) if partition is not None else []

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path, "rb") as json_file:
            test_elements = partition_json(file=json_file)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    for expected, actual in zip(elements, test_elements):
        assert expected == actual
        assert expected.metadata.filename == filename.split("/")[-1]
125
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file_with_metadata_filename(filename: str):
    """With `metadata_filename="test"`, every element partitioned from a file reports it."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = []
    filetype = detect_filetype(filename=path)
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        _filename = os.path.basename(filename)
        test_path = os.path.join(tmpdir, _filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path, "rb") as f:
            test_elements = partition_json(file=f, metadata_filename="test")

    # -- without this guard the per-element check below would pass vacuously on an empty
    # -- result; the filename-based sibling test makes the same non-empty assertion
    assert len(test_elements) > 0
    assert all(element.metadata.filename == "test" for element in test_elements)
149
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text(filename: str):
    """Elements serialized to JSON are read back unchanged when passed as `text=`."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = []
    filetype = detect_filetype(filename=path)
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        _filename = os.path.basename(filename)
        test_path = os.path.join(tmpdir, _filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        # -- decode explicitly as UTF-8 instead of relying on the platform's locale default --
        with open(test_path, encoding="utf-8") as f:
            text = f.read()
        test_elements = partition_json(text=text)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    for expected, actual in zip(elements, test_elements):
        assert expected == actual
        assert expected.metadata.filename == filename.split("/")[-1]
179
def test_partition_json_raises_with_none_specified():
    """Calling `partition_json()` with no arguments raises `ValueError`."""
    with pytest.raises(ValueError):
        partition_json()
184
def test_partition_json_works_with_empty_string():
    """An empty `text` argument partitions to an empty list."""
    elements = partition_json(text="")

    assert elements == []
188
def test_partition_json_works_with_empty_list():
    """A `text` argument holding an empty JSON array partitions to an empty list."""
    elements = partition_json(text="[]")

    assert elements == []
192
def test_partition_json_raises_with_too_many_specified():
    """Passing more than one of `filename`, `file` and `text` raises `ValueError`."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    # -- the source is a fixed plain-text document, so the HTML/XML/EML dispatch branches of
    # -- the original could never be taken; partition it as text directly
    elements = partition_text(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, "fake-text.txt.json")
        elements_to_json(elements, filename=test_path, indent=2)

        # -- keep the handle open for every call that receives it: reading from a closed file
        # -- also raises `ValueError`, which would let these checks pass for the wrong reason
        with open(test_path, "rb") as f:
            text = f.read().decode("utf-8")

            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, text=text)

            with pytest.raises(ValueError):
                partition_json(file=f, text=text)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f, text=text)
224
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename_exclude_metadata(filename: str):
    """With `include_metadata=False`, elements partitioned from a path carry no metadata."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = []
    filetype = detect_filetype(filename=path)
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        _filename = os.path.basename(filename)
        test_path = os.path.join(tmpdir, _filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        test_elements = partition_json(filename=test_path, include_metadata=False)

    # -- guard against the metadata check passing vacuously on an empty result --
    assert len(test_elements) > 0
    # -- compare against an empty dict directly; `any(d) is False` only inspects key truthiness --
    for element in test_elements:
        assert element.metadata.to_dict() == {}
248
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file_exclude_metadata(filename: str):
    """With `include_metadata=False`, elements partitioned from a file carry no metadata."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = []
    filetype = detect_filetype(filename=path)
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        _filename = os.path.basename(filename)
        test_path = os.path.join(tmpdir, _filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path, "rb") as f:
            test_elements = partition_json(file=f, include_metadata=False)

    # -- guard against the metadata check passing vacuously on an empty result --
    assert len(test_elements) > 0
    # -- compare against an empty dict directly; `any(d) is False` only inspects key truthiness --
    for element in test_elements:
        assert element.metadata.to_dict() == {}
273
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text_exclude_metadata(filename: str):
    """With `include_metadata=False`, elements partitioned from text carry no metadata."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = []
    filetype = detect_filetype(filename=path)
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        _filename = os.path.basename(filename)
        test_path = os.path.join(tmpdir, _filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        # -- decode explicitly as UTF-8 instead of relying on the platform's locale default --
        with open(test_path, encoding="utf-8") as f:
            text = f.read()
        test_elements = partition_json(text=text, include_metadata=False)

    # -- guard against the metadata check passing vacuously on an empty result --
    assert len(test_elements) > 0
    # -- compare against an empty dict directly; `any(d) is False` only inspects key truthiness --
    for element in test_elements:
        assert element.metadata.to_dict() == {}
299
def test_partition_json_metadata_date(mocker: MockFixture):
    """`last_modified` takes the value returned by the patched `get_last_modified_date`."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    # -- anchor the document to this module's location so the test does not depend on the
    # -- directory pytest happens to be launched from
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "spring-weather.html.json")

    elements = partition_json(path)

    assert elements[0].metadata.last_modified == mocked_last_modification_date
311
def test_partition_json_with_custom_metadata_date(mocker: MockFixture):
    """An explicit `metadata_last_modified` wins over the patched `get_last_modified_date`."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    # -- anchor the document to this module's location so the test does not depend on the
    # -- directory pytest happens to be launched from
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "spring-weather.html.json")

    elements = partition_json(path, metadata_last_modified=expected_last_modification_date)

    assert elements[0].metadata.last_modified == expected_last_modification_date
327
def test_partition_json_from_file_metadata_date(mocker: MockFixture):
    """`last_modified` takes the value returned by the patched `get_last_modified_date_from_file`."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    # -- anchor the document to this module's location so the test does not depend on the
    # -- directory pytest happens to be launched from
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "spring-weather.html.json")

    with open(path, "rb") as f:
        elements = partition_json(file=f)

    assert elements[0].metadata.last_modified == mocked_last_modification_date
340
def test_partition_json_from_file_with_custom_metadata_date(mocker: MockFixture):
    """An explicit `metadata_last_modified` wins over `get_last_modified_date_from_file`."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    # -- anchor the document to this module's location so the test does not depend on the
    # -- directory pytest happens to be launched from
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "spring-weather.html.json")

    with open(path, "rb") as f:
        elements = partition_json(file=f, metadata_last_modified=expected_last_modification_date)

    assert elements[0].metadata.last_modified == expected_last_modification_date
354
def test_partition_json_from_text_metadata_date():
    """`last_modified` is `None` when the JSON is supplied as `text=`."""
    # -- anchor the document to this module's location so the test does not depend on the
    # -- directory pytest happens to be launched from
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "spring-weather.html.json")
    # -- decode explicitly as UTF-8 instead of relying on the platform's locale default --
    with open(path, encoding="utf-8") as f:
        text = f.read()

    elements = partition_json(text=text)

    assert elements[0].metadata.last_modified is None
363
def test_partition_json_from_text_with_custom_metadata_date():
    """An explicit `metadata_last_modified` is applied when the JSON is supplied as `text=`."""
    expected_last_modification_date = "2020-07-05T09:24:28"
    # -- anchor the document to this module's location so the test does not depend on the
    # -- directory pytest happens to be launched from
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "spring-weather.html.json")
    # -- decode explicitly as UTF-8 instead of relying on the platform's locale default --
    with open(path, encoding="utf-8") as f:
        text = f.read()

    elements = partition_json(text=text, metadata_last_modified=expected_last_modification_date)

    assert elements[0].metadata.last_modified == expected_last_modification_date
373
def test_partition_json_raises_with_unprocessable_json():
    """Well-formed JSON that is not a list of dicts raises `ValueError`."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    not_a_list_of_dicts = '{"hi": "there"}'

    with pytest.raises(ValueError):
        partition_json(text=not_a_list_of_dicts)
381
def test_partition_json_raises_with_invalid_json():
    """Malformed JSON (here, a stray closing bracket) raises `ValueError`."""
    malformed_json = '[{"hi": "there"}]]'

    with pytest.raises(ValueError):
        partition_json(text=malformed_json)