unstructured

Форк
0
385 строк · 14.0 Кб
1
"""Test-suite for `unstructured.partition.json` module."""
2

3
from __future__ import annotations
4

5
import os
6
import pathlib
7
import tempfile
8

9
import pytest
10
from pytest_mock import MockFixture
11

12
from unstructured.documents.elements import CompositeElement
13
from unstructured.file_utils.filetype import FileType, detect_filetype
14
from unstructured.partition.email import partition_email
15
from unstructured.partition.html import partition_html
16
from unstructured.partition.json import partition_json
17
from unstructured.partition.text import partition_text
18
from unstructured.partition.xml import partition_xml
19
from unstructured.staging.base import elements_to_json
20

21
# Absolute path of the directory containing this test module; example documents are located
# relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# Whether the `/.dockerenv` marker file exists, taken as a sign of running inside Docker.
is_in_docker = os.path.exists("/.dockerenv")

# Example documents (paths relative to the `example-docs` directory) that the parametrized
# tests partition into elements before serializing them to JSON.
test_files = [
    "fake-text.txt",
    "fake-html.html",
    "eml/fake-email.eml",
]
32

33

34
def test_it_chunks_elements_when_a_chunking_strategy_is_specified():
    """Passing `chunking_strategy` makes `partition_json` return `CompositeElement` chunks."""
    json_path = "example-docs/spring-weather.html.json"

    chunks = partition_json(json_path, chunking_strategy="basic", max_characters=1500)

    assert len(chunks) == 10
    for chunk in chunks:
        assert isinstance(chunk, CompositeElement)
41

42

43
@pytest.mark.parametrize("filename", test_files)
44
def test_partition_json_from_filename(filename: str):
    """Elements written to a JSON file equal those read back via `partition_json(filename=...)`.

    The example document is partitioned with the partitioner matching its detected file type,
    serialized with `elements_to_json`, and re-read from the temporary JSON file.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(detect_filetype(filename=source_path))
    # An unrecognized file type leaves the element list empty, which fails the checks below.
    elements = [] if partitioner is None else partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, f"{os.path.basename(filename)}.json")
        elements_to_json(elements, filename=json_path, indent=2)
        test_elements = partition_json(filename=json_path)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, reloaded in zip(elements, test_elements):
        assert original == reloaded
        assert original.metadata.filename == expected_filename
70

71

72
@pytest.mark.parametrize("filename", test_files)
73
def test_partition_json_from_filename_with_metadata_filename(filename: str):
    """Every element read from a JSON file path reports `metadata_filename` as its filename."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(detect_filetype(filename=source_path))
    elements = [] if partitioner is None else partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, f"{os.path.basename(filename)}.json")
        elements_to_json(elements, filename=json_path, indent=2)
        test_elements = partition_json(filename=json_path, metadata_filename="test")

    assert len(test_elements) > 0
    assert len(str(test_elements[0])) > 0
    for element in test_elements:
        assert element.metadata.filename == "test"
95

96

97
@pytest.mark.parametrize("filename", test_files)
98
def test_partition_json_from_file(filename: str):
    """Elements written to a JSON file equal those read back via `partition_json(file=...)`.

    The serialized elements are re-read from a file object opened in binary mode.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(detect_filetype(filename=source_path))
    # An unrecognized file type leaves the element list empty, which fails the checks below.
    elements = [] if partitioner is None else partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, f"{os.path.basename(filename)}.json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path, "rb") as json_file:
            test_elements = partition_json(file=json_file)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, reloaded in zip(elements, test_elements):
        assert original == reloaded
        assert original.metadata.filename == expected_filename
124

125

126
@pytest.mark.parametrize("filename", test_files)
127
def test_partition_json_from_file_with_metadata_filename(filename: str):
    """Every element read from an open JSON file reports `metadata_filename` as its filename.

    The example document is partitioned with the partitioner matching its detected file type,
    serialized to a temporary JSON file, and read back through
    `partition_json(file=..., metadata_filename="test")`.
    """
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(detect_filetype(filename=path))
    elements = [] if partitioner is None else partitioner(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path, "rb") as f:
            test_elements = partition_json(file=f, metadata_filename="test")

    # Guard against a vacuous pass: the per-element check is trivially true for an empty result.
    assert len(test_elements) > 0
    assert all(element.metadata.filename == "test" for element in test_elements)
148

149

150
@pytest.mark.parametrize("filename", test_files)
151
def test_partition_json_from_text(filename: str):
    """Elements written to JSON equal those read back via `partition_json(text=...)`.

    The serialized elements are read from the temporary file as a string and passed as `text`.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(detect_filetype(filename=source_path))
    # An unrecognized file type leaves the element list empty, which fails the checks below.
    elements = [] if partitioner is None else partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, f"{os.path.basename(filename)}.json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path) as json_file:
            json_text = json_file.read()
        test_elements = partition_json(text=json_text)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, reloaded in zip(elements, test_elements):
        assert original == reloaded
        assert original.metadata.filename == expected_filename
178

179

180
def test_partition_json_raises_with_none_specified():
    """Calling `partition_json` with no arguments at all raises `ValueError`."""
    with pytest.raises(ValueError):
        # Neither a filename, a file object, nor text is supplied here.
        partition_json()
183

184

185
def test_partition_json_works_with_empty_string():
    """An empty `text` argument yields an empty element list."""
    elements = partition_json(text="")

    assert elements == []
187

188

189
def test_partition_json_works_with_empty_list():
    """JSON text holding an empty list yields an empty element list."""
    elements = partition_json(text="[]")

    assert elements == []
191

192

193
def test_partition_json_raises_with_too_many_specified():
    """`partition_json` raises `ValueError` when more than one of filename/file/text is given.

    Each combination is checked while the JSON file still exists on disk and its handle is
    still open. An operation on a closed file raises `ValueError` by itself, so performing
    these checks after the `with` blocks exit could let the test pass without any argument
    validation taking place.
    """
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    # The fixture is a hard-coded `.txt` document, so it is partitioned as text directly.
    elements = partition_text(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, "fake-text.txt.json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path, "rb") as f:
            text = f.read().decode("utf-8")

            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, text=text)

            with pytest.raises(ValueError):
                partition_json(file=f, text=text)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f, text=text)
223

224

225
@pytest.mark.parametrize("filename", test_files)
226
def test_partition_json_from_filename_exclude_metadata(filename: str):
    """Elements read from a JSON file path with `include_metadata=False` carry no metadata.

    The example document is partitioned with the partitioner matching its detected file type,
    serialized to a temporary JSON file, and read back through
    `partition_json(filename=..., include_metadata=False)`.
    """
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(detect_filetype(filename=path))
    elements = [] if partitioner is None else partitioner(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        test_elements = partition_json(filename=test_path, include_metadata=False)

    # Guard against a vacuous pass: the per-element check is trivially true for an empty result.
    assert len(test_elements) > 0
    for element in test_elements:
        assert not any(element.metadata.to_dict())
247

248

249
@pytest.mark.parametrize("filename", test_files)
250
def test_partition_json_from_file_exclude_metadata(filename: str):
    """Elements read from an open JSON file with `include_metadata=False` carry no metadata.

    The example document is partitioned with the partitioner matching its detected file type,
    serialized to a temporary JSON file, and read back through
    `partition_json(file=..., include_metadata=False)`.
    """
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(detect_filetype(filename=path))
    elements = [] if partitioner is None else partitioner(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path, "rb") as f:
            test_elements = partition_json(file=f, include_metadata=False)

    # Guard against a vacuous pass: the per-element check is trivially true for an empty result.
    assert len(test_elements) > 0
    for element in test_elements:
        assert not any(element.metadata.to_dict())
272

273

274
@pytest.mark.parametrize("filename", test_files)
275
def test_partition_json_from_text_exclude_metadata(filename: str):
    """Elements read from JSON text with `include_metadata=False` carry no metadata.

    The example document is partitioned with the partitioner matching its detected file type,
    serialized to a temporary JSON file, and the file's contents are passed to
    `partition_json(text=..., include_metadata=False)`.
    """
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(detect_filetype(filename=path))
    elements = [] if partitioner is None else partitioner(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path) as f:
            text = f.read()
        test_elements = partition_json(text=text, include_metadata=False)

    # Guard against a vacuous pass: the per-element check is trivially true for an empty result.
    assert len(test_elements) > 0
    for element in test_elements:
        assert not any(element.metadata.to_dict())
298

299

300
def test_partition_json_metadata_date(mocker: MockFixture):
    """`last_modified` equals the value returned by the patched `get_last_modified_date`."""
    patched_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.json.get_last_modified_date"
    mocker.patch(patch_target, return_value=patched_date)

    elements = partition_json("example-docs/spring-weather.html.json")

    first_element = elements[0]
    assert first_element.metadata.last_modified == patched_date
310

311

312
def test_partition_json_with_custom_metadata_date(mocker: MockFixture):
    """An explicit `metadata_last_modified` is reported rather than the patched file date."""
    patched_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    patch_target = "unstructured.partition.json.get_last_modified_date"
    mocker.patch(patch_target, return_value=patched_date)

    elements = partition_json(
        "example-docs/spring-weather.html.json", metadata_last_modified=custom_date
    )

    first_element = elements[0]
    assert first_element.metadata.last_modified == custom_date
326

327

328
def test_partition_json_from_file_metadata_date(mocker: MockFixture):
    """With a file object, `last_modified` equals the patched `get_last_modified_date_from_file`."""
    patched_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.json.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=patched_date)

    with open("example-docs/spring-weather.html.json", "rb") as json_file:
        elements = partition_json(file=json_file)

    first_element = elements[0]
    assert first_element.metadata.last_modified == patched_date
339

340

341
def test_partition_json_from_file_with_custom_metadata_date(mocker: MockFixture):
    """With a file object, an explicit `metadata_last_modified` wins over the patched date."""
    patched_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    patch_target = "unstructured.partition.json.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=patched_date)

    with open("example-docs/spring-weather.html.json", "rb") as json_file:
        elements = partition_json(file=json_file, metadata_last_modified=custom_date)

    first_element = elements[0]
    assert first_element.metadata.last_modified == custom_date
353

354

355
def test_partition_json_from_text_metadata_date():
    """Elements partitioned from JSON text have `last_modified` set to `None`."""
    json_text = pathlib.Path("example-docs/spring-weather.html.json").read_text()

    elements = partition_json(text=json_text)

    first_element = elements[0]
    assert first_element.metadata.last_modified is None
362

363

364
def test_partition_json_from_text_with_custom_metadata_date():
    """Elements partitioned from JSON text report an explicit `metadata_last_modified`."""
    custom_date = "2020-07-05T09:24:28"
    json_text = pathlib.Path("example-docs/spring-weather.html.json").read_text()

    elements = partition_json(text=json_text, metadata_last_modified=custom_date)

    first_element = elements[0]
    assert first_element.metadata.last_modified == custom_date
372

373

374
def test_partition_json_raises_with_unprocessable_json():
    """Valid JSON that is a single object instead of a list is rejected with `ValueError`."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    json_object = '{"hi": "there"}'

    with pytest.raises(ValueError):
        partition_json(text=json_object)
380

381

382
def test_partition_json_raises_with_invalid_json():
    """Text that is not well-formed JSON is rejected with `ValueError`."""
    # The extra trailing `]` makes this payload syntactically invalid JSON.
    malformed_json = '[{"hi": "there"}]]'

    with pytest.raises(ValueError):
        partition_json(text=malformed_json)
386

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.