# unstructured
# 344 lines · 11.5 KB
1import json
2import os
3
4import pytest
5
6from unstructured import utils
7from unstructured.documents.coordinates import PixelSpace
8from unstructured.documents.elements import ElementMetadata, NarrativeText, Title
9
10
@pytest.fixture()
def input_data():
    """Provide two sample records: one with only ``text``, one that also has ``meta``."""
    plain_record = {"text": "This is a sentence."}
    record_with_meta = {"text": "This is another sentence.", "meta": {"score": 0.1}}
    return [plain_record, record_with_meta]
17
18
@pytest.fixture()
def output_jsonl_file(tmp_path):
    """Return the path ``output.jsonl`` joined onto the ``tmp_path`` fixture value."""
    destination = os.path.join(tmp_path, "output.jsonl")
    return destination
22
23
@pytest.fixture()
def input_jsonl_file(tmp_path, input_data):
    """Write ``input_data`` to ``input.jsonl`` (one JSON document per line).

    Returns the path of the file that was written.
    """
    jsonl_path = os.path.join(tmp_path, "input.jsonl")
    # Serialize up front so the file handle is only held for the write itself.
    serialized = "".join(f"{json.dumps(record)}\n" for record in input_data)
    with open(jsonl_path, "w+") as handle:
        handle.write(serialized)
    return jsonl_path
30
31
def test_save_as_jsonl(input_data, output_jsonl_file):
    """Records saved with ``save_as_jsonl`` parse back, line by line, to the input."""
    utils.save_as_jsonl(input_data, output_jsonl_file)

    with open(output_jsonl_file) as handle:
        round_tripped = list(map(json.loads, handle))

    assert round_tripped == input_data
37
38
def test_read_as_jsonl(input_jsonl_file, input_data):
    """``read_from_jsonl`` returns the records that the fixture wrote to disk."""
    loaded_records = utils.read_from_jsonl(input_jsonl_file)

    assert loaded_records == input_data
42
43
def test_requires_dependencies_decorator():
    """A function guarded by a single dependency name can be called without raising."""

    @utils.requires_dependencies(dependencies="numpy")
    def needs_numpy():
        import numpy  # noqa: F401

    needs_numpy()
50
51
def test_requires_dependencies_decorator_multiple():
    """A function guarded by a list of dependency names can be called without raising."""

    @utils.requires_dependencies(dependencies=["numpy", "pandas"])
    def needs_numpy_and_pandas():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    needs_numpy_and_pandas()
59
60
def test_requires_dependencies_decorator_import_error():
    """Calling a function guarded by the name ``not_a_package`` raises ``ImportError``."""

    @utils.requires_dependencies(dependencies="not_a_package")
    def needs_missing_package():
        import not_a_package  # noqa: F401

    with pytest.raises(ImportError):
        needs_missing_package()
68
69
def test_requires_dependencies_decorator_import_error_multiple():
    """``ImportError`` is raised when the guarded list includes ``not_a_package``."""

    @utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
    def needs_missing_and_numpy():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    with pytest.raises(ImportError):
        needs_missing_and_numpy()
78
79
def test_requires_dependencies_decorator_in_class():
    """The decorator can be applied to a class; instantiating it must not raise."""

    @utils.requires_dependencies(dependencies="numpy")
    class NumpyBackedClass:
        def __init__(self):
            import numpy  # noqa: F401

    NumpyBackedClass()
87
88
@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10), [0], (0,), range(1)])
def test_first_gives_first(iterator):
    """``first`` yields the leading item, which is 0 for every parametrized input."""
    leading_item = utils.first(iterator)

    assert leading_item == 0
92
93
@pytest.mark.parametrize("iterator", [[], ()])
def test_first_raises_if_empty(iterator):
    """``first`` raises ``ValueError`` for an empty list or tuple."""
    with pytest.raises(ValueError):
        utils.first(iterator)
98
99
@pytest.mark.parametrize("iterator", [[0], (0,), range(1)])
def test_only_gives_only(iterator):
    """``only`` returns the sole item of a one-element iterable.

    Every parametrized input holds exactly one item, 0. The call goes through
    ``utils.only`` so that the function this test is named for is exercised.
    """
    assert utils.only(iterator) == 0
103
104
@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10)])
def test_only_raises_when_len_more_than_1(iterator):
    """``only`` raises ``ValueError`` when the iterable holds more than one item."""
    with pytest.raises(ValueError):
        # The call itself is expected to raise, so its result is never compared.
        utils.only(iterator)
109
110
@pytest.mark.parametrize("iterator", [[], ()])
def test_only_raises_if_empty(iterator):
    """``only`` raises ``ValueError`` for an empty list or tuple."""
    with pytest.raises(ValueError):
        utils.only(iterator)
115
116
def _title_and_narrative(title_coordinates, narrative_coordinates, narrative_text):
    """Build the ``[Title, NarrativeText]`` pair that every scenario below feeds in.

    Each element gets its own 20x20 ``PixelSpace`` and its own page-1 metadata
    object; the title text is always "Some lovely title".
    """
    return [
        Title(
            text="Some lovely title",
            coordinates=title_coordinates,
            coordinate_system=PixelSpace(width=20, height=20),
            metadata=ElementMetadata(page_number=1),
        ),
        NarrativeText(
            text=narrative_text,
            coordinates=narrative_coordinates,
            coordinate_system=PixelSpace(width=20, height=20),
            metadata=ElementMetadata(page_number=1),
        ),
    ]


def _overlap_report(
    overlapping_elements,
    parent_element,
    overlapping_case,
    overlap_percentage,
    largest_ngram_percentage,
    overlap_percentage_total,
    max_area,
    min_area,
    total_area,
):
    """Assemble one expected overlap-report dict; parameter names mirror its keys."""
    return {
        "overlapping_elements": overlapping_elements,
        "parent_element": parent_element,
        "overlapping_case": overlapping_case,
        "overlap_percentage": overlap_percentage,
        "metadata": {
            "largest_ngram_percentage": largest_ngram_percentage,
            "overlap_percentage_total": overlap_percentage_total,
            "max_area": max_area,
            "min_area": min_area,
            "total_area": total_area,
        },
    }


@pytest.mark.parametrize(
    ("elements", "nested_error_tolerance_px", "sm_overlap_threshold", "expectation"),
    [
        # Tolerance of 5 px: reported as NarrativeText nested in Title.
        (
            _title_and_narrative(
                title_coordinates=((4, 5), (4, 8), (7, 8), (7, 5)),
                narrative_coordinates=((2, 3), (2, 6), (5, 6), (5, 3)),
                narrative_text="Some lovely text",
            ),
            5,
            10.0,
            (
                True,
                [
                    _overlap_report(
                        overlapping_elements=["Title(ix=0)", "NarrativeText(ix=1)"],
                        parent_element="Title(ix=0)",
                        overlapping_case="nested NarrativeText in Title",
                        overlap_percentage="100%",
                        largest_ngram_percentage=None,
                        overlap_percentage_total="5.88%",
                        max_area="9pxˆ2",
                        min_area="9pxˆ2",
                        total_area="18pxˆ2",
                    ),
                ],
            ),
        ),
        # Same boxes, tolerance of 1 px: partial overlap with partly shared text.
        (
            _title_and_narrative(
                title_coordinates=((4, 5), (4, 8), (7, 8), (7, 5)),
                narrative_coordinates=((2, 3), (2, 6), (5, 6), (5, 3)),
                narrative_text="Some lovely text",
            ),
            1,
            10.0,
            (
                True,
                [
                    _overlap_report(
                        overlapping_elements=["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                        parent_element=None,
                        overlapping_case="partial overlap sharing 50.0% of the text from1. "
                        "NarrativeText(2-gram)",
                        overlap_percentage="11.11%",
                        largest_ngram_percentage=50.0,
                        overlap_percentage_total="5.88%",
                        max_area="9pxˆ2",
                        min_area="9pxˆ2",
                        total_area="18pxˆ2",
                    ),
                ],
            ),
        ),
        # Same boxes, both elements carry identical text.
        (
            _title_and_narrative(
                title_coordinates=((4, 5), (4, 8), (7, 8), (7, 5)),
                narrative_coordinates=((2, 3), (2, 6), (5, 6), (5, 3)),
                narrative_text="Some lovely title",
            ),
            1,
            10.0,
            (
                True,
                [
                    _overlap_report(
                        overlapping_elements=["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                        parent_element=None,
                        overlapping_case="partial overlap with duplicate text",
                        overlap_percentage="11.11%",
                        largest_ngram_percentage=None,
                        overlap_percentage_total="5.88%",
                        max_area="9pxˆ2",
                        min_area="9pxˆ2",
                        total_area="18pxˆ2",
                    ),
                ],
            ),
        ),
        # Same boxes, texts have nothing in common.
        (
            _title_and_narrative(
                title_coordinates=((4, 5), (4, 8), (7, 8), (7, 5)),
                narrative_coordinates=((2, 3), (2, 6), (5, 6), (5, 3)),
                narrative_text="Something totally different here",
            ),
            1,
            10.0,
            (
                True,
                [
                    _overlap_report(
                        overlapping_elements=["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                        parent_element=None,
                        overlapping_case="partial overlap without sharing text",
                        overlap_percentage="11.11%",
                        largest_ngram_percentage=0,
                        overlap_percentage_total="5.88%",
                        max_area="9pxˆ2",
                        min_area="9pxˆ2",
                        total_area="18pxˆ2",
                    ),
                ],
            ),
        ),
        # Different boxes: reported as a small partial overlap.
        (
            _title_and_narrative(
                title_coordinates=((5, 6), (5, 10), (8, 10), (8, 6)),
                narrative_coordinates=((1, 3), (2, 7), (6, 7), (5, 3)),
                narrative_text="Some lovely text",
            ),
            1,
            10.0,
            (
                True,
                [
                    _overlap_report(
                        overlapping_elements=["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                        parent_element=None,
                        overlapping_case="Small partial overlap",
                        overlap_percentage="8.33%",
                        largest_ngram_percentage=None,
                        overlap_percentage_total="3.23%",
                        max_area="20pxˆ2",
                        min_area="12pxˆ2",
                        total_area="32pxˆ2",
                    ),
                ],
            ),
        ),
        # Boxes for which no overlap is expected.
        (
            _title_and_narrative(
                title_coordinates=((4, 6), (4, 7), (7, 7), (7, 6)),
                narrative_coordinates=((6, 8), (6, 9), (9, 9), (9, 8)),
                narrative_text="Some lovely text",
            ),
            1,
            10.0,
            (False, []),
        ),
    ],
)
def test_catch_overlapping_and_nested_bboxes(
    elements,
    expectation,
    nested_error_tolerance_px,
    sm_overlap_threshold,
):
    """Check the flag and the case list from ``catch_overlapping_and_nested_bboxes``.

    ``expectation`` is an ``(overlapping_flag, overlapping_cases)`` pair that is
    compared against the two values the utility returns.
    """
    expected_flag, expected_cases = expectation

    found_overlap, reported_cases = utils.catch_overlapping_and_nested_bboxes(
        elements,
        nested_error_tolerance_px,
        sm_overlap_threshold,
    )

    assert found_overlap == expected_flag
    assert reported_cases == expected_cases
335
336
def test_validate_data_args():
    """``validate_date_args`` returns True for "2020-10-10" and rejects bad input."""
    assert utils.validate_date_args("2020-10-10") is True

    # Both a non-date string and None must be rejected with ValueError.
    for rejected_value in ("blah", None):
        with pytest.raises(ValueError):
            utils.validate_date_args(rejected_value)
345