unstructured

Форк
0
344 строки · 11.5 Кб
1
import json
2
import os
3

4
import pytest
5

6
from unstructured import utils
7
from unstructured.documents.coordinates import PixelSpace
8
from unstructured.documents.elements import ElementMetadata, NarrativeText, Title
9

10

11
@pytest.fixture()
def input_data():
    """Provide two JSON-serializable records; the second also carries nested metadata."""
    plain_record = {"text": "This is a sentence."}
    scored_record = {"text": "This is another sentence.", "meta": {"score": 0.1}}
    return [plain_record, scored_record]
17

18

19
@pytest.fixture()
def output_jsonl_file(tmp_path):
    """Return the path of a not-yet-created ``output.jsonl`` inside the temp directory."""
    destination = os.path.join(tmp_path, "output.jsonl")
    return destination
22

23

24
@pytest.fixture()
def input_jsonl_file(tmp_path, input_data):
    """Write ``input_data`` as one JSON document per line and return the file's path."""
    jsonl_path = os.path.join(tmp_path, "input.jsonl")
    # Serialize up front so the file is written in a single call.
    payload = "".join(json.dumps(record) + "\n" for record in input_data)
    with open(jsonl_path, "w+") as handle:
        handle.write(payload)
    return jsonl_path
30

31

32
def test_save_as_jsonl(input_data, output_jsonl_file):
    """Records written by ``utils.save_as_jsonl`` parse back, line by line, to the input."""
    utils.save_as_jsonl(input_data, output_jsonl_file)

    round_tripped = []
    with open(output_jsonl_file) as saved:
        for raw_line in saved:
            round_tripped.append(json.loads(raw_line))

    assert round_tripped == input_data
37

38

39
def test_read_as_jsonl(input_jsonl_file, input_data):
    """``utils.read_from_jsonl`` yields the same records that were written to the file."""
    loaded_records = utils.read_from_jsonl(input_jsonl_file)
    assert loaded_records == input_data
42

43

44
def test_requires_dependencies_decorator():
    """A function guarded by a single available dependency can be called without error."""

    def uses_numpy():
        import numpy  # noqa: F401

    # Apply the decorator explicitly; equivalent to the ``@`` syntax.
    guarded = utils.requires_dependencies(dependencies="numpy")(uses_numpy)
    guarded()
50

51

52
def test_requires_dependencies_decorator_multiple():
    """A function guarded by a list of available dependencies can be called without error."""

    def uses_numpy_and_pandas():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    # Apply the decorator explicitly; equivalent to the ``@`` syntax.
    guard = utils.requires_dependencies(dependencies=["numpy", "pandas"])
    guarded = guard(uses_numpy_and_pandas)
    guarded()
59

60

61
def test_requires_dependencies_decorator_import_error():
    """Calling a function guarded by a missing dependency raises ImportError."""

    def uses_missing_package():
        import not_a_package  # noqa: F401

    # Apply the decorator explicitly; equivalent to the ``@`` syntax.
    guard = utils.requires_dependencies(dependencies="not_a_package")
    guarded = guard(uses_missing_package)

    with pytest.raises(ImportError):
        guarded()
68

69

70
def test_requires_dependencies_decorator_import_error_multiple():
    """One missing entry in a dependency list is enough for the call to raise ImportError."""

    def uses_missing_package_and_numpy():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    # Apply the decorator explicitly; equivalent to the ``@`` syntax.
    guard = utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
    guarded = guard(uses_missing_package_and_numpy)

    with pytest.raises(ImportError):
        guarded()
78

79

80
def test_requires_dependencies_decorator_in_class():
    """A class decorated with an available dependency can still be instantiated."""

    @utils.requires_dependencies(dependencies="numpy")
    class NeedsNumpy:
        def __init__(self):
            import numpy  # noqa: F401

    instance = NeedsNumpy()
    del instance  # only construction matters; the object itself is not inspected
87

88

89
@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10), [0], (0,), range(1)])
def test_first_gives_first(iterator):
    """``utils.first`` returns the leading element for lists, tuples and ranges alike."""
    leading = utils.first(iterator)
    assert leading == 0
92

93

94
@pytest.mark.parametrize("iterator", [[], ()])
def test_first_raises_if_empty(iterator):
    """``utils.first`` raises ValueError when given an empty list or tuple."""
    # Callable form of pytest.raises: invokes utils.first(iterator) and expects ValueError.
    pytest.raises(ValueError, utils.first, iterator)
98

99

100
@pytest.mark.parametrize("iterator", [[0], (0,), range(1)])
def test_only_gives_only(iterator):
    """``utils.only`` returns the sole element of a single-item list, tuple or range."""
    # Exercise `only` itself: calling `first` here would leave the success path of
    # `only` untested even though the test is named for it.
    assert utils.only(iterator) == 0
103

104

105
@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10)])
def test_only_raises_when_len_more_than_1(iterator):
    """``utils.only`` raises ValueError when the iterable holds more than one element."""
    with pytest.raises(ValueError):
        # No comparison on the result: the call is expected to raise before returning.
        utils.only(iterator)
109

110

111
@pytest.mark.parametrize("iterator", [[], ()])
def test_only_raises_if_empty(iterator):
    """``utils.only`` raises ValueError when given an empty list or tuple."""
    # Callable form of pytest.raises: invokes utils.only(iterator) and expects ValueError.
    pytest.raises(ValueError, utils.only, iterator)
115

116

117
def _title_and_narrative(title_box, text_box, narrative="Some lovely text"):
    """Build the Title/NarrativeText pair used by every overlap scenario.

    Both elements are placed on page 1 of a 20x20 pixel space; only the two bounding
    boxes and the narrative text differ between scenarios.
    """
    return [
        Title(
            text="Some lovely title",
            coordinates=title_box,
            coordinate_system=PixelSpace(width=20, height=20),
            metadata=ElementMetadata(page_number=1),
        ),
        NarrativeText(
            text=narrative,
            coordinates=text_box,
            coordinate_system=PixelSpace(width=20, height=20),
            metadata=ElementMetadata(page_number=1),
        ),
    ]


def _overlap_case(
    labels,
    parent,
    case,
    share,
    ngram_share,
    total_share="5.88%",
    max_area="9pxˆ2",
    min_area="9pxˆ2",
    total_area="18pxˆ2",
):
    """Assemble one expected overlap-report dictionary in the layout the test compares."""
    return {
        "overlapping_elements": list(labels),
        "parent_element": parent,
        "overlapping_case": case,
        "overlap_percentage": share,
        "metadata": {
            "largest_ngram_percentage": ngram_share,
            "overlap_percentage_total": total_share,
            "max_area": max_area,
            "min_area": min_area,
            "total_area": total_area,
        },
    }


# Element labels as they appear in the expected reports of the partial-overlap scenarios.
_NUMBERED_LABELS = ("0. Title(ix=0)", "1. NarrativeText(ix=1)")


@pytest.mark.parametrize(
    ("elements", "nested_error_tolerance_px", "sm_overlap_threshold", "expectation"),
    [
        # Tolerance of 5px: reported as NarrativeText nested in Title.
        (
            _title_and_narrative(
                ((4, 5), (4, 8), (7, 8), (7, 5)),
                ((2, 3), (2, 6), (5, 6), (5, 3)),
            ),
            5,
            10.0,
            (
                True,
                [
                    _overlap_case(
                        ["Title(ix=0)", "NarrativeText(ix=1)"],
                        "Title(ix=0)",
                        "nested NarrativeText in Title",
                        "100%",
                        None,
                    ),
                ],
            ),
        ),
        # Same boxes with 1px tolerance: partial overlap with partly shared text.
        (
            _title_and_narrative(
                ((4, 5), (4, 8), (7, 8), (7, 5)),
                ((2, 3), (2, 6), (5, 6), (5, 3)),
            ),
            1,
            10.0,
            (
                True,
                [
                    _overlap_case(
                        _NUMBERED_LABELS,
                        None,
                        "partial overlap sharing 50.0% of the text from1. "
                        "NarrativeText(2-gram)",
                        "11.11%",
                        50.0,
                    ),
                ],
            ),
        ),
        # Identical text in both elements: partial overlap with duplicate text.
        (
            _title_and_narrative(
                ((4, 5), (4, 8), (7, 8), (7, 5)),
                ((2, 3), (2, 6), (5, 6), (5, 3)),
                narrative="Some lovely title",
            ),
            1,
            10.0,
            (
                True,
                [
                    _overlap_case(
                        _NUMBERED_LABELS,
                        None,
                        "partial overlap with duplicate text",
                        "11.11%",
                        None,
                    ),
                ],
            ),
        ),
        # Unrelated text: partial overlap without sharing text.
        (
            _title_and_narrative(
                ((4, 5), (4, 8), (7, 8), (7, 5)),
                ((2, 3), (2, 6), (5, 6), (5, 3)),
                narrative="Something totally different here",
            ),
            1,
            10.0,
            (
                True,
                [
                    _overlap_case(
                        _NUMBERED_LABELS,
                        None,
                        "partial overlap without sharing text",
                        "11.11%",
                        0,
                    ),
                ],
            ),
        ),
        # Different boxes: reported as a small partial overlap.
        (
            _title_and_narrative(
                ((5, 6), (5, 10), (8, 10), (8, 6)),
                ((1, 3), (2, 7), (6, 7), (5, 3)),
            ),
            1,
            10.0,
            (
                True,
                [
                    _overlap_case(
                        _NUMBERED_LABELS,
                        None,
                        "Small partial overlap",
                        "8.33%",
                        None,
                        total_share="3.23%",
                        max_area="20pxˆ2",
                        min_area="12pxˆ2",
                        total_area="32pxˆ2",
                    ),
                ],
            ),
        ),
        # Nothing is expected to be reported for these boxes.
        (
            _title_and_narrative(
                ((4, 6), (4, 7), (7, 7), (7, 6)),
                ((6, 8), (6, 9), (9, 9), (9, 8)),
            ),
            1,
            10.0,
            (False, []),
        ),
    ],
)
def test_catch_overlapping_and_nested_bboxes(
    elements,
    expectation,
    nested_error_tolerance_px,
    sm_overlap_threshold,
):
    """Check the flag and the case list from ``utils.catch_overlapping_and_nested_bboxes``.

    ``expectation`` is a ``(flag, cases)`` pair compared against the two values the
    function returns for the given elements, tolerance and small-overlap threshold.
    """
    expected_flag, expected_cases = expectation
    found_flag, found_cases = utils.catch_overlapping_and_nested_bboxes(
        elements,
        nested_error_tolerance_px,
        sm_overlap_threshold,
    )
    assert found_flag == expected_flag
    assert found_cases == expected_cases
335

336

337
def test_validate_data_args():
    """``utils.validate_date_args`` accepts a ``YYYY-MM-DD`` string and rejects bad input."""
    accepted = utils.validate_date_args("2020-10-10")
    assert accepted is True

    # Both a non-date string and None must be rejected with ValueError.
    for rejected_value in ("blah", None):
        with pytest.raises(ValueError):
            utils.validate_date_args(rejected_value)
345

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.