unstructured

test_common.py
541 строка · 16.4 Кб
Перенос по словам
1
from dataclasses import dataclass
2
from unittest import mock
3

4
import pytest
5
from PIL import Image
6
from unstructured_inference.inference import layout
7
from unstructured_inference.inference.elements import TextRegion
8
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
9

10
from unstructured.documents.coordinates import PixelSpace
11
from unstructured.documents.elements import (
12
    CheckBox,
13
    CoordinatesMetadata,
14
    ElementMetadata,
15
    ElementType,
16
    FigureCaption,
17
    Header,
18
    ListItem,
19
    NarrativeText,
20
    Text,
21
    Title,
22
)
23
from unstructured.documents.elements import (
24
    Image as ImageElement,
25
)
26
from unstructured.partition import common
27
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
28

29

30
class MockPageLayout(layout.PageLayout):
    """Page layout stub exposing a fixed set of four text-only layout elements.

    The parent initializer is intentionally not called; only ``number`` and
    ``image`` are stored on the instance.
    """

    def __init__(self, number: int, image: Image.Image):
        # `Image` is the PIL module in this file, so the class is `Image.Image`.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return freshly built layout elements (no bounding boxes) on every access."""
        type_and_text = [
            ("Headline", "Charlie Brown and the Great Pumpkin"),
            ("Subheadline", "The Beginning"),
            ("Text", "This time Charlie Brown had it really tricky..."),
            ("Title", "Another book title in the same page"),
        ]
        return [
            LayoutElement(type=element_type, text=text, bbox=None)
            for element_type, text in type_and_text
        ]
59

60

61
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout stub whose ``pages`` holds a single ``MockPageLayout``."""

    @property
    def pages(self):
        """Build and return a new one-page list on every access."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=1, image=blank_image)
        return [only_page]
67

68

69
def test_normalize_layout_element_dict():
    """A dict of type "Title" normalizes to a ``Title`` using the given coordinate system."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Title",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "coordinate_system": None,
        "text": "Some lovely text",
    }

    actual = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = Title(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert actual == expected
86

87

88
def test_normalize_layout_element_dict_caption():
    """A dict of type "Figure" normalizes to an ``Image`` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Figure",
        "coordinates": ((1, 2), (3, 4), (5, 6), (7, 8)),
        "text": "Some lovely text",
    }

    actual = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = ImageElement(
        text="Some lovely text",
        coordinates=((1, 2), (3, 4), (5, 6), (7, 8)),
        coordinate_system=pixel_space,
    )
    assert actual == expected
104

105

106
@pytest.mark.parametrize(
    ("element_type", "expected_type", "expected_depth"),
    [
        ("Title", Title, None),
        ("Headline", Title, 1),
        ("Subheadline", Title, 2),
        ("Header", Header, None),
    ],
)
def test_normalize_layout_element_headline(element_type, expected_type, expected_depth):
    """Each heading-like type maps to the expected element class and category depth."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = dict(
        type=element_type,
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        text="Some lovely text",
    )

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    actual_depth = normalized.metadata.category_depth
    assert actual_depth == expected_depth
    assert isinstance(normalized, expected_type)
125

126

127
def test_normalize_layout_element_dict_figure_caption():
    """A dict of type "FigureCaption" normalizes to a ``FigureCaption`` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "FigureCaption",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }

    actual = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = FigureCaption(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert actual == expected
143

144

145
def test_normalize_layout_element_dict_misc():
    """A dict of type "Misc" normalizes to a plain ``Text`` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Misc",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }

    actual = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = Text(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert actual == expected
161

162

163
def test_normalize_layout_element_layout_element():
    """A ``LayoutElement`` of type "Text" normalizes to ``NarrativeText``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement.from_coords(
        type="Text", x1=1, y1=2, x2=3, y2=4, text="Some lovely text"
    )

    actual = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected = NarrativeText(
        text="Some lovely text",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert actual == expected
182

183

184
def test_normalize_layout_element_layout_element_narrative_text():
    """A ``LayoutElement`` of type "NarrativeText" normalizes to ``NarrativeText``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement.from_coords(
        type="NarrativeText", x1=1, y1=2, x2=3, y2=4, text="Some lovely text"
    )

    actual = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected = NarrativeText(
        text="Some lovely text",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert actual == expected
203

204

205
def test_normalize_layout_element_checked_box():
    """A ``LayoutElement`` of type "Checked" normalizes to a checked ``CheckBox``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement.from_coords(type="Checked", x1=1, y1=2, x2=3, y2=4, text="")

    actual = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected = CheckBox(
        checked=True,
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert actual == expected
224

225

226
def test_normalize_layout_element_unchecked_box():
    """A ``LayoutElement`` of type "Unchecked" normalizes to an unchecked ``CheckBox``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement.from_coords(type="Unchecked", x1=1, y1=2, x2=3, y2=4, text="")

    actual = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected = CheckBox(
        checked=False,
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert actual == expected
245

246

247
def test_normalize_layout_element_enumerated_list():
    """A numbered "List" element is split into one ``ListItem`` per entry."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement.from_coords(
        type="List",
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="1. I'm so cool! 2. You're cool too. 3. We're all cool!",
    )

    actual = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    # Every expected item has the same coordinates and differs only in its text.
    expected = [
        ListItem(
            text=item_text,
            coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
            coordinate_system=pixel_space,
        )
        for item_text in ("I'm so cool!", "You're cool too.", "We're all cool!")
    ]
    assert actual == expected
278

279

280
def test_normalize_layout_element_bulleted_list():
    """A "*"-bulleted "List" element is split into one ``ListItem`` per entry."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement.from_coords(
        type="List",
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="* I'm so cool! * You're cool too. * We're all cool!",
    )

    actual = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    # Every expected item has the same coordinates and differs only in its text.
    expected = [
        ListItem(
            text=item_text,
            coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
            coordinate_system=pixel_space,
        )
        for item_text in ("I'm so cool!", "You're cool too.", "We're all cool!")
    ]
    assert actual == expected
311

312

313
class MockPopenWithError:
    """Replacement for ``subprocess.Popen`` whose ``communicate`` reports an error."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore any positional or keyword arguments."""

    def communicate(self):
        """Return a pair of empty first bytes and a fixed error message as second bytes."""
        first, second = b"", b"an error occurred"
        return first, second
319

320

321
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
    """The error bytes returned by the patched ``Popen`` show up in the captured log text."""
    # Dotted-path form of setattr: pytest imports `subprocess` and replaces its `Popen`.
    monkeypatch.setattr("subprocess.Popen", MockPopenWithError)

    common.convert_office_doc("no-real.docx", "fake-directory", target_format="docx")

    logged = caplog.text
    assert "an error occurred" in logged
327

328

329
class MockDocxEmptyTable:
    """Minimal table stand-in that exposes only an empty ``rows`` list."""

    def __init__(self):
        # A new list per instance, so instances never share row storage.
        self.rows = list()
332

333

334
def test_convert_ms_office_table_to_text_works_with_empty_tables():
    """A table with no rows converts to an empty string in both HTML and text modes."""
    empty_table = MockDocxEmptyTable()
    for html_mode in (True, False):
        converted = common.convert_ms_office_table_to_text(empty_table, as_html=html_mode)
        assert converted == ""
338

339

340
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("<table><tbody><tr><td>👨\\U+1F3FB🔧</td></tr></tbody></table>", True),
        ("<table><tbody><tr><td>Hello!</td></tr></tbody></table>", False),
    ],
)
def test_contains_emoji(text, expected):
    """``contains_emoji`` returns exactly ``True``/``False`` for HTML with/without emoji."""
    detected = common.contains_emoji(text)
    # Identity check: the result must be the bool itself, not merely truthy/falsy.
    assert detected is expected
349

350

351
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """The first converted element has no coordinates metadata when bboxes are absent."""
    doc_without_coords = MockDocumentLayout()
    # NOTE(review): MockDocumentLayout.pages and MockPageLayout.elements, as defined in this
    # file, build new objects on each access, so this loop appears to mutate throwaway
    # objects. The mock elements are already created with bbox=None. Confirm before
    # relying on the loop.
    for mock_page in doc_without_coords.pages:
        for layout_el in mock_page.elements:
            layout_el.bbox = None

    converted = common.document_to_element_list(doc_without_coords)

    first_element = converted[0]
    assert first_element.metadata.coordinates is None
358

359

360
def test_get_page_image_metadata_and_coordinate_system():
    """``_get_page_image_metadata`` returns a dict for a mock page."""
    first_page = MockDocumentLayout().pages[0]
    page_metadata = common._get_page_image_metadata(first_page)
    assert isinstance(page_metadata, dict)
364

365

366
def test_set_element_hierarchy():
    """Check parent ids assigned by ``set_element_hierarchy`` with its default rules.

    The trailing ``# n`` comments give each element's index, which the
    assertions below refer to.
    """
    elements_to_set = [
        Title(text="Title"),  # 0
        NarrativeText(text="NarrativeText"),  # 1
        FigureCaption(text="FigureCaption"),  # 2
        ListItem(text="ListItem"),  # 3
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 4
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 5
        ListItem(text="ListItem"),  # 6
        CheckBox(element_id="some-id-1", checked=True),  # 7
        Title(text="Title 2"),  # 8
        ListItem(text="ListItem"),  # 9
        ListItem(text="ListItem"),  # 10
        Text(text="Text"),  # 11
    ]
    elements = common.set_element_hierarchy(elements_to_set)

    assert (
        elements[1].metadata.parent_id == elements[0].id
    ), "NarrativeText should be child of Title"
    assert (
        elements[2].metadata.parent_id == elements[0].id
    ), "FigureCaption should be child of Title"
    assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    # Elements 4 and 5 have category_depth=1 and are expected under the ListItem at
    # index 3, so the failure messages name that ListItem rather than the Title.
    assert (
        elements[4].metadata.parent_id == elements[3].id
    ), "Nested ListItem should be child of the preceding ListItem"
    assert (
        elements[5].metadata.parent_id == elements[3].id
    ), "Nested ListItem should be child of the preceding ListItem"
    assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    assert (
        elements[7].metadata.parent_id is None
    ), "CheckBox should be None, as it's not a Text based element"
    assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
    assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
400

401

402
def test_set_element_hierarchy_custom_rule_set():
    """Check parent ids assigned by ``set_element_hierarchy`` when given a custom ruleset."""
    custom_rule_set = {
        "Header": ["Title", "Text"],
        "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
    }
    elements_to_set = [
        Header(text="Header"),  # 0
        Title(text="Title"),  # 1
        NarrativeText(text="NarrativeText"),  # 2
        Text(text="Text"),  # 3
        Title(text="Title 2"),  # 4
        FigureCaption(text="FigureCaption"),  # 5
    ]

    elements = common.set_element_hierarchy(elements=elements_to_set, ruleset=custom_rule_set)

    header = elements[0]
    title = elements[1]
    narrative = elements[2]
    text = elements[3]
    title_2 = elements[4]
    caption = elements[5]

    assert title.metadata.parent_id == header.id, "Title should be child of Header"
    assert narrative.metadata.parent_id == title.id, "NarrativeText should be child of Title"
    assert text.metadata.parent_id == title.id, "Text should be child of Title"
    assert title_2.metadata.parent_id == header.id, "Title 2 should be child of Header"
    assert caption.metadata.parent_id == title_2.id, "FigureCaption should be child of Title 2"
431

432

433
@dataclass
class MockImage:
    """Lightweight image stand-in carrying only ``width``, ``height`` and ``format``.

    The attributes are annotated so ``@dataclass`` treats them as fields.
    ``MockImage()`` still yields the same default values as before.
    """

    width: int = 640
    height: int = 480
    format: str = "JPG"
438

439

440
def test_document_to_element_list_handles_parent():
    """A layout element's ``parent`` link is reflected in the child's ``metadata.parent_id``."""
    parent_block = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
    child_block = LayoutElement.from_coords(
        1, 2, 3, 4, text="block 2", parent=parent_block, type="NarrativeText"
    )
    page = PageLayout(number=1, image=MockImage())
    page.elements = [parent_block, child_block]
    doc = DocumentLayout.from_pages([page])

    parent_element, child_element = common.document_to_element_list(doc)

    assert child_element.metadata.parent_id == parent_element.id
459

460

461
@pytest.mark.parametrize(
    ("sort_mode", "call_count"),
    [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
)
def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
    """``sort_page_elements`` is called the expected number of times for each sort mode."""
    parent_block = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
    child_block = LayoutElement.from_coords(
        1, 2, 3, 4, text="block 2", parent=parent_block, type="NarrativeText"
    )
    page = PageLayout(number=1, image=MockImage())
    page.elements = [parent_block, child_block]
    doc = DocumentLayout.from_pages([page])

    # Swap the sorter for a mock so its invocations can be counted.
    with mock.patch.object(common, "sort_page_elements") as sort_spy:
        common.document_to_element_list(doc, sortable=True, sort_mode=sort_mode)

    assert sort_spy.call_count == call_count
485

486

487
def test_document_to_element_list_sets_category_depth_titles():
    """The first four mock elements get category depths 1, 2, None and 0, in that order."""
    converted = common.document_to_element_list(MockDocumentLayout())
    # Mock order is Headline, Subheadline, Text, Title.
    first_four_depths = [el.metadata.category_depth for el in converted[:4]]
    assert first_four_depths == [1, 2, None, 0]
494

495

496
def test_ocr_data_to_elements(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """OCR layout elements convert one-to-one, with pixel-space coordinates metadata.

    Builds two uncategorized-text layout elements from text regions, converts
    them with ``ocr_data_to_elements`` using the size of the image at
    ``filename``, and checks the count, category and coordinates of the result.
    """
    text_regions = [
        TextRegion.from_coords(
            163.0,
            115.0,
            452.0,
            129.0,
            text="LayoutParser: A Unified Toolkit for Deep",
        ),
        TextRegion.from_coords(
            156.0,
            132.0,
            457.0,
            147.0,
            text="Learning Based Document Image Analysis",
        ),
    ]
    ocr_data = [
        LayoutElement(
            bbox=r.bbox,
            text=r.text,
            source=r.source,
            type=ElementType.UNCATEGORIZED_TEXT,
        )
        for r in text_regions
    ]
    # Only the dimensions are needed; the context manager closes the image file
    # instead of leaving the handle open for the rest of the test session.
    with Image.open(filename) as image:
        image_size = image.size

    elements = common.ocr_data_to_elements(
        ocr_data=ocr_data,
        image_size=image_size,
    )

    assert len(ocr_data) == len(elements)
    assert {el.category for el in elements} == {ElementType.UNCATEGORIZED_TEXT}

    # check coordinates metadata
    image_width, image_height = image_size
    coordinate_system = PixelSpace(width=image_width, height=image_height)
    for el, layout_el in zip(elements, ocr_data):
        assert el.metadata.coordinates == CoordinatesMetadata(
            points=layout_el.bbox.coordinates,
            system=coordinate_system,
        )
542
unstructured

Использование cookies