unstructured
541 строка · 16.4 Кб
1from dataclasses import dataclass2from unittest import mock3
4import pytest5from PIL import Image6from unstructured_inference.inference import layout7from unstructured_inference.inference.elements import TextRegion8from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout9
10from unstructured.documents.coordinates import PixelSpace11from unstructured.documents.elements import (12CheckBox,13CoordinatesMetadata,14ElementMetadata,15ElementType,16FigureCaption,17Header,18ListItem,19NarrativeText,20Text,21Title,22)
23from unstructured.documents.elements import (24Image as ImageElement,25)
26from unstructured.partition import common27from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT28
29
class MockPageLayout(layout.PageLayout):
    """Stand-in for `layout.PageLayout` that serves a fixed list of layout elements.

    The parent initializer is not invoked; only `number` and `image` are stored.
    """

    def __init__(self, number: int, image: Image.Image):
        """Store the page number and page image.

        Args:
            number: Page number kept on the mock.
            image: PIL image kept on the mock. `Image` is the imported PIL module,
                so the image class is spelled `Image.Image`.
        """
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return four newly built layout elements, all with `bbox=None`, on every access.

        Their types are "Headline", "Subheadline", "Text" and "Title", in that order.
        """
        return [
            LayoutElement(
                type="Headline",
                text="Charlie Brown and the Great Pumpkin",
                bbox=None,
            ),
            LayoutElement(
                type="Subheadline",
                text="The Beginning",
                bbox=None,
            ),
            LayoutElement(
                type="Text",
                text="This time Charlie Brown had it really tricky...",
                bbox=None,
            ),
            LayoutElement(
                type="Title",
                text="Another book title in the same page",
                bbox=None,
            ),
        ]
60
class MockDocumentLayout(layout.DocumentLayout):
    """Document-layout double that exposes a single mocked page."""

    @property
    def pages(self):
        """Build and return a one-item list holding page 1 with a 1x1 mode-"1" image."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=1, image=blank_image)
        return [only_page]
68
def test_normalize_layout_element_dict():
    """A dict of type "Title" normalizes to an element equal to the matching `Title`."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Title",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "coordinate_system": None,
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    # The expected element gets its own coordinate lists, independent of the input dict.
    expected = Title(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert normalized == expected
87
def test_normalize_layout_element_dict_caption():
    """A dict of type "Figure" normalizes to an element equal to the matching `ImageElement`."""
    corner_points = ((1, 2), (3, 4), (5, 6), (7, 8))
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Figure",
        "coordinates": corner_points,
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = ImageElement(
        text="Some lovely text",
        coordinates=corner_points,
        coordinate_system=pixel_space,
    )
    assert normalized == expected
105
@pytest.mark.parametrize(
    ("element_type", "expected_type", "expected_depth"),
    [
        ("Title", Title, None),
        ("Headline", Title, 1),
        ("Subheadline", Title, 2),
        ("Header", Header, None),
    ],
)
def test_normalize_layout_element_headline(element_type, expected_type, expected_depth):
    """Each heading-like type yields the expected element class and `category_depth`."""
    pixel_space = PixelSpace(width=10, height=20)
    normalized = common.normalize_layout_element(
        {
            "type": element_type,
            "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
            "text": "Some lovely text",
        },
        coordinate_system=pixel_space,
    )

    actual_depth = normalized.metadata.category_depth
    assert actual_depth == expected_depth
    assert isinstance(normalized, expected_type)
126
def test_normalize_layout_element_dict_figure_caption():
    """A dict of type "FigureCaption" normalizes to an equal `FigureCaption` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "FigureCaption",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    # The expected element gets its own coordinate lists, independent of the input dict.
    expected = FigureCaption(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert normalized == expected
144
def test_normalize_layout_element_dict_misc():
    """A dict of type "Misc" normalizes to an element equal to a plain `Text`."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Misc",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    # The expected element gets its own coordinate lists, independent of the input dict.
    expected = Text(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert normalized == expected
162
def test_normalize_layout_element_layout_element():
    """A `LayoutElement` of type "Text" normalizes to an equal `NarrativeText`."""
    pixel_space = PixelSpace(width=10, height=20)
    source = LayoutElement.from_coords(
        x1=1, y1=2, x2=3, y2=4, text="Some lovely text", type="Text"
    )

    normalized = common.normalize_layout_element(source, coordinate_system=pixel_space)

    expected = NarrativeText(
        text="Some lovely text",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected
183
def test_normalize_layout_element_layout_element_narrative_text():
    """A `LayoutElement` of type "NarrativeText" normalizes to an equal `NarrativeText`."""
    pixel_space = PixelSpace(width=10, height=20)
    source = LayoutElement.from_coords(
        x1=1, y1=2, x2=3, y2=4, text="Some lovely text", type="NarrativeText"
    )

    normalized = common.normalize_layout_element(source, coordinate_system=pixel_space)

    expected = NarrativeText(
        text="Some lovely text",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected
204
def test_normalize_layout_element_checked_box():
    """A `LayoutElement` of type "Checked" normalizes to a `CheckBox` with `checked=True`."""
    pixel_space = PixelSpace(width=10, height=20)
    source = LayoutElement.from_coords(x1=1, y1=2, x2=3, y2=4, text="", type="Checked")

    normalized = common.normalize_layout_element(source, coordinate_system=pixel_space)

    expected = CheckBox(
        checked=True,
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected
225
def test_normalize_layout_element_unchecked_box():
    """A `LayoutElement` of type "Unchecked" normalizes to a `CheckBox` with `checked=False`."""
    pixel_space = PixelSpace(width=10, height=20)
    source = LayoutElement.from_coords(x1=1, y1=2, x2=3, y2=4, text="", type="Unchecked")

    normalized = common.normalize_layout_element(source, coordinate_system=pixel_space)

    expected = CheckBox(
        checked=False,
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected
246
def test_normalize_layout_element_enumerated_list():
    """A numbered "List" element normalizes to three `ListItem`s with the source coordinates."""
    pixel_space = PixelSpace(width=10, height=20)
    source = LayoutElement.from_coords(
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="1. I'm so cool! 2. You're cool too. 3. We're all cool!",
        type="List",
    )

    normalized = common.normalize_layout_element(source, coordinate_system=pixel_space)

    box_corners = ((1, 2), (1, 4), (3, 4), (3, 2))
    expected_items = [
        ListItem(text=item_text, coordinates=box_corners, coordinate_system=pixel_space)
        for item_text in ("I'm so cool!", "You're cool too.", "We're all cool!")
    ]
    assert normalized == expected_items
279
def test_normalize_layout_element_bulleted_list():
    """A "*"-bulleted "List" element normalizes to three `ListItem`s with the source coordinates."""
    pixel_space = PixelSpace(width=10, height=20)
    source = LayoutElement.from_coords(
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="* I'm so cool! * You're cool too. * We're all cool!",
        type="List",
    )

    normalized = common.normalize_layout_element(source, coordinate_system=pixel_space)

    box_corners = ((1, 2), (1, 4), (3, 4), (3, 2))
    expected_items = [
        ListItem(text=item_text, coordinates=box_corners, coordinate_system=pixel_space)
        for item_text in ("I'm so cool!", "You're cool too.", "We're all cool!")
    ]
    assert normalized == expected_items
312
class MockPopenWithError:
    """Process double whose `communicate` always reports an error message on stderr."""

    def __init__(self, *args, **kwargs):
        """Accept and discard any positional or keyword arguments."""

    def communicate(self):
        """Return a `(stdout, stderr)` pair: empty stdout bytes and a fixed error message."""
        stdout_bytes = b""
        stderr_bytes = b"an error occurred"
        return stdout_bytes, stderr_bytes
320
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
    """With `subprocess.Popen` replaced by the error mock, the stderr text shows up in the log."""
    import subprocess

    # Swap in the double so the conversion call receives the canned stderr output.
    monkeypatch.setattr(subprocess, "Popen", MockPopenWithError)

    common.convert_office_doc("no-real.docx", "fake-directory", target_format="docx")

    captured_log = caplog.text
    assert "an error occurred" in captured_log
328
class MockDocxEmptyTable:
    """Table double that has no rows."""

    def __init__(self):
        """Start with an empty `rows` list."""
        self.rows = list()
333
def test_convert_ms_office_table_to_text_works_with_empty_tables():
    """A table without rows converts to an empty string, with and without HTML output."""
    empty_table = MockDocxEmptyTable()
    for html_flag in (True, False):
        rendered = common.convert_ms_office_table_to_text(empty_table, as_html=html_flag)
        assert rendered == ""
339
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("<table><tbody><tr><td>👨\\U+1F3FB🔧</td></tr></tbody></table>", True),
        ("<table><tbody><tr><td>Hello!</td></tr></tbody></table>", False),
    ],
)
def test_contains_emoji(text, expected):
    """`contains_emoji` returns exactly `True` for the emoji cell and `False` for plain text."""
    detected = common.contains_emoji(text)
    assert detected is expected
350
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """The first converted element has no coordinates metadata when bounding boxes are absent."""
    document = MockDocumentLayout()

    # NOTE(review): MockPageLayout.elements builds new objects on every access, so these
    # assignments may not reach the elements converted below -- confirm whether the reset
    # is still needed.
    every_layout_element = (
        layout_el for page in document.pages for layout_el in page.elements
    )
    for layout_el in every_layout_element:
        layout_el.bbox = None

    converted = common.document_to_element_list(document)
    first_coordinates = converted[0].metadata.coordinates
    assert first_coordinates is None
359
def test_get_page_image_metadata_and_coordinate_system():
    """`_get_page_image_metadata` returns a dict for the first page of the mocked document."""
    first_page = MockDocumentLayout().pages[0]
    page_metadata = common._get_page_image_metadata(first_page)
    assert isinstance(page_metadata, dict)
365
def test_set_element_hierarchy():
    """Check the `parent_id` that `set_element_hierarchy` assigns under its default rules.

    The input is a flat list of twelve elements. The trailing index comments identify the
    positions that the assertions below refer to.
    """
    elements_to_set = [
        Title(text="Title"),  # 0
        NarrativeText(text="NarrativeText"),  # 1
        FigureCaption(text="FigureCaption"),  # 2
        ListItem(text="ListItem"),  # 3
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 4
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 5
        ListItem(text="ListItem"),  # 6
        CheckBox(element_id="some-id-1", checked=True),  # 7
        Title(text="Title 2"),  # 8
        ListItem(text="ListItem"),  # 9
        ListItem(text="ListItem"),  # 10
        Text(text="Text"),  # 11
    ]
    elements = common.set_element_hierarchy(elements_to_set)

    assert (
        elements[1].metadata.parent_id == elements[0].id
    ), "NarrativeText should be child of Title"
    assert (
        elements[2].metadata.parent_id == elements[0].id
    ), "FigureCaption should be child of Title"
    assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    # Elements 4 and 5 carry category_depth=1 and are expected under the ListItem at index 3,
    # so the failure messages name that ListItem as the parent rather than the Title.
    assert (
        elements[4].metadata.parent_id == elements[3].id
    ), "Nested ListItem should be child of preceding ListItem"
    assert (
        elements[5].metadata.parent_id == elements[3].id
    ), "Nested ListItem should be child of preceding ListItem"
    assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    assert (
        elements[7].metadata.parent_id is None
    ), "CheckBox should be None, as it's not a Text based element"
    assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
    assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
401
def test_set_element_hierarchy_custom_rule_set():
    """Check the `parent_id` values assigned when a caller-supplied ruleset is passed."""
    flat_elements = [
        Header(text="Header"),  # 0
        Title(text="Title"),  # 1
        NarrativeText(text="NarrativeText"),  # 2
        Text(text="Text"),  # 3
        Title(text="Title 2"),  # 4
        FigureCaption(text="FigureCaption"),  # 5
    ]
    rules = {
        "Header": ["Title", "Text"],
        "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
    }

    nested = common.set_element_hierarchy(elements=flat_elements, ruleset=rules)

    # (child index, expected parent index, message shown on failure)
    expectations = [
        (1, 0, "Title should be child of Header"),
        (2, 1, "NarrativeText should be child of Title"),
        (3, 1, "Text should be child of Title"),
        (4, 0, "Title 2 should be child of Header"),
        (5, 4, "FigureCaption should be child of Title 2"),
    ]
    for child_idx, parent_idx, failure_message in expectations:
        assert nested[child_idx].metadata.parent_id == nested[parent_idx].id, failure_message
432
@dataclass
class MockImage:
    """Minimal image stand-in exposing `width`, `height` and `format`.

    The attributes are annotated so that `@dataclass` registers them as fields.
    Un-annotated class attributes are ignored by the decorator, which would leave
    the class with no fields. Defaults are unchanged, so `MockImage()` behaves as before.
    """

    width: int = 640
    height: int = 480
    format: str = "JPG"
439
def test_document_to_element_list_handles_parent():
    """An element built from a block with a `parent` gets the parent element's id as parent_id."""
    parent_block = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
    child_block = LayoutElement.from_coords(
        1, 2, 3, 4, text="block 2", parent=parent_block, type="NarrativeText"
    )
    single_page = PageLayout(number=1, image=MockImage())
    single_page.elements = [parent_block, child_block]
    document = DocumentLayout.from_pages([single_page])

    converted = common.document_to_element_list(document)

    parent_element, child_element = converted
    assert child_element.metadata.parent_id == parent_element.id
460
@pytest.mark.parametrize(
    ("sort_mode", "call_count"),
    [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
)
def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
    """`sort_page_elements` is called the expected number of times for each sort mode."""
    parent_block = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
    child_block = LayoutElement.from_coords(
        1, 2, 3, 4, text="block 2", parent=parent_block, type="NarrativeText"
    )
    single_page = PageLayout(number=1, image=MockImage())
    single_page.elements = [parent_block, child_block]
    document = DocumentLayout.from_pages([single_page])

    # Patch the sorter on the module under test so its invocations can be counted.
    with mock.patch.object(common, "sort_page_elements") as sorter_spy:
        common.document_to_element_list(document, sortable=True, sort_mode=sort_mode)
        observed_calls = sorter_spy.call_count

    assert observed_calls == call_count
486
def test_document_to_element_list_sets_category_depth_titles():
    """The four converted mock elements carry category depths 1, 2, None and 0, in order."""
    converted = common.document_to_element_list(MockDocumentLayout())

    def depth_at(position):
        # Category depth recorded on the element at the given list position.
        return converted[position].metadata.category_depth

    assert depth_at(0) == 1
    assert depth_at(1) == 2
    assert depth_at(2) is None
    assert depth_at(3) == 0
495
def test_ocr_data_to_elements(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """Check that OCR layout elements convert one-to-one, keeping category and coordinates.

    Args:
        filename: Path of the image whose size defines the pixel coordinate system.
    """
    text_regions = [
        TextRegion.from_coords(
            163.0,
            115.0,
            452.0,
            129.0,
            text="LayoutParser: A Unified Toolkit for Deep",
        ),
        TextRegion.from_coords(
            156.0,
            132.0,
            457.0,
            147.0,
            text="Learning Based Document Image Analysis",
        ),
    ]
    ocr_data = [
        LayoutElement(
            bbox=r.bbox,
            text=r.text,
            source=r.source,
            type=ElementType.UNCATEGORIZED_TEXT,
        )
        for r in text_regions
    ]
    # Only the dimensions are needed, so read them inside a context manager
    # and let it close the image file afterwards.
    with Image.open(filename) as image:
        image_size = image.size

    elements = common.ocr_data_to_elements(
        ocr_data=ocr_data,
        image_size=image_size,
    )

    assert len(ocr_data) == len(elements)
    assert {el.category for el in elements} == {ElementType.UNCATEGORIZED_TEXT}

    # check coordinates metadata
    image_width, image_height = image_size
    coordinate_system = PixelSpace(width=image_width, height=image_height)
    for el, layout_el in zip(elements, ocr_data):
        assert el.metadata.coordinates == CoordinatesMetadata(
            points=layout_el.bbox.coordinates,
            system=coordinate_system,
        )