unstructured
638 строк · 23.3 Кб
1# pyright: reportPrivateUsage=false
2
3"""Test-suite for `unstructured.documents.elements` module."""
4
5from __future__ import annotations6
7import json8import pathlib9from functools import partial10
11import pytest12
13from unstructured.cleaners.core import clean_prefix14from unstructured.cleaners.translate import translate_text15from unstructured.documents.coordinates import (16CoordinateSystem,17Orientation,18RelativeCoordinateSystem,19)
20from unstructured.documents.elements import (21UUID,22ConsolidationStrategy,23CoordinatesMetadata,24DataSourceMetadata,25Element,26ElementMetadata,27NoID,28Points,29RegexMetadata,30Text,31)
32
33
def test_text_id():
    """`Text` constructed without an `element_id` reports the fixed id asserted below."""
    element = Text(text="hello there!")
    expected_id = "c69509590d81db2f37f9d75480c8efed"

    assert element.id == expected_id
38
def test_text_uuid():
    """`Text` built with `element_id=UUID()` exposes a 36-character, 4-hyphen string id.

    Also checks that the element's dict form can be passed to `json.dumps()` without raising.
    """
    text_element = Text(text="hello there!", element_id=UUID())

    element_id = text_element.id

    assert isinstance(element_id, str)
    assert len(element_id) == 36
    assert element_id.count("-") == 4
    # -- Test that the element is JSON serializable. This should run without an error --
    json.dumps(text_element.to_dict())
50
def test_element_defaults_to_blank_id():
    """An `Element` constructed with no arguments has a `NoID` instance as its id."""
    assert isinstance(Element().id, NoID)
55
def test_element_uuid():
    """An `Element` constructed with `element_id=UUID()` has a `UUID` instance as its id."""
    uuid_element = Element(element_id=UUID())
    element_id = uuid_element.id

    assert isinstance(element_id, UUID)
60
def test_text_element_apply_cleaners():
    """`.apply()` with one `clean_prefix` cleaner leaves the text without its `[1]` prefix."""
    element = Text(text="[1] A Textbook on Crocodile Habitats")
    strip_citation_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")

    element.apply(strip_citation_prefix)

    assert str(element) == "A Textbook on Crocodile Habitats"
67
def test_text_element_apply_multiple_cleaners():
    """`.apply()` accepts several cleaners; prefix-strip then translate gives the Russian text."""
    strip_citation_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    translate_to_russian = partial(translate_text, target_lang="ru")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation_prefix, translate_to_russian)

    assert str(element) == "Учебник по крокодильным средам обитания"
77
def test_apply_raises_if_func_does_not_produce_string():
    """`.apply()` raises `ValueError` when a cleaner returns a non-`str` value."""

    def returns_an_int(s: str):
        return 1

    element = Text(text="[1] A Textbook on Crocodile Habitats")
    expected_message = "Cleaner produced a non-string output."

    with pytest.raises(ValueError, match=expected_message):
        element.apply(returns_an_int)  # pyright: ignore[reportGeneralTypeIssues]
87
@pytest.mark.parametrize(
    ("coordinates", "orientation1", "orientation2", "expected_coords"),
    [
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.CARTESIAN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.SCREEN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.CARTESIAN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.SCREEN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
    ],
)
def test_convert_coordinates_to_new_system(
    coordinates: Points,
    orientation1: Orientation,
    orientation2: Orientation,
    expected_coords: Points,
):
    """Points convert from a 100x200 system to a 1000x2000 system for each orientation pair.

    The conversion is exercised twice: once via its return value and once with `in_place=True`,
    after which the element's coordinates metadata is inspected.
    """
    source_system = CoordinateSystem(100, 200)
    source_system.orientation = orientation1
    target_system = CoordinateSystem(1000, 2000)
    target_system.orientation = orientation2
    element = Element(coordinates=coordinates, coordinate_system=source_system)

    # -- the plain call hands back the converted points --
    converted_points = element.convert_coordinates_to_new_system(target_system)

    assert converted_points is not None
    for actual_point, expected_point in zip(converted_points, expected_coords):
        assert actual_point == pytest.approx(  # pyright: ignore[reportUnknownMemberType]
            expected_point
        )

    # -- the in-place call is checked through the element's coordinates metadata --
    element.convert_coordinates_to_new_system(target_system, in_place=True)

    assert element.metadata.coordinates is not None
    assert element.metadata.coordinates.points is not None
    for actual_point, expected_point in zip(element.metadata.coordinates.points, expected_coords):
        assert actual_point == pytest.approx(  # pyright: ignore[reportUnknownMemberType]
            expected_point
        )
    assert element.metadata.coordinates.system == target_system
141
def test_convert_coordinate_to_new_system_none():
    """Conversion returns `None` for an element built with no coordinates and no system."""
    element = Element(coordinates=None, coordinate_system=None)
    target_system = CoordinateSystem(100, 200)
    target_system.orientation = Orientation.SCREEN

    converted_points = element.convert_coordinates_to_new_system(target_system)

    assert converted_points is None
148
def test_element_constructor_coordinates_all_present():
    """Points plus a coordinate system given to `Element` end up in `metadata.coordinates`."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    system = RelativeCoordinateSystem()

    element = Element(coordinates=points, coordinate_system=system)

    assert element.metadata.coordinates == CoordinatesMetadata(points=points, system=system)
159
def test_element_constructor_coordinates_points_absent():
    """`Element` raises `ValueError` when given a coordinate system but no points."""
    expected_message = (
        "Coordinates points should not exist without coordinates system and vice versa."
    )

    with pytest.raises(ValueError) as exc_info:
        Element(coordinate_system=RelativeCoordinateSystem())

    assert str(exc_info.value) == expected_message
168
def test_element_constructor_coordinates_system_absent():
    """`Element` raises `ValueError` when given points but no coordinate system."""
    expected_message = (
        "Coordinates points should not exist without coordinates system and vice versa."
    )

    with pytest.raises(ValueError) as exc_info:
        Element(coordinates=((1, 2), (1, 4), (3, 4), (3, 2)))

    assert str(exc_info.value) == expected_message
177
def test_coordinate_metadata_serdes():
    """`CoordinatesMetadata` serializes to the expected dict and deserializes back to an equal."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    system = RelativeCoordinateSystem()
    original = CoordinatesMetadata(points=points, system=system)

    serialized = original.to_dict()

    assert serialized == {
        "layout_height": 1,
        "layout_width": 1,
        "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
        "system": "RelativeCoordinateSystem",
    }
    assert CoordinatesMetadata.from_dict(serialized) == original
192
def test_element_to_dict():
    """`Element.to_dict()` emits metadata (with coordinates), type, text and element_id."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    system = RelativeCoordinateSystem()
    element = Element(element_id="awt32t1", coordinates=points, coordinate_system=system)

    expected_coordinates = {
        "layout_height": 1,
        "layout_width": 1,
        "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
        "system": "RelativeCoordinateSystem",
    }
    expected = {
        "metadata": {"coordinates": expected_coordinates},
        "type": None,
        "text": "",
        "element_id": "awt32t1",
    }

    assert element.to_dict() == expected
216
def test_regex_metadata_round_trips_through_JSON():
    """`metadata.regex_metadata` survives dict -> JSON -> dict -> JSON with identical JSON text."""
    mail_stop_matches = [RegexMetadata(text="MS-107", start=18, end=24)]
    version_matches = [
        RegexMetadata(text="current=v1.7.2", start=7, end=21),
        RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
    ]
    metadata = ElementMetadata(
        regex_metadata={"mail-stop": mail_stop_matches, "version": version_matches}
    )

    first_json = json.dumps(metadata.to_dict())
    round_tripped = ElementMetadata.from_dict(json.loads(first_json))
    second_json = json.dumps(round_tripped.to_dict())

    assert second_json == first_json
234
class DescribeElementMetadata:
    """Unit-test suite for `unstructured.documents.elements.ElementMetadata`."""

    # -- Construction accepts known keyword arguments only. An unknown keyword is reported by
    # -- the type-checker at development time and raises at runtime, so a typo is caught before
    # -- it reaches production.

    def it_detects_unknown_constructor_args_at_both_development_time_and_runtime(self):
        """Passing the unknown keyword `file_name` raises `TypeError`."""
        expected_message = "got an unexpected keyword argument 'file_name'"

        with pytest.raises(TypeError, match=expected_message):
            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportGeneralTypeIssues]

    @pytest.mark.parametrize(
        "file_path",
        [
            pathlib.Path("documents/docx") / "memos" / "memo-2023-11-10.docx",
            "documents/docx/memos/memo-2023-11-10.docx",
        ],
    )
    def it_accommodates_either_a_pathlib_Path_or_str_for_its_filename_arg(
        self, file_path: pathlib.Path | str
    ):
        """A `Path` and a `str` filename produce the same directory/filename split."""
        metadata = ElementMetadata(filename=file_path)

        assert metadata.file_directory == "documents/docx/memos"
        assert metadata.filename == "memo-2023-11-10.docx"

    def it_leaves_both_filename_and_file_directory_None_when_neither_is_specified(self):
        """With no arguments, both `file_directory` and `filename` read as `None`."""
        metadata = ElementMetadata()

        assert metadata.file_directory is None
        assert metadata.filename is None

    @pytest.mark.parametrize("file_path", [pathlib.Path("memo.docx"), "memo.docx"])
    def and_it_leaves_file_directory_None_when_not_specified_and_filename_is_not_a_path(
        self, file_path: pathlib.Path | str
    ):
        """A bare file name (no directory part) leaves `file_directory` as `None`."""
        metadata = ElementMetadata(filename=file_path)

        assert metadata.file_directory is None
        assert metadata.filename == "memo.docx"

    def and_it_splits_off_directory_path_from_its_filename_arg_when_it_is_a_file_path(self):
        """A filename containing a directory part is split into directory and base name."""
        metadata = ElementMetadata(filename="documents/docx/memo-2023-11-11.docx")

        assert metadata.file_directory == "documents/docx"
        assert metadata.filename == "memo-2023-11-11.docx"

    def but_it_prefers_a_specified_file_directory_when_filename_also_contains_a_path(self):
        """An explicit `file_directory` wins over the directory part of `filename`."""
        metadata = ElementMetadata(
            filename="tmp/staging/memo.docx",
            file_directory="documents/docx",
        )

        assert metadata.file_directory == "documents/docx"
        assert metadata.filename == "memo.docx"

    # -- Known members are typed, so the type-checker can support callers. --

    def it_knows_the_types_of_its_known_members_so_type_checking_support_is_available(self):
        """Wrongly-typed arguments are flagged by pyright yet construct without raising."""
        ElementMetadata(
            category_depth="2",  # pyright: ignore[reportGeneralTypeIssues]
            file_directory=True,  # pyright: ignore[reportGeneralTypeIssues]
            text_as_html=42,  # pyright: ignore[reportGeneralTypeIssues]
        )
        # -- no runtime type-check happens though (validation overhead is deliberately avoided) --

    # -- A field's value is stored only when it is not None. --

    def it_returns_the_value_of_an_attribute_it_has(self):
        """A field given at construction is present in the instance dict and readable."""
        metadata = ElementMetadata(url="https://google.com")

        assert "url" in vars(metadata)
        assert metadata.url == "https://google.com"

    def and_it_returns_None_for_a_known_attribute_it_does_not_have(self):
        """A known field that was never set is absent from the instance dict but reads `None`."""
        metadata = ElementMetadata()

        assert "url" not in vars(metadata)
        assert metadata.url is None

    def but_it_raises_AttributeError_for_an_unknown_attribute_it_does_not_have(self):
        """Reading an unknown, never-assigned attribute raises `AttributeError`."""
        metadata = ElementMetadata()

        assert "coefficient" not in vars(metadata)
        with pytest.raises(AttributeError, match="object has no attribute 'coefficient'"):
            metadata.coefficient

    def it_stores_a_non_None_field_value_when_assigned(self):
        """Assigning a non-None value to a known field adds it to the instance dict."""
        metadata = ElementMetadata()
        assert "file_directory" not in vars(metadata)

        metadata.file_directory = "tmp/"

        assert "file_directory" in vars(metadata)
        assert metadata.file_directory == "tmp/"

    def it_removes_a_field_when_None_is_assigned_to_it(self):
        """Assigning `None` to a stored known field removes it from the instance dict."""
        metadata = ElementMetadata(file_directory="tmp/")
        assert "file_directory" in vars(metadata)
        assert metadata.file_directory == "tmp/"

        metadata.file_directory = None

        assert "file_directory" not in vars(metadata)
        assert metadata.file_directory is None

    # -- Serialization to a dict ------------------------------------------------------------------

    def it_can_serialize_itself_to_a_dict(self):
        """`to_dict()` returns exactly the fields that were set."""
        metadata = ElementMetadata(
            category_depth=1,
            file_directory="tmp/",
            page_number=2,
            text_as_html="<table></table>",
            url="https://google.com",
        )
        expected = {
            "category_depth": 1,
            "file_directory": "tmp/",
            "page_number": 2,
            "text_as_html": "<table></table>",
            "url": "https://google.com",
        }

        assert metadata.to_dict() == expected

    def and_it_serializes_a_coordinates_sub_object_to_a_dict_when_it_is_present(self):
        """A `CoordinatesMetadata` value appears as a nested dict in `to_dict()` output."""
        coordinates = CoordinatesMetadata(
            points=((2, 2), (1, 4), (3, 4), (3, 2)),
            system=RelativeCoordinateSystem(),
        )
        metadata = ElementMetadata(category_depth=1, coordinates=coordinates, page_number=2)
        expected = {
            "category_depth": 1,
            "coordinates": {
                "layout_height": 1,
                "layout_width": 1,
                "points": ((2, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
            "page_number": 2,
        }

        assert metadata.to_dict() == expected

    def and_it_serializes_a_data_source_sub_object_to_a_dict_when_it_is_present(self):
        """A `DataSourceMetadata` value appears as a nested dict in `to_dict()` output."""
        data_source = DataSourceMetadata(
            url="https://www.nih.gov/about-nih/who-we-are/nih-director",
            date_created="2023-11-09",
        )
        metadata = ElementMetadata(category_depth=1, data_source=data_source, page_number=2)
        expected = {
            "category_depth": 1,
            "data_source": {
                "url": "https://www.nih.gov/about-nih/who-we-are/nih-director",
                "date_created": "2023-11-09",
            },
            "page_number": 2,
        }

        assert metadata.to_dict() == expected

    def but_unlike_in_ElementMetadata_unknown_fields_in_sub_objects_are_ignored(self):
        """Metadata sub-objects drop fields they do not explicitly define.

        This is _not_ the case for ElementMetadata itself, where a non-known field is welcomed
        as a user-defined ad-hoc metadata field.
        """
        source_dict = {
            "new_field": "hello",
            "data_source": {
                "new_field": "world",
            },
            "coordinates": {
                "new_field": "foo",
            },
        }

        round_tripped = ElementMetadata.from_dict(source_dict).to_dict()

        assert "new_field" in round_tripped
        assert "new_field" not in round_tripped["coordinates"]
        assert "new_field" not in round_tripped["data_source"]

    # -- Deserialization from a dict --------------------------------------------------------------

    def it_can_deserialize_itself_from_a_dict(self):
        """`from_dict()` restores known fields, sub-objects and non-known fields from a dict."""
        source_dict = {
            "category_depth": 1,
            "coefficient": 0.58,
            "coordinates": {
                "layout_height": 4,
                "layout_width": 2,
                "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
            "data_source": {
                "url": "https://www.nih.gov/about-nih/who-we-are/nih-director",
                "date_created": "2023-11-09",
            },
            "languages": ["eng"],
        }

        metadata = ElementMetadata.from_dict(source_dict)

        # -- a known field in the dict shows up on the instance --
        assert metadata.category_depth == 1

        # -- known sub-object fields in the dict are rebuilt as their sub-object types --
        expected_coordinates = CoordinatesMetadata(
            points=((1, 2), (1, 4), (3, 4), (3, 2)),
            system=RelativeCoordinateSystem(),
        )
        assert metadata.coordinates == expected_coordinates
        expected_data_source = DataSourceMetadata(
            url="https://www.nih.gov/about-nih/who-we-are/nih-director",
            date_created="2023-11-09",
        )
        assert metadata.data_source == expected_data_source

        # -- a known field missing from the dict reads None and is not stored on the instance --
        assert metadata.file_directory is None
        assert "file_directory" not in vars(metadata)

        # -- a non-known field in the dict is kept on the instance (there is no way to tell
        # -- whether it is "ad-hoc" or not because user-intent is not indicated)
        assert metadata.coefficient == 0.58

        # -- an ad-hoc field missing from the dict raises when accessed --
        with pytest.raises(AttributeError, match="ntMetadata' object has no attribute 'quotient'"):
            metadata.quotient

        # -- though an end-user can guard the access with `hasattr()` --
        quotient = metadata.quotient if hasattr(metadata, "quotient") else None
        assert quotient is None

        # -- mutating a mutable (collection) field leaves the source dict's value untouched --
        assert isinstance(metadata.languages, list)
        assert metadata.languages == ["eng"]
        metadata.languages.append("spa")
        assert metadata.languages == ["eng", "spa"]
        assert source_dict["languages"] == ["eng"]

    # -- Downstream users may add an arbitrary new member by assignment. --------------------------

    def it_allows_an_end_user_to_add_an_arbitrary_field(self):
        """Assigning to a non-known attribute stores it on the instance."""
        metadata = ElementMetadata()

        metadata.foobar = 7

        assert "foobar" in vars(metadata)
        assert metadata.foobar == 7

    def and_fields_so_added_appear_in_the_metadata_JSON(self):
        """A field added by assignment is included in `to_dict()` output."""
        metadata = ElementMetadata()

        metadata.foobar = 7

        assert metadata.to_dict() == {"foobar": 7}

    def and_it_removes_an_end_user_field_when_it_is_assigned_None(self):
        """Assigning `None` to an ad-hoc field removes it, so reading it then raises."""
        metadata = ElementMetadata()
        metadata.foobar = 7
        assert "foobar" in vars(metadata)

        metadata.foobar = None

        assert "foobar" not in vars(metadata)
        with pytest.raises(
            AttributeError, match="'ElementMetadata' object has no attribute 'foobar'"
        ):
            metadata.foobar

    # -- Updating from another instance -----------------------------------------------------------

    def it_can_update_itself_from_another_instance(self):
        """`update()` merges the other instance's fields into self and leaves other unchanged."""
        target = ElementMetadata(category_depth=1, page_number=1)
        target.coefficient = 0.58
        target.stem_length = 18
        source = ElementMetadata(file_directory="tmp/", page_number=2)
        source.quotient = 1.4
        source.stem_length = 20

        target.update(source)

        # -- a known field only on the target keeps its value --
        assert target.category_depth == 1
        # -- a known field only on the source is added --
        assert target.file_directory == "tmp/"
        # -- a known field on both takes the source's value --
        assert target.page_number == 2
        # -- an ad-hoc field only on the target keeps its value --
        assert target.coefficient == 0.58
        # -- an ad-hoc field only on the source is added --
        assert target.quotient == 1.4
        # -- an ad-hoc field on both takes the source's value --
        assert target.stem_length == 20
        # -- the source itself is not modified --
        assert source.category_depth is None
        assert source.file_directory == "tmp/"
        assert source.page_number == 2
        assert source.text_as_html is None
        assert source.url is None
        assert source.quotient == 1.4
        assert source.stem_length == 20
        with pytest.raises(AttributeError, match="etadata' object has no attribute 'coefficient'"):
            source.coefficient

    def but_it_raises_on_attempt_to_update_from_a_non_ElementMetadata_object(self):
        """`update()` called with a plain dict raises `ValueError`."""
        metadata = ElementMetadata()
        expected_message = r"ate\(\)' must be an instance of 'ElementMetadata'"

        with pytest.raises(ValueError, match=expected_message):
            metadata.update({"coefficient": "0.56"})  # pyright: ignore[reportGeneralTypeIssues]

    # -- Equality with another instance -----------------------------------------------------------

    def it_is_equal_to_another_instance_with_the_same_known_field_values(self):
        """Two instances built separately from identical arguments compare equal."""

        def build_metadata():
            # -- every call builds fresh sub-objects, so the two sides share no instances --
            return ElementMetadata(
                category_depth=1,
                coordinates=CoordinatesMetadata(
                    points=((1, 2), (1, 4), (3, 4), (3, 2)),
                    system=RelativeCoordinateSystem(),
                ),
                data_source=DataSourceMetadata(
                    url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                    date_created="2023-11-08",
                ),
                file_directory="tmp/",
                languages=["eng"],
                page_number=2,
                text_as_html="<table></table>",
                url="https://google.com",
            )

        assert build_metadata() == build_metadata()

    def but_it_is_never_equal_to_a_non_ElementMetadata_object(self):
        """An object of another type is unequal even when its instance dict matches."""

        class NotElementMetadata:
            pass

        metadata = ElementMetadata()
        impostor = NotElementMetadata()

        # -- the instance dicts ("fields") compare equal --
        assert vars(metadata) == vars(impostor)
        # -- yet the objects compare unequal; only their types differ --
        assert metadata != impostor

    def it_is_equal_to_another_instance_with_the_same_ad_hoc_field_values(self):
        """Instances with matching known and ad-hoc field values compare equal."""
        first = ElementMetadata(category_depth=1)
        first.coefficient = 0.58
        second = ElementMetadata(category_depth=1)
        second.coefficient = 0.58

        assert first == second

    def but_it_is_not_equal_to_an_instance_with_ad_hoc_fields_that_differ(self):
        """Instances whose ad-hoc field values differ compare unequal."""
        first = ElementMetadata(category_depth=1)
        first.coefficient = 0.58
        second = ElementMetadata(category_depth=1)
        second.coefficient = 0.72

        assert first != second

    def it_is_not_equal_when_a_list_field_contains_different_items(self):
        """Instances whose `languages` lists differ compare unequal."""
        one_language = ElementMetadata(languages=["eng"])
        two_languages = ElementMetadata(languages=["eng", "spa"])

        assert one_language != two_languages

    def and_it_is_not_equal_when_the_coordinates_sub_object_field_differs(self):
        """Instances whose coordinates points differ compare unequal."""
        first = ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=((1, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            )
        )
        second = ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=((2, 2), (2, 4), (3, 4), (4, 2)),
                system=RelativeCoordinateSystem(),
            )
        )

        assert first != second

    def and_it_is_not_equal_when_the_data_source_sub_object_field_differs(self):
        """Instances whose data-source `date_created` values differ compare unequal."""
        first = ElementMetadata(
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-08",
            )
        )
        second = ElementMetadata(
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-09",
            )
        )

        assert first != second

    # -- Every known field has a consolidation-strategy -------------------------------------------

    def it_can_find_the_consolidation_strategy_for_each_of_its_known_fields(self):
        """Each name in `_known_field_names` is a key of the consolidation-strategy mapping."""
        known_field_names = sorted(ElementMetadata()._known_field_names)
        strategies = ConsolidationStrategy.field_consolidation_strategies()

        for field_name in known_field_names:
            assert field_name in strategies, (
                f"ElementMetadata field `.{field_name}` does not have a consolidation strategy."
                f" Add one in `ConsolidationStrategy.field_consolidation_strategies()."
            )