# unstructured
# 546 lines · 18.6 KB
1# pyright: reportPrivateUsage=false
2
3from __future__ import annotations4
5import json6import os7import pathlib8from typing import Optional, Type9
10import pytest11from pytest_mock import MockerFixture12
13from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path14from unstructured.chunking.title import chunk_by_title15from unstructured.cleaners.core import group_broken_paragraphs16from unstructured.documents.elements import Address, ListItem, NarrativeText, Title17from unstructured.partition.text import (18_combine_paragraphs_less_than_min,19_split_content_to_fit_max,20partition_text,21)
22from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA23
# Absolute path of the directory that holds this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# Sample documents are looked up two directory levels above this module.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")

# Elements the tests below expect `partition_text` to return for the "fake-text" sample files.
EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test document to use for unit tests."),
    Address(text="Doylestown, PA 18901"),
    Title(text="Important points:"),
    ListItem(text="Hamburgers are delicious"),
    ListItem(text="Dogs are the best"),
    ListItem(text="I love fuzzy blankets"),
]
35
# Single-line story used by `test_split_content_to_fit_max`.
# Built from adjacent string literals with an explicit leading space on each continuation, so
# the words at the joins stay separated ("matter because", "boring, but") and the value
# contains no newlines regardless of how the source file is wrapped.
MIN_MAX_TEXT = (
    "This is a story. This is a story that doesn't matter"
    " because it is just being used as an example. Hi. Hello. Howdy. Hola."
    " The example is simple and repetitive and long and somewhat boring,"
    " but it serves a purpose. End."
)

# The same story as eight paragraphs separated by blank lines; several paragraphs are only a
# few characters long, which is what the min/max partition tests rely on.
SHORT_PARAGRAPHS = """This is a story.

This is a story that doesn't matter because it is just being used as an example.

Hi.

Hello.

Howdy.

Hola.

The example is simple and repetitive and long and somewhat boring, but it serves a purpose.

End.
"""
60
61
@pytest.mark.parametrize(
    ("filename", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_from_filename(filename: str, encoding: Optional[str]):
    """Partitioning by path yields EXPECTED_OUTPUT and records the bare file name."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    elements = partition_text(filename=doc_path, encoding=encoding)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename == filename for e in elements)
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        origins = {e.metadata.detection_origin for e in elements}
        assert origins == {"text"}
80
def test_partition_text_from_filename_with_metadata_filename():
    """An explicit `metadata_filename` is what ends up in each element's metadata."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    elements = partition_text(filename=doc_path, encoding="utf-8", metadata_filename="test")

    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename == "test" for e in elements)
92
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_filename_default_encoding(filename: str):
    """Non-UTF-8 sample files partition correctly by path when no encoding is passed."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    elements = partition_text(filename=doc_path)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename == filename for e in elements)
105
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("fake-text.txt", "utf-16", UnicodeDecodeError),
        ("fake-text-utf-16-be.txt", "utf-16", UnicodeError),
    ],
)
def test_partition_text_from_filename_raises_econding_error(
    filename: str,
    encoding: Optional[str],
    error: Type[BaseException],
):
    """Partitioning a file with a mismatched `encoding` raises the parametrized error type."""
    # Build the path outside the `raises` block so only `partition_text` can satisfy it.
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    with pytest.raises(error):
        partition_text(filename=filename_path, encoding=encoding)
122
def test_partition_text_from_file():
    """Partitioning an open binary file yields EXPECTED_OUTPUT with no filename in metadata."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    with open(doc_path, "rb") as stream:
        elements = partition_text(file=stream)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
132
def test_partition_text_from_file_with_metadata_filename():
    """`metadata_filename` is recorded on every element when partitioning a file object."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    with open(doc_path, "rb") as stream:
        elements = partition_text(file=stream, metadata_filename="test")

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename == "test" for e in elements)
142
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_file_default_encoding(filename: str):
    """Non-UTF-8 sample files partition correctly from a file object with no encoding given."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    with open(doc_path, "rb") as stream:
        elements = partition_text(file=stream)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
156
def test_partition_text_from_bytes_file():
    """A file opened in "rb" mode partitions to EXPECTED_OUTPUT without a metadata filename."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    with open(doc_path, "rb") as binary_stream:
        elements = partition_text(file=binary_stream)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
166
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_bytes_file_default_encoding(filename: str):
    """Non-UTF-8 files opened in "rb" mode partition correctly with no encoding given."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    with open(doc_path, "rb") as binary_stream:
        elements = partition_text(file=binary_stream)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
180
def test_text_partition_element_metadata_user_provided_languages():
    """A user-supplied `languages=["en"]` is reported as ["eng"] on the first element."""
    # Resolve the sample via EXAMPLE_DOCS_DIRECTORY, like the other tests in this module, so
    # the test does not depend on the directory pytest is launched from.
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")

    elements = partition_text(filename=filename, strategy="fast", languages=["en"])

    assert elements[0].metadata.languages == ["eng"]
186
def test_partition_text_from_text():
    """Partitioning raw text yields EXPECTED_OUTPUT with no filename in metadata."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    # Other tests in this module partition this file as UTF-8; read it the same way here
    # instead of relying on the platform's locale-dependent default encoding.
    with open(filename, encoding="utf-8") as f:
        text = f.read()

    elements = partition_text(text=text)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None
197
def test_partition_text_from_text_works_with_empty_string():
    """An empty string partitions to an empty list."""
    elements = partition_text(text="")
    assert elements == []
201
def test_partition_text_raises_with_none_specified():
    """Calling `partition_text` with no source argument raises ValueError."""
    with pytest.raises(ValueError):
        partition_text()
206
def test_partition_text_raises_with_too_many_specified():
    """Passing both `filename` and `text` raises ValueError."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    # Other tests in this module partition this file as UTF-8; read it the same way here
    # instead of relying on the platform's locale-dependent default encoding.
    with open(filename, encoding="utf-8") as f:
        text = f.read()

    with pytest.raises(ValueError):
        partition_text(filename=filename, text=text)
215
def test_partition_text_captures_everything_even_with_linebreaks():
    """Text wrapped in leading/trailing newlines still yields both the title and the address."""
    # NOTE(review): any indentation inside the original multi-line literal is not visible in
    # this view of the file; the lines are reproduced here without leading whitespace — confirm.
    text = "\nVERY IMPORTANT MEMO\nDOYLESTOWN, PA 18901\n"

    elements = partition_text(text=text)

    expected = [
        Title(text="VERY IMPORTANT MEMO"),
        Address(text="DOYLESTOWN, PA 18901"),
    ]
    assert elements == expected
    assert all(e.metadata.filename is None for e in elements)
229
def test_partition_text_groups_broken_paragraphs():
    """With `group_broken_paragraphs` as grouper, hard-wrapped lines merge into paragraphs."""
    text = (
        "The big brown fox\n"
        "was walking down the lane.\n"
        "\n"
        "At the end of the lane,\n"
        "the fox met a bear."
    )

    elements = partition_text(text=text, paragraph_grouper=group_broken_paragraphs)

    expected = [
        NarrativeText(text="The big brown fox was walking down the lane."),
        NarrativeText(text="At the end of the lane, the fox met a bear."),
    ]
    assert elements == expected
    assert all(e.metadata.filename is None for e in elements)
245
def test_partition_text_extract_regex_metadata():
    """A `regex_metadata` pattern match is recorded with its text and start/end offsets."""
    speaker_patterns = {"speaker": r"SPEAKER \d{1,3}"}

    elements = partition_text(
        text="SPEAKER 1: It is my turn to speak now!",
        regex_metadata=speaker_patterns,
    )

    expected_matches = {"speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}]}
    assert elements[0].metadata.regex_metadata == expected_matches
    assert all(e.metadata.filename is None for e in elements)
256
def test_partition_text_splits_long_text():
    """The long sample document partitions into elements with the expected first/last text."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")

    elements = partition_text(filename=doc_path)

    assert len(elements) > 0
    first, last = elements[0], elements[-1]
    assert first.text.startswith("Iwan Roberts")
    assert last.text.endswith("External links")
264
def test_partition_text_splits_long_text_max_partition():
    """`max_partition=500` caps element length without changing the combined text."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    unbounded = partition_text(filename=doc_path)
    bounded = partition_text(filename=doc_path, max_partition=500)

    # NOTE(klaijan) - I edited the operation here from < to <=
    # Please revert back if this does not make sense
    assert len(unbounded) <= len(bounded)
    assert all(len(e.text) <= 500 for e in bounded)

    # Make sure combined text is all the same
    unbounded_text = " ".join(e.text for e in unbounded)
    bounded_text = " ".join(e.text for e in bounded)
    assert unbounded_text == bounded_text
278
def test_partition_text_splits_max_min_partition():
    """With min=1000/max=1500, every element but the last is in range and text is preserved."""
    import re

    from unstructured.nlp.patterns import BULLETS_PATTERN

    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    unbounded = partition_text(filename=doc_path)
    bounded = partition_text(filename=doc_path, min_partition=1000, max_partition=1500)

    # NOTE(robinson) - the last element does not have a next element to merge with,
    # so it can be short; only the elements before it are size-checked.
    for element in bounded[:-1]:
        assert len(element.text) <= 1500
        assert len(element.text) >= 1000

    # NOTE(klaijan) - clean the asterisk out of both texts.
    # `unbounded` was partitioned by new line, which makes line 56 of the document
    # "*Club domestic league appearances and goals"
    # be considered a bullet point by the function is_bulleted_text,
    # and so the asterisk was removed from the paragraph,
    # whereas `bounded` was partitioned differently and thus none of its lines
    # starts with any of the BULLETS_PATTERN.

    # TODO(klaijan) - when partition_text is edited to support non-bullet paragraphs
    # that start with a bullet-like BULLETS_PATTERN, remove the re.sub part from the assert below.

    # Make sure combined text is all the same
    unbounded_text = re.sub(BULLETS_PATTERN, "", " ".join(e.text for e in unbounded))
    bounded_text = re.sub(BULLETS_PATTERN, "", " ".join(e.text for e in bounded))
    assert unbounded_text == bounded_text
312
def test_partition_text_min_max():
    """`min_partition` / `max_partition` bound the length of every segment except the last."""
    # NOTE(robinson) - the last element does not have a next element to merge with,
    # so it can be short; each check below therefore skips the final segment.
    min_only = partition_text(text=SHORT_PARAGRAPHS, min_partition=6)
    for segment in min_only[:-1]:
        assert len(segment.text) >= 6

    min_and_max = partition_text(text=SHORT_PARAGRAPHS, max_partition=20, min_partition=7)
    for segment in min_and_max[:-1]:
        assert len(segment.text) >= 7
        assert len(segment.text) <= 20
329
def test_split_content_to_fit_max():
    """`_split_content_to_fit_max` breaks MIN_MAX_TEXT into the expected segments at max 75."""
    expected_segments = [
        "This is a story.",
        "This is a story that doesn't matter because",
        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
        "The example is simple and repetitive and long",
        "and somewhat boring, but it serves a purpose. End.",
    ]

    segments = _split_content_to_fit_max(content=MIN_MAX_TEXT, max_partition=75)

    assert segments == expected_segments
343
def test_combine_paragraphs_less_than_min():
    """Combining paragraphs shorter than `min_partition` yields fewer segments than inputs."""
    paragraphs = SHORT_PARAGRAPHS.split("\n\n")

    segments = _combine_paragraphs_less_than_min(
        paragraphs,
        max_partition=1500,
        min_partition=7,
    )

    # Compare against the number of input paragraphs. Comparing against
    # `len(SHORT_PARAGRAPHS)` (a character count) would pass even if nothing was combined.
    assert len(segments) < len(paragraphs)
352
def test_partition_text_doesnt_get_page_breaks():
    """A line of dashes comes back as one element with unchanged text that is not a ListItem."""
    dashes = "--------------------"

    elements = partition_text(text=dashes)

    assert len(elements) == 1
    only_element = elements[0]
    assert only_element.text == dashes
    assert not isinstance(only_element, ListItem)
360
@pytest.mark.parametrize(
    ("filename", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_from_filename_exclude_metadata(filename: str, encoding: Optional[str]):
    """With `include_metadata=False`, every element's metadata serializes to an empty dict."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    elements = partition_text(
        filename=filename,
        encoding=encoding,
        include_metadata=False,
    )

    # Guard against a vacuous pass: the loop below checks nothing if no elements come back.
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.to_dict() == {}
379
def test_partition_text_from_file_exclude_metadata():
    """With `include_metadata=False`, elements from a file object carry empty metadata."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    with open(filename, "rb") as f:
        elements = partition_text(file=f, include_metadata=False)

    # Guard against a vacuous pass: the loop below checks nothing if no elements come back.
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.to_dict() == {}
387
def test_partition_text_metadata_date(mocker: MockerFixture):
    """`last_modified` takes the value returned by the patched `get_last_modified_date`."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.text.get_last_modified_date",
        return_value=patched_date,
    )
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    elements = partition_text(filename=doc_path)

    assert elements[0].metadata.last_modified == patched_date
403
def test_partition_text_with_custom_metadata_date(mocker: MockerFixture):
    """An explicit `metadata_last_modified` wins over the patched `get_last_modified_date`."""
    patched_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.text.get_last_modified_date",
        return_value=patched_date,
    )
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    elements = partition_text(filename=doc_path, metadata_last_modified=custom_date)

    assert elements[0].metadata.last_modified == custom_date
421
def test_partition_text_from_file_metadata_date(mocker: MockerFixture):
    """For a file object, `last_modified` is the patched `get_last_modified_date_from_file`."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.text.get_last_modified_date_from_file",
        return_value=patched_date,
    )
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    with open(doc_path, "rb") as stream:
        elements = partition_text(file=stream)

    assert elements[0].metadata.last_modified == patched_date
438
def test_partition_text_from_file_with_custom_metadata_date(mocker: MockerFixture):
    """For a file object, an explicit `metadata_last_modified` wins over the patched helper."""
    patched_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.text.get_last_modified_date_from_file",
        return_value=patched_date,
    )
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    with open(doc_path, "rb") as stream:
        elements = partition_text(file=stream, metadata_last_modified=custom_date)

    assert elements[0].metadata.last_modified == custom_date
454
def test_partition_text_from_text_metadata_date():
    """Partitioning raw text leaves `last_modified` unset (None)."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    # Other tests in this module partition this file as UTF-8; read it the same way here
    # instead of relying on the platform's locale-dependent default encoding.
    with open(filename, encoding="utf-8") as f:
        text = f.read()

    elements = partition_text(text=text)

    assert elements[0].metadata.last_modified is None
465
def test_partition_text_from_text_with_custom_metadata_date():
    """An explicit `metadata_last_modified` is recorded when partitioning raw text."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    expected_last_modification_date = "2020-07-05T09:24:28"

    # Other tests in this module partition this file as UTF-8; read it the same way here
    # instead of relying on the platform's locale-dependent default encoding.
    with open(filename, encoding="utf-8") as f:
        text = f.read()

    elements = partition_text(text=text, metadata_last_modified=expected_last_modification_date)

    assert elements[0].metadata.last_modified == expected_last_modification_date
477
def test_partition_text_with_unique_ids():
    """Default ids are a fixed hash; `unique_element_ids=True` gives 36-char, 4-hyphen ids."""
    elements = partition_text(text="hello there!")
    assert elements[0].id == "c69509590d81db2f37f9d75480c8efed"
    # Test that the element is JSON serializable. This should run without an error
    json.dumps(elements[0].to_dict())

    elements = partition_text(text="hello there!", unique_element_ids=True)
    # Named `element_id` rather than `id` so the builtin `id()` is not shadowed.
    element_id = elements[0].id
    assert isinstance(element_id, str)  # included for type-narrowing
    assert len(element_id) == 36
    assert element_id.count("-") == 4
    # Test that the element is JSON serializable. This should run without an error
    json.dumps(elements[0].to_dict())
492
@pytest.mark.parametrize(
    ("file_name", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_with_json(file_name: str, encoding: str | None):
    """Partitioned elements are handed to `assert_round_trips_through_JSON`."""
    doc_path = example_doc_path(file_name)
    elements = partition_text(doc_path, encoding=encoding)
    assert_round_trips_through_JSON(elements)
505
def test_add_chunking_strategy_on_partition_text():
    """`chunking_strategy="by_title"` gives the same result as calling `chunk_by_title`."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")

    unchunked = partition_text(filename=doc_path)
    chunked_by_partitioner = partition_text(doc_path, chunking_strategy="by_title")
    chunked_explicitly = chunk_by_title(unchunked)

    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_explicitly
514
def test_partition_text_element_metadata_has_languages():
    """The first element of the sample document reports ["eng"] as its languages."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")

    first_element = partition_text(filename=doc_path)[0]

    assert first_element.metadata.languages == ["eng"]
520
def test_partition_text_respects_detect_language_per_element():
    """With `detect_language_per_element=True`, each element carries its own language list."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "language-docs", "eng_spa_mult.txt")
    expected_langs = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition_text(filename=doc_path, detect_language_per_element=True)

    detected_langs = [e.metadata.languages for e in elements]
    assert detected_langs == expected_langs
527
def test_partition_text_respects_languages_arg():
    """An explicit `languages=["deu"]` is what the first element's metadata reports."""
    doc_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")

    first_element = partition_text(filename=doc_path, languages=["deu"])[0]

    assert first_element.metadata.languages == ["deu"]
533
def test_partition_text_element_metadata_raises_TypeError():
    """Passing `languages` as a plain string instead of a list raises TypeError."""
    # Build the path outside the `raises` block: `os.path.join` can itself raise TypeError,
    # which would let the test pass without `partition_text` rejecting the argument.
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")

    with pytest.raises(TypeError):
        partition_text(filename=filename, languages="eng")  # type: ignore
539
def test_partition_text_detects_more_than_3_languages():
    """Per-element detection on the multilingual sample finds over 10 distinct first languages."""
    doc_path = os.path.join(
        EXAMPLE_DOCS_DIRECTORY,
        "language-docs",
        "UDHR_first_article_all.txt",
    )

    elements = partition_text(filename=doc_path, detect_language_per_element=True)

    # Collect the first-listed language of every element that has any languages at all.
    primary_langs = set()
    for element in elements:
        languages = element.metadata.languages
        if languages:
            primary_langs.add(languages[0])
    assert len(primary_langs) > 10