# unstructured
# 298 lines · 9.5 KB
1import os
2import pathlib
3from unittest.mock import patch
4
5import pytest
6import requests
7
8from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
9from unstructured.chunking.title import chunk_by_title
10from unstructured.documents.elements import ElementType, Title
11from unstructured.partition.md import partition_md
12from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
13
# Resolved directory containing this test module; the tests below build example-doc paths from it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
15
16
def test_partition_md_from_filename():
    """Partitioning README.md by path yields elements whose metadata filename is "README.md"."""
    readme_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    elements = partition_md(filename=readme_path)

    categories = {elem.category for elem in elements}
    assert "PageBreak" not in categories
    assert len(elements) > 0
    assert all(element.metadata.filename == "README.md" for element in elements)
    # The detection origin is only checked when debug metadata is switched on.
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        origins = {element.metadata.detection_origin for element in elements}
        assert origins == {"md"}
26
27
def test_partition_md_from_filename_returns_uns_elements():
    """The first element partitioned from README.md is a ``Title`` instance."""
    path_parts = ("..", "..", "..", "example-docs", "README.md")
    elements = partition_md(filename=os.path.join(DIRECTORY, *path_parts))
    assert len(elements) > 0
    first_element = elements[0]
    assert isinstance(first_element, Title)
33
34
def test_partition_md_from_filename_with_metadata_filename():
    """An explicit ``metadata_filename`` is what every element reports as its filename."""
    source_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    elements = partition_md(filename=source_path, metadata_filename="test")

    assert all(elem.category != "PageBreak" for elem in elements)
    assert len(elements) > 0
    assert all(element.metadata.filename == "test" for element in elements)
42
43
def test_partition_md_from_file():
    """Partitioning from an open file handle leaves ``metadata.filename`` as ``None``."""
    readme_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(readme_path) as md_file:
        elements = partition_md(file=md_file)

    assert len(elements) > 0
    assert all(element.metadata.filename is None for element in elements)
51
52
def test_partition_md_from_file_with_metadata_filename():
    """A file-handle partition reports the supplied ``metadata_filename`` on every element."""
    readme_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(readme_path) as md_file:
        elements = partition_md(file=md_file, metadata_filename="test")

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "test"
59
60
def test_partition_md_from_text():
    """Partitioning raw markdown text yields elements with no filename in their metadata."""
    readme_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(readme_path) as md_file:
        markdown = md_file.read()

    elements = partition_md(text=markdown)

    assert len(elements) > 0
    assert all(element.metadata.filename is None for element in elements)
69
70
class MockResponse:
    """Minimal stand-in for the object returned by ``requests.get`` in the URL tests.

    It carries only the attributes set in ``__init__``: ``text``, ``status_code``,
    ``ok`` and ``headers``.
    """

    def __init__(self, text, status_code, headers=None):
        """Store the canned response data.

        Args:
            text: Body of the fake response.
            status_code: HTTP status code the fake response reports.
            headers: Optional mapping of response headers. When omitted, each
                instance gets its own fresh empty dict.
        """
        self.text = text
        self.status_code = status_code
        # Any status code below 300 counts as a successful response.
        self.ok = status_code < 300
        # Build a new dict per instance: a `{}` default in the signature would be
        # one shared object mutated across every instance created without headers.
        self.headers = headers if headers is not None else {}
77
78
def test_partition_md_from_url():
    """A 200 ``text/markdown`` response from the patched ``requests.get`` is partitioned."""
    readme_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(readme_path) as md_file:
        markdown = md_file.read()

    markdown_headers = {"Content-Type": "text/markdown"}
    fake_response = MockResponse(text=markdown, status_code=200, headers=markdown_headers)

    # No real network call is made: requests.get returns the canned response.
    with patch.object(requests, "get", return_value=fake_response):
        elements = partition_md(url="https://fake.url")

    assert len(elements) > 0
    assert all(element.metadata.filename is None for element in elements)
95
96
def test_partition_md_from_url_raises_with_bad_status_code():
    """A 500 response from the patched ``requests.get`` makes ``partition_md`` raise."""
    readme_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(readme_path) as md_file:
        body = md_file.read()

    failing_response = MockResponse(
        text=body,
        status_code=500,
        headers={"Content-Type": "text/html"},
    )

    with patch.object(requests, "get", return_value=failing_response):
        with pytest.raises(ValueError):
            partition_md(url="https://fake.url")
109
110
def test_partition_md_from_url_raises_with_bad_content_type():
    """A 200 response typed ``application/json`` makes ``partition_md`` raise ``ValueError``."""
    readme_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(readme_path) as md_file:
        body = md_file.read()

    json_typed_response = MockResponse(
        text=body,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )

    with patch.object(requests, "get", return_value=json_typed_response):
        with pytest.raises(ValueError):
            partition_md(url="https://fake.url")
123
124
def test_partition_md_raises_with_none_specified():
    """Calling ``partition_md`` with no arguments raises ``ValueError``."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_md()
128
129
def test_partition_md_raises_with_too_many_specified():
    """Passing both ``filename`` and ``text`` to ``partition_md`` raises ``ValueError``."""
    readme_path = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(readme_path) as md_file:
        markdown = md_file.read()

    conflicting_sources = {"filename": readme_path, "text": markdown}
    with pytest.raises(ValueError):
        partition_md(**conflicting_sources)
137
138
def test_partition_md_from_filename_exclude_metadata():
    """With ``include_metadata=False`` every element's metadata serializes to ``{}``."""
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    elements = partition_md(filename=filename, include_metadata=False)
    # Guard against a vacuous pass: the loop below checks nothing if no elements come back.
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.to_dict() == {}
144
145
def test_partition_md_from_file_exclude_metadata():
    """File-handle partitioning with ``include_metadata=False`` yields empty metadata dicts."""
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(filename) as f:
        elements = partition_md(file=f, include_metadata=False)
    # Guard against a vacuous pass: the loop below checks nothing if no elements come back.
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.to_dict() == {}
152
153
def test_partition_md_from_text_exclude_metadata():
    """Text partitioning with ``include_metadata=False`` yields empty metadata dicts."""
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
    with open(filename) as f:
        text = f.read()
    elements = partition_md(text=text, include_metadata=False)
    # Guard against a vacuous pass: the loop below checks nothing if no elements come back.
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.to_dict() == {}
161
162
def test_partition_md_metadata_date(
    mocker,
    filename="example-docs/README.md",
):
    """``last_modified`` equals the value returned by the patched ``get_last_modified_date``."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.md.get_last_modified_date",
        return_value=patched_date,
    )

    first_element = partition_md(filename=filename)[0]

    assert first_element.metadata.last_modified == patched_date
179
180
def test_partition_md_with_custom_metadata_date(
    mocker,
    filename="example-docs/README.md",
):
    """An explicit ``metadata_last_modified`` wins over the patched modification date."""
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.md.get_last_modified_date",
        return_value="2029-07-05T09:24:28",
    )

    elements = partition_md(filename=filename, metadata_last_modified=custom_date)

    first_element = elements[0]
    assert first_element.metadata.last_modified == custom_date
199
200
def test_partition_md_from_file_metadata_date(
    mocker,
    filename="example-docs/README.md",
):
    """For a binary file handle, ``last_modified`` is the patched from-file date."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.md.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename, "rb") as md_file:
        elements = partition_md(file=md_file)

    first_element = elements[0]
    assert first_element.metadata.last_modified == patched_date
218
219
def test_partition_md_from_file_with_custom_metadata_date(
    mocker,
    filename="example-docs/README.md",
):
    """For a file handle, an explicit ``metadata_last_modified`` wins over the patched date."""
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.md.get_last_modified_date_from_file",
        return_value="2029-07-05T09:24:28",
    )

    with open(filename, "rb") as md_file:
        elements = partition_md(
            file=md_file,
            metadata_last_modified=custom_date,
        )

    assert elements[0].metadata.last_modified == custom_date
236
237
def test_partition_md_from_text_metadata_date(
    filename="example-docs/README.md",
):
    """Elements partitioned from raw text have ``last_modified`` set to ``None``."""
    with open(filename) as md_file:
        markdown = md_file.read()

    first_element = partition_md(text=markdown)[0]

    assert first_element.metadata.last_modified is None
249
250
def test_partition_md_from_text_with_custom_metadata_date(
    filename="example-docs/README.md",
):
    """A text partition reports the supplied ``metadata_last_modified`` on its first element."""
    custom_date = "2020-07-05T09:24:28"

    with open(filename) as md_file:
        markdown = md_file.read()

    elements = partition_md(
        text=markdown,
        metadata_last_modified=custom_date,
    )

    assert elements[0].metadata.last_modified == custom_date
262
263
def test_partition_md_with_json():
    """Elements partitioned from README.md text pass the JSON round-trip assertion helper."""
    readme_path = example_doc_path("README.md")
    with open(readme_path) as md_file:
        markdown = md_file.read()

    elements = partition_md(text=markdown)

    assert_round_trips_through_JSON(elements)
269
270
def test_add_chunking_strategy_by_title_on_partition_md(
    filename="example-docs/README.md",
):
    """``chunking_strategy="by_title"`` gives the same result as ``chunk_by_title`` afterwards."""
    unchunked = partition_md(filename=filename)
    chunked_by_partition = partition_md(filename, chunking_strategy="by_title")
    expected_chunks = chunk_by_title(unchunked)

    # Chunking must actually change the output, and must match the standalone chunker.
    assert chunked_by_partition != unchunked
    assert chunked_by_partition == expected_chunks
279
280
def test_partition_md_element_metadata_has_languages():
    """The first element partitioned from README.md reports ``["eng"]`` as its languages."""
    # Locate the document through the shared `example_doc_path` helper rather than a
    # hard-coded "example-docs/..." string, which only works when pytest is launched
    # from the repository root.
    # NOTE(review): assumes the helper resolves names inside the same example-docs
    # directory regardless of the working directory -- confirm against unit_utils.
    filename = example_doc_path("README.md")
    elements = partition_md(filename=filename)
    assert elements[0].metadata.languages == ["eng"]
285
286
def test_partition_md_respects_detect_language_per_element():
    """With ``detect_language_per_element=True`` each element reports its own language list."""
    expected_langs = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
    elements = partition_md(
        filename="example-docs/language-docs/eng_spa_mult.md",
        detect_language_per_element=True,
    )
    detected_langs = [element.metadata.languages for element in elements]
    assert detected_langs == expected_langs
292
293
def test_partition_md_parse_table():
    """The first element partitioned from simple-table.md has the table category."""
    table_doc = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "simple-table.md")
    elements = partition_md(filename=table_doc)
    assert len(elements) > 0
    first_category = elements[0].category
    assert first_category == ElementType.TABLE
299