unstructured
80 строк · 2.5 Кб
1# pyright: reportPrivateUsage=false
2
3import os
4from pathlib import Path
5
6import pytest
7from lxml import etree
8
9from unstructured.documents.xml import XMLDocument
10
# Absolute path of the directory containing this test module; the stylesheet tests
# build their example-document paths relative to it.
FILEPATH = Path(__file__).absolute().parent
12
13
@pytest.fixture()
def sample_document():
    """Return a small SEC-style document used as input text by the tests.

    The markup is deliberately loose: the TYPE and COMPANY tags are never closed and
    the ampersand in the company name is left unescaped.
    """
    # The literal must open with exactly three double quotes; a fourth one would
    # prepend a stray `"` character to the document text.
    return """<SEC-DOCUMENT>
<TYPE>10-K
<COMPANY>Proctor & Gamble
</SEC-DOCUMENT>"""
20
21
def test_read_xml(sample_document, tmpdir):
    """Write the sample document to disk, load it from the file, and check its tags."""
    # Write inside the per-test temporary directory itself. `tmpdir.dirname` is the
    # *parent* of that directory, so a file placed there is not isolated to this test.
    filename = os.path.join(str(tmpdir), "sample-document.xml")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(sample_document)

    xml_document = XMLDocument.from_file(filename=filename)
    document_tree = xml_document.document_tree

    type_tag = document_tree.find(".//type")
    # Fail with a clear assertion (not an AttributeError) if the tag is missing.
    assert type_tag is not None
    assert type_tag.text.strip() == "10-K"

    # Test to make sure the & character is retained
    company_tag = document_tree.find(".//company")
    assert company_tag is not None
    assert company_tag.text.strip() == "Proctor & Gamble"
35
36
def test_xml_read_raises():
    """Calling `_parse_pages_from_element_tree` on a plain XMLDocument must raise."""
    document = XMLDocument()
    with pytest.raises(NotImplementedError):
        document._parse_pages_from_element_tree()
41
42
def test_from_string(sample_document):
    """Build a document directly from text and check the text of its TYPE tag."""
    tree = XMLDocument.from_string(sample_document).document_tree
    type_element = tree.find(".//type")
    assert type_element.text.strip() == "10-K"
47
48
def test_from_string_with_pre_tag():
    """Check that the TYPE tag is still found when the document is wrapped in <pre>."""
    wrapped_text = """
<pre>
<SEC-DOCUMENT>
<TYPE>10-K
<COMPANY>Proctor & Gamble
</SEC-DOCUMENT>
</pre>
"""
    tree = XMLDocument.from_string(wrapped_text).document_tree
    type_element = tree.find(".//type")
    assert type_element.text.strip() == "10-K"
61
62
def test_read_with_stylesheet():
    """Load factbook.xml with its XSL stylesheet and count the resulting table parts."""
    example_docs = os.path.join(FILEPATH, "..", "..", "example-docs")
    xml_path = os.path.join(example_docs, "factbook.xml")
    xsl_path = os.path.join(example_docs, "unsupported", "factbook.xsl")

    tree = XMLDocument.from_file(filename=xml_path, stylesheet=xsl_path).document_tree

    # NOTE(robinson) - The table heading row plus one row for each of the four data items
    row_count = int(tree.xpath("count(//tr)"))
    assert row_count == 5
    # NOTE(robinson) - Four data elements x four attributes for each
    cell_count = int(tree.xpath("count(//td)"))
    assert cell_count == 16
73
74
def test_read_with_stylesheet_warns_with_html_parser(caplog):
    """Supplying a stylesheet together with an HTML parser should log a warning."""
    example_docs = os.path.join(FILEPATH, "..", "..", "example-docs")
    xml_path = os.path.join(example_docs, "factbook.xml")
    xsl_path = os.path.join(example_docs, "unsupported", "factbook.xsl")
    html_parser = etree.HTMLParser()

    XMLDocument.from_file(filename=xml_path, stylesheet=xsl_path, parser=html_parser)

    assert "WARNING" in caplog.text
81