unstructured
124 строки · 4.0 Кб
1import os2
3import pytest4
5from unstructured.partition import pdf, strategies6from unstructured.partition.utils.constants import PartitionStrategy7
8
@pytest.mark.parametrize(
    "strategy",
    (
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
        PartitionStrategy.HI_RES,
    ),
)
def test_validate_strategy(strategy):
    """Each listed strategy is accepted by ``validate_strategy`` without raising."""
    # The test passes simply by this call completing; there is nothing to assert.
    strategies.validate_strategy(strategy=strategy)
22
def test_validate_strategy_raises_for_fast_strategy():
    """Requesting FAST together with ``is_image=True`` raises ``ValueError``."""
    call_kwargs = {"strategy": PartitionStrategy.FAST, "is_image": True}
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        strategies.validate_strategy(**call_kwargs)
27
def test_validate_strategy_raises_for_bad_strategy():
    """A strategy string that is not a known strategy raises ``ValueError``."""
    unknown_strategy = "totally_guess_the_text"
    with pytest.raises(ValueError):
        # Passed positionally, as a caller supplying a raw string would.
        strategies.validate_strategy(unknown_strategy)
32
@pytest.mark.parametrize(
    "filename, from_file, expected",
    [
        # Same three documents, first passed as an open file object ...
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, True),
        ("loremipsum-flat.pdf", True, False),
        # ... then passed by filename.
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, True),
        ("loremipsum-flat.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check the truthiness of ``pdf.extractable_elements`` for each sample PDF.

    ``from_file`` selects whether the document is handed over as a binary file
    object (``file=``) or as a path (``filename=``); ``expected`` is the
    truthiness the returned value must have.
    """
    pdf_path = os.path.join("example-docs", filename)

    if not from_file:
        elements = pdf.extractable_elements(filename=pdf_path)
    else:
        with open(pdf_path, "rb") as stream:
            elements = pdf.extractable_elements(file=stream)

    has_extractable_text = bool(elements)
    assert has_extractable_text is expected
55
@pytest.mark.parametrize(
    "pdf_text_extractable, infer_table_structure",
    [
        (True, True),
        (False, True),
        (True, False),
        (False, False),
    ],
)
def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure):
    """An explicit FAST request comes back as FAST for every flag combination."""
    requested = PartitionStrategy.FAST
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=requested,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    assert resolved == requested
73
@pytest.mark.parametrize(
    (
        "pdf_text_extractable, infer_table_structure, extract_images_in_pdf, "
        "extract_image_block_types, expected"
    ),
    [
        # Text is extractable: only the row with every extra option off expects FAST.
        (True, True, True, ["Image"], PartitionStrategy.HI_RES),
        (True, True, True, [], PartitionStrategy.HI_RES),
        (True, True, False, ["Image"], PartitionStrategy.HI_RES),
        (True, True, False, [], PartitionStrategy.HI_RES),
        (True, False, True, ["Image"], PartitionStrategy.HI_RES),
        (True, False, True, [], PartitionStrategy.HI_RES),
        (True, False, False, ["Image"], PartitionStrategy.HI_RES),
        (True, False, False, [], PartitionStrategy.FAST),
        # Text is not extractable: the row with every extra option off expects OCR_ONLY.
        (False, True, True, ["Image"], PartitionStrategy.HI_RES),
        (False, True, True, [], PartitionStrategy.HI_RES),
        (False, True, False, ["Image"], PartitionStrategy.HI_RES),
        (False, True, False, [], PartitionStrategy.HI_RES),
        (False, False, True, ["Image"], PartitionStrategy.HI_RES),
        (False, False, True, [], PartitionStrategy.HI_RES),
        (False, False, False, ["Image"], PartitionStrategy.HI_RES),
        (False, False, False, [], PartitionStrategy.OCR_ONLY),
    ],
)
def test_determine_pdf_auto_strategy(
    pdf_text_extractable,
    infer_table_structure,
    extract_images_in_pdf,
    extract_image_block_types,
    expected,
):
    """AUTO on a PDF (``is_image=False``) resolves to the strategy listed per row."""
    call_kwargs = {
        "strategy": PartitionStrategy.AUTO,
        "is_image": False,
        "pdf_text_extractable": pdf_text_extractable,
        "infer_table_structure": infer_table_structure,
        "extract_images_in_pdf": extract_images_in_pdf,
        "extract_image_block_types": extract_image_block_types,
    }
    resolved = strategies.determine_pdf_or_image_strategy(**call_kwargs)
    assert resolved == expected
118
def test_determine_image_auto_strategy():
    """AUTO with ``is_image=True`` and no other options resolves to HI_RES."""
    resolved = strategies.determine_pdf_or_image_strategy(
        is_image=True,
        strategy=PartitionStrategy.AUTO,
    )
    assert resolved == PartitionStrategy.HI_RES