unstructured

Форк
0
124 строки · 4.0 Кб
1
import os
2

3
import pytest
4

5
from unstructured.partition import pdf, strategies
6
from unstructured.partition.utils.constants import PartitionStrategy
7

8

9
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
        PartitionStrategy.HI_RES,
    ],
)
def test_validate_strategy(strategy):
    """Each listed strategy passes validation without raising."""
    # The call itself is the assertion: any exception fails the test.
    strategies.validate_strategy(strategy=strategy)
21

22

23
def test_validate_strategy_raises_for_fast_strategy():
    """Requesting FAST together with ``is_image=True`` raises ``ValueError``."""
    call_kwargs = {"strategy": PartitionStrategy.FAST, "is_image": True}
    with pytest.raises(ValueError):
        strategies.validate_strategy(**call_kwargs)
26

27

28
def test_validate_strategy_raises_for_bad_strategy():
    """A strategy value that is not one of the known ones raises ``ValueError``."""
    unknown_strategy = "totally_guess_the_text"
    with pytest.raises(ValueError):
        # Passed positionally on purpose, mirroring how a caller might do it.
        strategies.validate_strategy(unknown_strategy)
31

32

33
@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, True),
        ("loremipsum-flat.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, True),
        ("loremipsum-flat.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check the truthiness of ``pdf.extractable_elements`` for sample PDFs.

    Each document is exercised twice: once through an open binary file
    object (``from_file=True``) and once through its path.
    """
    # Path is relative to the current working directory.
    doc_path = os.path.join("example-docs", filename)

    if not from_file:
        extractable = pdf.extractable_elements(filename=doc_path)
    else:
        with open(doc_path, "rb") as stream:
            extractable = pdf.extractable_elements(file=stream)

    # Evaluated only after the file handle (if any) has been closed.
    assert bool(extractable) is expected
54

55

56
@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure"),
    [
        (True, True),
        (False, True),
        (True, False),
        (False, False),
    ],
)
def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure):
    """An explicit FAST request comes back as FAST for every flag combination."""
    requested = PartitionStrategy.FAST
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=requested,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    assert resolved == requested
72

73

74
@pytest.mark.parametrize(
    (
        "pdf_text_extractable",
        "infer_table_structure",
        "extract_images_in_pdf",
        "extract_image_block_types",
        "expected",
    ),
    [
        # -- PDFs whose text is extractable --
        (True, True, True, ["Image"], PartitionStrategy.HI_RES),
        (True, True, True, [], PartitionStrategy.HI_RES),
        (True, True, False, ["Image"], PartitionStrategy.HI_RES),
        (True, True, False, [], PartitionStrategy.HI_RES),
        (True, False, True, ["Image"], PartitionStrategy.HI_RES),
        (True, False, True, [], PartitionStrategy.HI_RES),
        (True, False, False, ["Image"], PartitionStrategy.HI_RES),
        (True, False, False, [], PartitionStrategy.FAST),
        # -- PDFs whose text is not extractable --
        (False, True, True, ["Image"], PartitionStrategy.HI_RES),
        (False, True, True, [], PartitionStrategy.HI_RES),
        (False, True, False, ["Image"], PartitionStrategy.HI_RES),
        (False, True, False, [], PartitionStrategy.HI_RES),
        (False, False, True, ["Image"], PartitionStrategy.HI_RES),
        (False, False, True, [], PartitionStrategy.HI_RES),
        (False, False, False, ["Image"], PartitionStrategy.HI_RES),
        (False, False, False, [], PartitionStrategy.OCR_ONLY),
    ],
)
def test_determine_pdf_auto_strategy(
    pdf_text_extractable,
    infer_table_structure,
    extract_images_in_pdf,
    extract_image_block_types,
    expected,
):
    """Check how AUTO is resolved for a PDF under every option combination.

    Per the case table, HI_RES is expected whenever table inference, image
    extraction, or a non-empty block-type list is requested. With none of
    those, FAST is expected for text-extractable PDFs and OCR_ONLY otherwise.
    """
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=PartitionStrategy.AUTO,
        is_image=False,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
        extract_images_in_pdf=extract_images_in_pdf,
        extract_image_block_types=extract_image_block_types,
    )
    assert resolved == expected
117

118

119
def test_determine_image_auto_strategy():
    """AUTO resolves to HI_RES when the input is flagged as an image."""
    resolved = strategies.determine_pdf_or_image_strategy(
        is_image=True,
        strategy=PartitionStrategy.AUTO,
    )
    assert resolved == PartitionStrategy.HI_RES
125

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.