unstructured

Форк
0
97 строк · 3.2 Кб
1
import os
2
import pathlib
3

4
import pandas as pd
5
import pytest
6

7
from unstructured.file_utils import exploration
8
from unstructured.file_utils.filetype import FileType
9

10
# Absolute, resolved path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
11

12

13
# True when the "/.dockerenv" marker file exists; the skipif decorators below
# use this flag to skip tests when running inside a Docker container.
is_in_docker = os.path.exists("/.dockerenv")
14

15

16
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_directory_file_info(tmpdir):
    """Check ``exploration.get_directory_file_info`` on a small directory tree.

    Creates ``file_info_test/directory<N>/filename<N>.txt`` for N in (1, 2)
    under ``tmpdir``, then asserts that the call returns a ``pd.DataFrame``
    listing both file names, and that ``filesize`` is the only column left
    after a numeric-only mean grouped by ``filetype``.
    """
    file_info_test = os.path.join(tmpdir, "file_info_test")

    for index in (1, 2):
        directory = os.path.join(file_info_test, f"directory{index}")
        # makedirs also creates the missing parent and tolerates an
        # already-existing directory, so no exists() pre-check is needed.
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, f"filename{index}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write("hello there!")

    file_info = exploration.get_directory_file_info(file_info_test)
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]
44

45

46
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_file_info(tmpdir):
    """Check ``exploration.get_file_info`` on an explicit list of file paths.

    Creates ``file_info_test/directory<N>/filename<N>.txt`` for N in (1, 2)
    under ``tmpdir`` and passes the two file paths to the function. Asserts
    that the result is a ``pd.DataFrame`` listing both file names, and that
    ``filesize`` is the only column left after a numeric-only mean grouped
    by ``filetype``.
    """
    file_info_test = os.path.join(tmpdir, "file_info_test")

    filenames = []
    for index in (1, 2):
        directory = os.path.join(file_info_test, f"directory{index}")
        # makedirs also creates the missing parent and tolerates an
        # already-existing directory, so no exists() pre-check is needed.
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, f"filename{index}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write("hello there!")
        filenames.append(filename)

    file_info = exploration.get_file_info(filenames)
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]
74

75

76
def test_get_file_info_from_file_contents():
    """Check that file contents paired with an ``.eml`` name are typed as EML.

    Reads ``test-file-contents.txt`` from the directory of this test module,
    passes its text as the single entry of ``file_contents`` together with
    the filename ``"test.eml"``, and asserts that the first ``filetype``
    value of the result is ``FileType.EML``.
    """
    # DIRECTORY is a pathlib.Path, so the fixture can be read directly.
    file_contents = [(DIRECTORY / "test-file-contents.txt").read_text()]

    file_info = exploration.get_file_info_from_file_contents(
        file_contents=file_contents,
        filenames=["test.eml"],
    )
    assert file_info.filetype[0] == FileType.EML
86

87

88
def test_get_file_info_from_file_contents_raises_if_lists_no_equal():
    """Check that mismatched ``file_contents`` / ``filenames`` lengths raise.

    Passes one file's contents alongside two filenames and expects
    ``exploration.get_file_info_from_file_contents`` to raise ``ValueError``.
    """
    # DIRECTORY is a pathlib.Path, so the fixture can be read directly.
    file_contents = [(DIRECTORY / "test-file-contents.txt").read_text()]

    with pytest.raises(ValueError):
        exploration.get_file_info_from_file_contents(
            file_contents=file_contents,
            filenames=["test.eml", "test2.eml"],
        )
98

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.