unstructured
97 строк · 3.2 Кб
1import os2import pathlib3
4import pandas as pd5import pytest6
7from unstructured.file_utils import exploration8from unstructured.file_utils.filetype import FileType9
# Resolved path of the directory containing this test module; the
# fixture-reading tests below join fixture file names onto it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# True when the "/.dockerenv" marker file exists; consumed by the
# ``skipif`` decorators on the filesystem-based tests in this module.
is_in_docker = os.path.exists("/.dockerenv")
15
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_directory_file_info(tmpdir):
    """Scan a small directory tree and check the resulting file-info table.

    The test creates ``file_info_test/directory{1,2}/filename{1,2}.txt`` under
    ``tmpdir``, passes the root to ``exploration.get_directory_file_info`` and
    asserts that the result is a DataFrame whose ``filename`` column holds
    exactly the two file names, and that averaging the numeric columns per
    ``filetype`` leaves ``filesize`` as the only column.

    Skipped when ``is_in_docker`` is true.
    """
    root = os.path.join(tmpdir, "file_info_test")
    if not os.path.exists(root):
        os.mkdir(root)

    # One text file per sub-directory, created in the listed order.
    layout = (
        ("directory1", "filename1.txt"),
        ("directory2", "filename2.txt"),
    )
    for subdir_name, file_name in layout:
        subdir = os.path.join(root, subdir_name)
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        with open(os.path.join(subdir, file_name), "w") as handle:
            handle.write("hello there!")

    file_info = exploration.get_directory_file_info(root)
    assert isinstance(file_info, pd.DataFrame)

    reported_names = set(file_info["filename"].to_list())
    assert reported_names == {"filename1.txt", "filename2.txt"}

    # Only numeric columns survive the mean; the test expects just "filesize".
    per_type_means = file_info.groupby("filetype").mean(numeric_only=True)
    assert per_type_means.columns.to_list() == ["filesize"]
45
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_file_info(tmpdir):
    """Pass explicit file paths to ``get_file_info`` and check the table.

    Two text files are written under
    ``tmpdir/file_info_test/directory{1,2}/`` and their paths are handed, in
    creation order, to ``exploration.get_file_info``. The test asserts that
    the result is a DataFrame whose ``filename`` column holds exactly the two
    file names, and that averaging the numeric columns per ``filetype``
    leaves ``filesize`` as the only column.

    Skipped when ``is_in_docker`` is true.
    """
    root = os.path.join(tmpdir, "file_info_test")
    if not os.path.exists(root):
        os.mkdir(root)

    # Create each sub-directory and its file, remembering the full paths.
    written_paths = []
    for subdir_name, file_name in (
        ("directory1", "filename1.txt"),
        ("directory2", "filename2.txt"),
    ):
        subdir = os.path.join(root, subdir_name)
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        target = os.path.join(subdir, file_name)
        with open(target, "w") as handle:
            handle.write("hello there!")
        written_paths.append(target)

    file_info = exploration.get_file_info(written_paths)
    assert isinstance(file_info, pd.DataFrame)

    reported_names = set(file_info["filename"].to_list())
    assert reported_names == {"filename1.txt", "filename2.txt"}

    # Only numeric columns survive the mean; the test expects just "filesize".
    per_type_means = file_info.groupby("filetype").mean(numeric_only=True)
    assert per_type_means.columns.to_list() == ["filesize"]
75
def test_get_file_info_from_file_contents():
    """Check that the fixture contents are classified as an EML file.

    Reads ``test-file-contents.txt`` from the directory of this module,
    passes its text as the single entry of ``file_contents`` together with
    the filename ``test.eml``, and asserts that the first ``filetype`` value
    of the result is ``FileType.EML``.
    """
    # Text mode with the default encoding, as a plain ``open(path)`` would use.
    contents = (DIRECTORY / "test-file-contents.txt").read_text()

    file_info = exploration.get_file_info_from_file_contents(
        file_contents=[contents],
        filenames=["test.eml"],
    )
    assert file_info.filetype[0] == FileType.EML
87
def test_get_file_info_from_file_contents_raises_if_lists_no_equal():
    """Check that mismatched list lengths are rejected with ``ValueError``.

    Reads ``test-file-contents.txt`` from the directory of this module and
    calls ``exploration.get_file_info_from_file_contents`` with one
    ``file_contents`` entry but two ``filenames``; the call is expected to
    raise ``ValueError``.
    """
    # Text mode with the default encoding, as a plain ``open(path)`` would use.
    contents = (DIRECTORY / "test-file-contents.txt").read_text()

    # Two names for a single contents entry: deliberately unequal lengths.
    too_many_filenames = ["test.eml", "test2.eml"]

    with pytest.raises(ValueError):
        exploration.get_file_info_from_file_contents(
            file_contents=[contents],
            filenames=too_many_filenames,
        )