DataProcessingFramework

Форк
0
84 строки · 2.8 Кб
1
from typing import Optional, Union
2

3
from DPF.datatypes import ColumnDataType, ShardedDataType
4
from DPF.modalities import MODALITIES
5

6
from .sharded_config import ShardedDatasetConfig
7

8

9
class ShardsDatasetConfig(ShardedDatasetConfig):
    """Config for Shards dataset type"""

    def __init__(
        self,
        path: str,
        datatypes: list[Union[ShardedDataType, ColumnDataType]],
        archives_ext: str = "tar",
        datafiles_ext: str = "csv",
    ) -> None:
        """
        Parameters
        ----------
        path: str
            Path to directory with shards
        datatypes: list[Union[ShardedDataType, ColumnDataType]]
            List of datatypes in dataset
        archives_ext: str = "tar"
            Extension of archives in shards; leading dots are stripped,
            so "tar" and ".tar" are stored identically
        datafiles_ext: str = "csv"
            Extension of files with metadata in shards
        """
        super().__init__(path, datatypes, datafiles_ext)
        # Store the extension without leading dots.
        self.archives_ext = archives_ext.lstrip('.')

    @classmethod
    def from_path_and_columns(
        cls,
        path: str,
        image_name_col: Optional[str] = None,
        video_name_col: Optional[str] = None,
        text_col: Optional[str] = None,
        archives_ext: str = "tar",
        datafiles_ext: str = "csv",
    ) -> "ShardsDatasetConfig":
        """
        Build a config from column names instead of explicit datatypes.

        Parameters
        ----------
        path: str
            Path to directory with shards
        image_name_col: Optional[str] = None
            Name of column with image filenames in shard
        video_name_col: Optional[str] = None
            Name of column with video filenames in shard
        text_col: Optional[str] = None
            Name of column with text
        archives_ext: str = "tar"
            Extension of archives in shards
        datafiles_ext: str = "csv"
            Extension of files with metadata in shards

        Returns
        -------
        ShardsDatasetConfig
            Instance of itself

        Raises
        ------
        AssertionError
            If no column name is provided (``None`` and empty strings are
            both treated as "not provided")
        """
        datatypes: list[Union[ShardedDataType, ColumnDataType]] = []
        if image_name_col:
            datatypes.append(ShardedDataType(MODALITIES['image'], image_name_col))
        if video_name_col:
            datatypes.append(ShardedDataType(MODALITIES['video'], video_name_col))
        if text_col:
            datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
        # Raised explicitly rather than via the `assert` statement so that the
        # check is not stripped when Python runs with -O; the exception type
        # and message are unchanged for existing callers.
        if not datatypes:
            raise AssertionError("At least one modality should be provided")
        return cls(path, datatypes, archives_ext=archives_ext, datafiles_ext=datafiles_ext)

    def __repr__(self) -> str:
        """Return a multi-line, tab-indented description of the config."""
        datatype_lines = '\n\t\t'.join(str(datatype) for datatype in self.datatypes)
        return (
            "ShardsDatasetConfig(\n\t"
            f'path="{self.path}",\n\t'
            f'archives_ext="{self.archives_ext}",\n\t'
            f'datafiles_ext="{self.datafiles_ext}",\n\t'
            'datatypes=[\n\t\t'
            f'{datatype_lines}'
            '\n\t]'
            '\n)'
        )
85

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.