DataProcessingFramework
84 строки · 2.8 Кб
1from typing import Optional, Union
2
3from DPF.datatypes import ColumnDataType, ShardedDataType
4from DPF.modalities import MODALITIES
5
6from .sharded_config import ShardedDatasetConfig
7
8
class ShardsDatasetConfig(ShardedDatasetConfig):
    """Config for Shards dataset type"""

    def __init__(
        self,
        path: str,
        datatypes: list[Union[ShardedDataType, ColumnDataType]],
        archives_ext: str = "tar",
        datafiles_ext: str = "csv",
    ) -> None:
        """
        Parameters
        ----------
        path: str
            Path to directory with shards
        datatypes: list[Union[ShardedDataType, ColumnDataType]]
            List of datatypes in dataset
        archives_ext: str = "tar"
            Extension of archives in shards. Leading dots are stripped,
            so ".tar" and "tar" are stored identically
        datafiles_ext: str = "csv"
            Extension of files with metadata in shards
        """
        # path, datatypes and datafiles_ext are handled by the parent config.
        super().__init__(path, datatypes, datafiles_ext)
        # Store the extension without leading dots.
        self.archives_ext = archives_ext.lstrip('.')

    @classmethod
    def from_path_and_columns(
        cls,
        path: str,
        image_name_col: Optional[str] = None,
        video_name_col: Optional[str] = None,
        text_col: Optional[str] = None,
        archives_ext: str = "tar",
        datafiles_ext: str = "csv",
    ) -> "ShardsDatasetConfig":
        """
        Build a config from column names instead of ready-made datatypes.

        Parameters
        ----------
        path: str
            Path to directory with shards
        image_name_col: Optional[str] = None
            Name of column with image filenames in shard
        video_name_col: Optional[str] = None
            Name of column with video filenames in shard
        text_col: Optional[str] = None
            Name of column with text
        archives_ext: str = "tar"
            Extension of archives in shards
        datafiles_ext: str = "csv"
            Extension of files with metadata in shards

        Returns
        -------
        ShardsDatasetConfig
            Instance of itself

        Raises
        ------
        AssertionError
            If none of image_name_col, video_name_col and text_col is given
            (None and empty strings both count as "not given")
        """
        datatypes: list[Union[ShardedDataType, ColumnDataType]] = []
        if image_name_col:
            datatypes.append(ShardedDataType(MODALITIES['image'], image_name_col))
        if video_name_col:
            datatypes.append(ShardedDataType(MODALITIES['video'], video_name_col))
        if text_col:
            datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
        if not datatypes:
            # Raised explicitly instead of via `assert` so that the check is
            # not stripped when Python runs with -O. The exception type and
            # message are kept for backward compatibility with callers.
            raise AssertionError("At least one modality should be provided")
        return cls(path, datatypes, archives_ext=archives_ext, datafiles_ext=datafiles_ext)

    def __repr__(self) -> str:
        """
        Return a multi-line, tab-indented description of the config.

        Reads self.path, self.datafiles_ext and self.datatypes, which are
        presumably set by the parent __init__ (not visible here - confirm).
        """
        datatypes_repr = '\n\t\t'.join(str(datatype) for datatype in self.datatypes)
        # type(self).__name__ keeps the output correct for subclasses.
        return (
            f'{type(self).__name__}(\n\t'
            f'path="{self.path}",\n\t'
            f'archives_ext="{self.archives_ext}",\n\t'
            f'datafiles_ext="{self.datafiles_ext}",\n\t'
            'datatypes=[\n\t\t'
            f'{datatypes_repr}'
            '\n\t]'
            '\n)'
        )
85