from datasets.features.features import Features, Value
from datasets.info import DatasetInfo, DatasetInfosDict

@pytest.mark.parametrize(
    "files",
    [
        ["full:README.md", "dataset_infos.json"],
        ["empty:README.md", "dataset_infos.json"],
        ["dataset_infos.json"],
    ],
)
def test_from_dir(files, tmp_path_factory):
    """DatasetInfosDict.from_directory reads dataset info either from the YAML
    front matter of README.md or from the legacy dataset_infos.json file.

    Each parametrized case writes a different combination of files into a temp
    directory, then checks the loaded "default" config reports dataset_size 42.
    """
    dataset_infos_dir = tmp_path_factory.mktemp("dset_infos_dir")
    if "full:README.md" in files:
        with open(dataset_infos_dir / "README.md", "w") as f:
            f.write("---\ndataset_info:\n dataset_size: 42\n---")
    if "empty:README.md" in files:
        with open(dataset_infos_dir / "README.md", "w") as f:
            # NOTE(review): this line was lost in extraction — the "empty" case
            # writes a README with no content so the info must come from
            # dataset_infos.json instead. Confirm against upstream.
            f.write("")
    # dataset_infos.json is the legacy format, still supported for backward compatibility
    if "dataset_infos.json" in files:
        with open(dataset_infos_dir / "dataset_infos.json", "w") as f:
            f.write('{"default": {"dataset_size": 42}}')
    dataset_infos = DatasetInfosDict.from_directory(dataset_infos_dir)
    assert dataset_infos
    assert dataset_infos["default"].dataset_size == 42
@pytest.mark.parametrize(
    "dataset_info",
    # NOTE(review): this parametrize list was truncated in extraction; the empty
    # DatasetInfo() case and the surrounding list/kwargs were reconstructed from
    # the surviving fragments (features=..., builder_name=..., splits=...).
    # Confirm against upstream.
    [
        DatasetInfo(),
        DatasetInfo(
            description="foo",
            features=Features({"a": Value("int32")}),
            builder_name="builder",
            config_name="config",
            version="1.0.0",
            splits=[{"name": "train"}],
            download_size=42,
        ),
    ],
)
def test_dataset_info_dump_and_reload(tmp_path, dataset_info: DatasetInfo):
    """A DatasetInfo written to a directory and read back compares equal,
    and the write produces a dataset_info.json file."""
    tmp_path = str(tmp_path)
    dataset_info.write_to_directory(tmp_path)
    reloaded = DatasetInfo.from_directory(tmp_path)
    assert dataset_info == reloaded
    assert os.path.exists(os.path.join(tmp_path, "dataset_info.json"))
def test_dataset_info_to_yaml_dict():
    """_to_yaml_dict exposes exactly the keys in _INCLUDED_INFO_IN_YAML, with
    YAML-serializable values, and the dict round-trips through safe_dump/safe_load.
    """
    # NOTE(review): several keyword arguments of this constructor call were lost
    # in extraction and have been reconstructed (description, citation, license,
    # config_name, version, download_size, dataset_size). The surviving fragments
    # (homepage, features, builder_name, splits, download_checksums,
    # post_processing_size, size_in_bytes=1337 + 442 + 1234) are kept verbatim.
    # Confirm against upstream.
    dataset_info = DatasetInfo(
        description="foo",
        citation="bar",
        homepage="https://foo.bar",
        license="CC0",
        features=Features({"a": Value("int32")}),
        builder_name="builder",
        config_name="config",
        version="1.0.0",
        splits=[{"name": "train", "num_examples": 42}],
        download_checksums={},
        download_size=1337,
        post_processing_size=442,
        dataset_size=1234,
        size_in_bytes=1337 + 442 + 1234,
    )
    dataset_info_yaml_dict = dataset_info._to_yaml_dict()
    # Key set must match the whitelist exactly.
    assert sorted(dataset_info_yaml_dict) == sorted(DatasetInfo._INCLUDED_INFO_IN_YAML)
    for key in DatasetInfo._INCLUDED_INFO_IN_YAML:
        assert key in dataset_info_yaml_dict
        # Only plain YAML-native types are allowed in the dump.
        assert isinstance(dataset_info_yaml_dict[key], (list, dict, int, str))
    dataset_info_yaml = yaml.safe_dump(dataset_info_yaml_dict)
    reloaded = yaml.safe_load(dataset_info_yaml)
    assert dataset_info_yaml_dict == reloaded
def test_dataset_info_to_yaml_dict_empty():
    """A default-constructed DatasetInfo yields an empty YAML dict (no keys are
    emitted for unset fields)."""
    dataset_info = DatasetInfo()
    dataset_info_yaml_dict = dataset_info._to_yaml_dict()
    assert dataset_info_yaml_dict == {}
@pytest.mark.parametrize(
    "dataset_infos_dict",
    # NOTE(review): this parametrize list was truncated in extraction; the
    # empty-dict case and the DatasetInfosDict({...}) wrappers around the last
    # two cases were reconstructed from the surviving fragments. Confirm against
    # upstream.
    [
        DatasetInfosDict(),
        DatasetInfosDict({"default": DatasetInfo()}),
        DatasetInfosDict({"my_config_name": DatasetInfo()}),
        DatasetInfosDict(
            {
                "default": DatasetInfo(
                    description="foo",
                    features=Features({"a": Value("int32")}),
                    builder_name="builder",
                    config_name="config",
                    version="1.0.0",
                    splits=[{"name": "train"}],
                )
            }
        ),
        DatasetInfosDict(
            {
                "v1": DatasetInfo(dataset_size=42),
                "v2": DatasetInfo(dataset_size=1337),
            }
        ),
    ],
)
def test_dataset_infos_dict_dump_and_reload(tmp_path, dataset_infos_dict: DatasetInfosDict):
    """Writing a DatasetInfosDict to a directory and reloading it round-trips,
    once each entry is normalized the way the dump normalizes it."""
    tmp_path = str(tmp_path)
    dataset_infos_dict.write_to_directory(tmp_path)
    reloaded = DatasetInfosDict.from_directory(tmp_path)

    # Normalize the expected dict to match what a dump/reload produces:
    for config_name, dataset_info in dataset_infos_dict.items():
        # the config name becomes part of the stored info on write
        dataset_info.config_name = config_name
        # only the YAML-included fields survive the round trip, so compare
        # against the YAML projection of each entry
        dataset_infos_dict[config_name] = DatasetInfo._from_yaml_dict(dataset_info._to_yaml_dict())
    assert dataset_infos_dict == reloaded

    # A non-empty dict is persisted as a README.md (YAML front matter).
    if dataset_infos_dict:
        assert os.path.exists(os.path.join(tmp_path, "README.md"))
@pytest.mark.parametrize(
    "dataset_info",
    # NOTE(review): this parametrize list was truncated in extraction; the None
    # case, the empty DatasetInfo() case, and the surrounding list were
    # reconstructed from the surviving kwargs fragments. Confirm against
    # upstream.
    [
        None,
        DatasetInfo(),
        DatasetInfo(
            description="foo",
            features=Features({"a": Value("int32")}),
            builder_name="builder",
            config_name="config",
            version="1.0.0",
            splits=[{"name": "train"}],
            download_size=42,
            dataset_name="dataset_name",
        ),
    ],
)
def test_from_merge_same_dataset_infos(dataset_info):
    """Merging N copies of the same DatasetInfo yields an equal DatasetInfo;
    merging N Nones yields a default DatasetInfo."""
    # NOTE(review): the original assignment line was lost in extraction; the body
    # clearly uses a fixed element count — confirm the exact value upstream.
    num_elements = 3
    if dataset_info is not None:
        dataset_info_list = [dataset_info.copy() for _ in range(num_elements)]
    else:
        dataset_info_list = [None] * num_elements

    dataset_info_merged = DatasetInfo.from_merge(dataset_info_list)
    if dataset_info is not None:
        assert dataset_info == dataset_info_merged
    else:
        assert DatasetInfo() == dataset_info_merged