import re
import sys
import tempfile
import unittest
from pathlib import Path

import pytest
import yaml
from huggingface_hub import DatasetCard, DatasetCardData

from datasets.config import METADATA_CONFIGS_FIELD
from datasets.info import DatasetInfo
from datasets.utils.metadata import MetadataConfigs


def _dedent(string: str) -> str:
    # Strip the smallest common leading indentation from every line;
    # lines no longer than the indent (e.g. blank lines) are dropped.
    indent_level = min(re.search("^ +", t).end() if t.startswith(" ") else 0 for t in string.splitlines())
    return "\n".join([line[indent_level:] for line in string.splitlines() if indent_level < len(line)])
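
# README fixtures: a card with valid YAML front matter, one with an empty
# front matter block, one with no front matter at all, plus cards carrying
# configs metadata in valid and invalid formats.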
README_YAML = """\
---
language:
- zh
- en
task_ids:
- sentiment-classification
---
# Begin of markdown

Some cool dataset card
"""

README_EMPTY_YAML = """\
---
---
# Begin of markdown

Some cool dataset card
"""

README_NO_YAML = """\
# Begin of markdown

Some cool dataset card
"""

README_METADATA_CONFIG_INCORRECT_FORMAT = f"""\
---
{METADATA_CONFIGS_FIELD}:
  data_dir: v1
  drop_labels: true
---
"""

README_METADATA_SINGLE_CONFIG = f"""\
---
{METADATA_CONFIGS_FIELD}:
- config_name: custom
  data_dir: v1
  drop_labels: true
---
"""

README_METADATA_TWO_CONFIGS_WITH_DEFAULT_FLAG = f"""\
---
{METADATA_CONFIGS_FIELD}:
- config_name: v1
  data_dir: v1
  drop_labels: true
- config_name: v2
  data_dir: v2
  drop_labels: false
  default: true
---
"""

README_METADATA_TWO_CONFIGS_WITH_DEFAULT_NAME = f"""\
---
{METADATA_CONFIGS_FIELD}:
- config_name: custom
  data_dir: custom
  drop_labels: true
- config_name: default
  data_dir: data
  drop_labels: false
---
"""

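# Expected parsing results: MetadataConfigs maps each config_name to its
# remaining parameters. The default config is either the one flagged with
# `default: true` or the one literally named "default".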
EXPECTED_METADATA_SINGLE_CONFIG = {"custom": {"data_dir": "v1", "drop_labels": True}}
EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_FLAG = {
    "v1": {"data_dir": "v1", "drop_labels": True},
    "v2": {"data_dir": "v2", "drop_labels": False, "default": True},
}
EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_NAME = {
    "custom": {"data_dir": "custom", "drop_labels": True},
    "default": {"data_dir": "data", "drop_labels": False},
}

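# Fixture providing an image-folder-style directory with two subdirectories,
# for tests that resolve metadata configs against on-disk data.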
@pytest.fixture
def data_dir_with_two_subdirs(tmp_path):
    data_dir = tmp_path / "data_dir_with_two_configs_in_metadata"
    cats_data_dir = data_dir / "cats"
    cats_data_dir.mkdir(parents=True)
    dogs_data_dir = data_dir / "dogs"
    dogs_data_dir.mkdir(parents=True)

    with open(cats_data_dir / "cat.jpg", "wb") as f:
        f.write(b"this_is_a_cat_image_bytes")
    with open(dogs_data_dir / "dog.jpg", "wb") as f:
        f.write(b"this_is_a_dog_image_bytes")

    return str(data_dir)

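# Parsing of dataset card (README.md) YAML front matter into card data.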
class TestMetadataUtils(unittest.TestCase):
    def test_metadata_dict_from_readme(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = Path(tmp_dir) / "README.md"
            with open(path, "w+") as readme_file:
                readme_file.write(README_YAML)
            dataset_card_data = DatasetCard.load(path).data
            self.assertDictEqual(
                dataset_card_data.to_dict(), {"language": ["zh", "en"], "task_ids": ["sentiment-classification"]}
            )

            with open(path, "w+") as readme_file:
                readme_file.write(README_EMPTY_YAML)
            if (
                sys.platform != "win32"
            ):  # loading a card with an empty YAML block misbehaves on Windows (upstream huggingface_hub bug)
                dataset_card_data = DatasetCard.load(path).data
                self.assertDictEqual(dataset_card_data.to_dict(), {})

            with open(path, "w+") as readme_file:
                readme_file.write(README_NO_YAML)
            dataset_card_data = DatasetCard.load(path).data
            self.assertEqual(dataset_card_data.to_dict(), {})

    def test_from_yaml_string(self):
        valid_yaml_string = _dedent(
            """\
            annotations_creators:
            - found
            language_creators:
            - found
            language:
            - en
            license:
            - unknown
            multilinguality:
            - monolingual
            pretty_name: Test Dataset
            size_categories:
            - 10K<n<100K
            source_datasets:
            - extended|other-yahoo-webscope-l6
            task_categories:
            - question-answering
            task_ids:
            - open-domain-qa
            """
        )
        assert DatasetCardData(**yaml.safe_load(valid_yaml_string)).to_dict()

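        # The same card data should also validate when optional keys such as
        # train-eval-index and the gated-access fields are present.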
        valid_yaml_with_optional_keys = _dedent(
            """\
            annotations_creators:
            - found
            language_creators:
            - found
            language:
            - en
            license:
            - unknown
            multilinguality:
            - monolingual
            pretty_name: Test Dataset
            size_categories:
            - 10K<n<100K
            source_datasets:
            - extended|other-yahoo-webscope-l6
            task_categories:
            - text-classification
            task_ids:
            - multi-class-classification
            paperswithcode_id:
            - squad
            configs:
            - en
            train-eval-index:
            - config: en
              task: text-classification
              task_id: multi_class_classification
              splits:
                train_split: train
                eval_split: test
              col_mapping:
                text: text
                label: target
              metrics:
                - type: accuracy
                  name: Accuracy
            extra_gated_prompt: |
              By clicking on “Access repository” below, you also agree to ImageNet Terms of Access:
              [RESEARCHER_FULLNAME] (the "Researcher") has requested permission to use the ImageNet database (the "Database") at Princeton University and Stanford University. In exchange for such permission, Researcher hereby agrees to the following terms and conditions:
              1. Researcher shall use the Database only for non-commercial research and educational purposes.
            extra_gated_fields:
              Company: text
              Country: text
              I agree to use this model for non-commerical use ONLY: checkbox
            """
        )
        assert DatasetCardData(**yaml.safe_load(valid_yaml_with_optional_keys)).to_dict()

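# One parametrized case per README fixture: a single config, two configs with a
# `default: true` flag, and two configs where one is literally named "default".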
@pytest.mark.parametrize(
    "readme_content, expected_metadata_configs_dict, expected_default_config_name",
    [
        (README_METADATA_SINGLE_CONFIG, EXPECTED_METADATA_SINGLE_CONFIG, "custom"),
        (README_METADATA_TWO_CONFIGS_WITH_DEFAULT_FLAG, EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_FLAG, "v2"),
        (README_METADATA_TWO_CONFIGS_WITH_DEFAULT_NAME, EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_NAME, "default"),
    ],
)
def test_metadata_configs_dataset_card_data(
    readme_content, expected_metadata_configs_dict, expected_default_config_name
):
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "README.md"
        with open(path, "w+") as readme_file:
            readme_file.write(readme_content)
        dataset_card_data = DatasetCard.load(path).data
        metadata_configs_dict = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        assert metadata_configs_dict == expected_metadata_configs_dict
        assert metadata_configs_dict.get_default_config_name() == expected_default_config_name

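# A mapping (rather than a list of per-config mappings) under the configs field
# is malformed and must be rejected with a ValueError.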
def test_metadata_configs_incorrect_yaml():
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "README.md"
        with open(path, "w+") as readme_file:
            readme_file.write(README_METADATA_CONFIG_INCORRECT_FORMAT)
        dataset_card_data = DatasetCard.load(path).data
        with pytest.raises(ValueError):
            _ = MetadataConfigs.from_dataset_card_data(dataset_card_data)

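# The Hub lists exported parquet files in alphabetical split order (test, train,
# validation); the rebuilt config must instead follow the split order declared
# in the dataset info (train, validation, test).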
def test_split_order_in_metadata_configs_from_exported_parquet_files_and_dataset_infos():
    exported_parquet_files = [
        {
            "dataset": "beans",
            "config": "default",
            "split": "test",
            "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet",
            "filename": "0000.parquet",
        },
        {
            "dataset": "beans",
            "config": "default",
            "split": "train",
            "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet",
            "filename": "0000.parquet",
        },
        {
            "dataset": "beans",
            "config": "default",
            "split": "validation",
            "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/validation/0000.parquet",
            "filename": "0000.parquet",
        },
    ]
    dataset_infos = {
        "default": DatasetInfo(
            dataset_name="beans",
            config_name="default",
            version="0.0.0",
            splits={
                "train": {
                    "name": "train",
                    "num_bytes": 143996486,
                    "num_examples": 1034,
                    "shard_lengths": None,
                    "dataset_name": "beans",
                },
                "validation": {
                    "name": "validation",
                    "num_bytes": 18525985,
                    "num_examples": 133,
                    "shard_lengths": None,
                    "dataset_name": "beans",
                },
                "test": {
                    "name": "test",
                    "num_bytes": 17730506,
                    "num_examples": 128,
                    "shard_lengths": None,
                    "dataset_name": "beans",
                },
            },
            download_checksums={
                "https://huggingface.co/datasets/beans/resolve/main/data/train.zip": {
                    "num_bytes": 143812152,
                    "checksum": None,
                },
                "https://huggingface.co/datasets/beans/resolve/main/data/validation.zip": {
                    "num_bytes": 18504213,
                    "checksum": None,
                },
                "https://huggingface.co/datasets/beans/resolve/main/data/test.zip": {
                    "num_bytes": 17708541,
                    "checksum": None,
                },
            },
            download_size=180024906,
            post_processing_size=None,
            dataset_size=180252977,
            size_in_bytes=360277883,
        ),
    }
    metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos(
        "123", exported_parquet_files, dataset_infos
    )
    split_names = [data_file["split"] for data_file in metadata_configs["default"]["data_files"]]
    assert split_names == ["train", "validation", "test"]
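
# A minimal round-trip sketch (not part of the original suite), assuming
# MetadataConfigs.to_dataset_card_data writes configs back into a
# DatasetCardData in place, mirroring from_dataset_card_data above.
def test_metadata_configs_round_trip_dataset_card_data():
    metadata_configs = MetadataConfigs({"custom": {"data_dir": "v1", "drop_labels": True}})
    dataset_card_data = DatasetCardData()
    # Write the configs into the card data, then parse them back out.
    metadata_configs.to_dataset_card_data(dataset_card_data)
    assert MetadataConfigs.from_dataset_card_data(dataset_card_data) == metadata_configs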