# test_metadata_util.py
import re
import sys
import tempfile
import unittest
from pathlib import Path

import pytest
import yaml
from huggingface_hub import DatasetCard, DatasetCardData

from datasets.config import METADATA_CONFIGS_FIELD
from datasets.info import DatasetInfo
from datasets.utils.metadata import MetadataConfigs


def _dedent(string: str) -> str:
    indent_level = min(re.search("^ +", t).end() if t.startswith(" ") else 0 for t in string.splitlines())
    return "\n".join([line[indent_level:] for line in string.splitlines() if indent_level < len(line)])
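

# `_dedent` strips the common leading indentation from a triple-quoted string,
# similar in spirit to `textwrap.dedent`. A minimal sketch of its behavior:
#
#     _dedent("    a:\n    - b")  ->  "a:\n- b"
#
# Two caveats worth knowing when editing the fixtures in this file: a fully
# blank line forces `indent_level` to 0 (so nothing gets dedented), and lines
# no longer than the common indent are dropped from the output.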
README_YAML = """\
---
language:
- zh
- en
task_ids:
- sentiment-classification
---
# Beginning of markdown

Some cool dataset card
"""

README_EMPTY_YAML = """\
---
---
# Beginning of markdown

Some cool dataset card
"""


README_NO_YAML = """\
# Beginning of markdown

Some cool dataset card
"""


README_METADATA_CONFIG_INCORRECT_FORMAT = f"""\
---
{METADATA_CONFIGS_FIELD}:
  data_dir: v1
  drop_labels: true
---
"""


README_METADATA_SINGLE_CONFIG = f"""\
---
{METADATA_CONFIGS_FIELD}:
  - config_name: custom
    data_dir: v1
    drop_labels: true
---
"""


README_METADATA_TWO_CONFIGS_WITH_DEFAULT_FLAG = f"""\
---
{METADATA_CONFIGS_FIELD}:
  - config_name: v1
    data_dir: v1
    drop_labels: true
  - config_name: v2
    data_dir: v2
    drop_labels: false
    default: true
---
"""


README_METADATA_TWO_CONFIGS_WITH_DEFAULT_NAME = f"""\
---
{METADATA_CONFIGS_FIELD}:
  - config_name: custom
    data_dir: custom
    drop_labels: true
  - config_name: default
    data_dir: data
    drop_labels: false
---
"""


EXPECTED_METADATA_SINGLE_CONFIG = {"custom": {"data_dir": "v1", "drop_labels": True}}
EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_FLAG = {
    "v1": {"data_dir": "v1", "drop_labels": True},
    "v2": {"data_dir": "v2", "drop_labels": False, "default": True},
}
EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_NAME = {
    "custom": {"data_dir": "custom", "drop_labels": True},
    "default": {"data_dir": "data", "drop_labels": False},
}
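
# Default-config resolution, as exercised by the parametrized test below:
# a config is treated as the default when it sets `default: true` (v2), when
# it is literally named "default", or when it is the only config ("custom").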


@pytest.fixture
def data_dir_with_two_subdirs(tmp_path):
    data_dir = tmp_path / "data_dir_with_two_configs_in_metadata"
    cats_data_dir = data_dir / "cats"
    cats_data_dir.mkdir(parents=True)
    dogs_data_dir = data_dir / "dogs"
    dogs_data_dir.mkdir(parents=True)

    with open(cats_data_dir / "cat.jpg", "wb") as f:
        f.write(b"this_is_a_cat_image_bytes")
    with open(dogs_data_dir / "dog.jpg", "wb") as f:
        f.write(b"this_is_a_dog_image_bytes")

    return str(data_dir)
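
# Note: the .jpg files written by the fixture above hold placeholder bytes,
# not real image data; tests only need the two-subdirectory layout.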


class TestMetadataUtils(unittest.TestCase):
    def test_metadata_dict_from_readme(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = Path(tmp_dir) / "README.md"
            with open(path, "w+") as readme_file:
                readme_file.write(README_YAML)
            dataset_card_data = DatasetCard.load(path).data
            self.assertDictEqual(
                dataset_card_data.to_dict(), {"language": ["zh", "en"], "task_ids": ["sentiment-classification"]}
            )

            with open(path, "w+") as readme_file:
                readme_file.write(README_EMPTY_YAML)
            if (
                sys.platform != "win32"
            ):  # there is a bug on windows, see https://github.com/huggingface/huggingface_hub/issues/1546
                dataset_card_data = DatasetCard.load(path).data
                self.assertDictEqual(dataset_card_data.to_dict(), {})

            with open(path, "w+") as readme_file:
                readme_file.write(README_NO_YAML)
            dataset_card_data = DatasetCard.load(path).data
            self.assertEqual(dataset_card_data.to_dict(), {})
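            # Both an empty `---`/`---` block and a README with no front
            # matter at all parse to an empty metadata dict.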

    def test_from_yaml_string(self):
        valid_yaml_string = _dedent(
            """\
            annotations_creators:
            - found
            language_creators:
            - found
            language:
            - en
            license:
            - unknown
            multilinguality:
            - monolingual
            pretty_name: Test Dataset
            size_categories:
            - 10K<n<100K
            source_datasets:
            - extended|other-yahoo-webscope-l6
            task_categories:
            - question-answering
            task_ids:
            - open-domain-qa
            """
        )
        assert DatasetCardData(**yaml.safe_load(valid_yaml_string)).to_dict()

        valid_yaml_with_optional_keys = _dedent(
            """\
            annotations_creators:
            - found
            language_creators:
            - found
            language:
            - en
            license:
            - unknown
            multilinguality:
            - monolingual
            pretty_name: Test Dataset
            size_categories:
            - 10K<n<100K
            source_datasets:
            - extended|other-yahoo-webscope-l6
            task_categories:
            - text-classification
            task_ids:
            - multi-class-classification
            paperswithcode_id:
            - squad
            configs:
            - en
            train-eval-index:
            - config: en
              task: text-classification
              task_id: multi_class_classification
              splits:
                train_split: train
                eval_split: test
              col_mapping:
                text: text
                label: target
              metrics:
                - type: accuracy
                  name: Accuracy
            extra_gated_prompt: |
              By clicking on “Access repository” below, you also agree to ImageNet Terms of Access:
              [RESEARCHER_FULLNAME] (the "Researcher") has requested permission to use the ImageNet database (the "Database") at Princeton University and Stanford University. In exchange for such permission, Researcher hereby agrees to the following terms and conditions:
              1. Researcher shall use the Database only for non-commercial research and educational purposes.
            extra_gated_fields:
              Company: text
              Country: text
              I agree to use this model for non-commercial use ONLY: checkbox
            """
        )
        assert DatasetCardData(**yaml.safe_load(valid_yaml_with_optional_keys)).to_dict()
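
        # DatasetCardData also accepts keys beyond the validated core fields
        # (configs, train-eval-index, extra_gated_*) and keeps them, so
        # to_dict() is truthy for both payloads above.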


@pytest.mark.parametrize(
    "readme_content, expected_metadata_configs_dict, expected_default_config_name",
    [
        (README_METADATA_SINGLE_CONFIG, EXPECTED_METADATA_SINGLE_CONFIG, "custom"),
        (README_METADATA_TWO_CONFIGS_WITH_DEFAULT_FLAG, EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_FLAG, "v2"),
        (README_METADATA_TWO_CONFIGS_WITH_DEFAULT_NAME, EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_NAME, "default"),
    ],
)
def test_metadata_configs_dataset_card_data(
    readme_content, expected_metadata_configs_dict, expected_default_config_name
):
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "README.md"
        with open(path, "w+") as readme_file:
            readme_file.write(readme_content)
        dataset_card_data = DatasetCard.load(path).data
        metadata_configs_dict = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        assert metadata_configs_dict == expected_metadata_configs_dict
        assert metadata_configs_dict.get_default_config_name() == expected_default_config_name
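
        # MetadataConfigs is a mapping keyed by config_name, which is why it
        # can be compared directly against the plain EXPECTED_* dicts.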


def test_metadata_configs_incorrect_yaml():
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "README.md"
        with open(path, "w+") as readme_file:
            readme_file.write(README_METADATA_CONFIG_INCORRECT_FORMAT)
        dataset_card_data = DatasetCard.load(path).data
        with pytest.raises(ValueError):
            _ = MetadataConfigs.from_dataset_card_data(dataset_card_data)


def test_split_order_in_metadata_configs_from_exported_parquet_files_and_dataset_infos():
    exported_parquet_files = [
        {
            "dataset": "beans",
            "config": "default",
            "split": "test",
            "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet",
            "filename": "0000.parquet",
            "size": 17707203,
        },
        {
            "dataset": "beans",
            "config": "default",
            "split": "train",
            "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet",
            "filename": "0000.parquet",
            "size": 143780164,
        },
        {
            "dataset": "beans",
            "config": "default",
            "split": "validation",
            "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/validation/0000.parquet",
            "filename": "0000.parquet",
            "size": 18500862,
        },
    ]
    dataset_infos = {
        "default": DatasetInfo(
            dataset_name="beans",
            config_name="default",
            version="0.0.0",
            splits={
                "train": {
                    "name": "train",
                    "num_bytes": 143996486,
                    "num_examples": 1034,
                    "shard_lengths": None,
                    "dataset_name": "beans",
                },
                "validation": {
                    "name": "validation",
                    "num_bytes": 18525985,
                    "num_examples": 133,
                    "shard_lengths": None,
                    "dataset_name": "beans",
                },
                "test": {
                    "name": "test",
                    "num_bytes": 17730506,
                    "num_examples": 128,
                    "shard_lengths": None,
                    "dataset_name": "beans",
                },
            },
            download_checksums={
                "https://huggingface.co/datasets/beans/resolve/main/data/train.zip": {
                    "num_bytes": 143812152,
                    "checksum": None,
                },
                "https://huggingface.co/datasets/beans/resolve/main/data/validation.zip": {
                    "num_bytes": 18504213,
                    "checksum": None,
                },
                "https://huggingface.co/datasets/beans/resolve/main/data/test.zip": {
                    "num_bytes": 17708541,
                    "checksum": None,
                },
            },
            download_size=180024906,
            post_processing_size=None,
            dataset_size=180252977,
            size_in_bytes=360277883,
        )
    }
    metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos(
        "123", exported_parquet_files, dataset_infos
    )
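
    # The exported parquet listing above is ordered alphabetically (test,
    # train, validation); the helper is expected to reorder data_files to
    # match the split order declared in dataset_infos (train, validation,
    # test).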
    split_names = [data_file["split"] for data_file in metadata_configs["default"]["data_files"]]
    assert split_names == ["train", "validation", "test"]