datasets

Форк
0
76 строк · 3.1 Кб
1
from pathlib import Path
2

3
import pytest
4

5
from datasets import load_dataset
6
from datasets.packaged_modules.cache.cache import Cache
7

8

9
# Dataset repository id that test_cache_multi_configs passes to load_dataset and Cache.
# NOTE(review): going by its name, this repo presumably declares two configs in its metadata — confirm.
SAMPLE_DATASET_TWO_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_two_configs_in_metadata"
10

11

12
def test_cache(text_dir: Path):
    """Reload a dataset through ``Cache`` with an explicit hash and compare it to the original.

    The hash handed to ``Cache`` is read from the first cache file of the
    "train" split: it is the second-to-last component of that file's path,
    i.e. the name of the directory holding the file.

    Args:
        text_dir: Path given to ``load_dataset``; its ``name`` is used as the
            dataset name (presumably a pytest fixture defined elsewhere).
    """
    original = load_dataset(str(text_dir))
    first_cache_file = Path(original["train"].cache_files[0]["filename"])
    cache_hash = first_cache_file.parts[-2]
    builder = Cache(dataset_name=text_dir.name, hash=cache_hash)
    restored = builder.as_dataset()
    # Compare both the top-level iteration and the rows of the "train" split.
    assert list(original) == list(restored)
    assert list(original["train"]) == list(restored["train"])
19

20

21
def test_cache_streaming(text_dir: Path):
    """Reload a dataset through ``Cache.as_streaming_dataset`` and compare it to the original.

    The dataset is first loaded with ``load_dataset``; the hash is taken from
    the second-to-last path component of the first "train" cache file and then
    used to build the ``Cache`` whose streaming view is checked.

    Args:
        text_dir: Path given to ``load_dataset``; its ``name`` is used as the
            dataset name (presumably a pytest fixture defined elsewhere).
    """
    original = load_dataset(str(text_dir))
    first_cache_file = Path(original["train"].cache_files[0]["filename"])
    cache_hash = first_cache_file.parts[-2]
    builder = Cache(dataset_name=text_dir.name, hash=cache_hash)
    streamed = builder.as_streaming_dataset()
    # Compare both the top-level iteration and the rows of the "train" split.
    assert list(original) == list(streamed)
    assert list(original["train"]) == list(streamed["train"])
28

29

30
def test_cache_auto_hash(text_dir: Path):
    """Reload a dataset through ``Cache`` using ``version="auto"`` and ``hash="auto"``.

    Unlike ``test_cache``, no hash is extracted from the cache files here; the
    ``Cache`` is built with the "auto" placeholders only.

    Args:
        text_dir: Path given to ``load_dataset``; its ``name`` is used as the
            dataset name (presumably a pytest fixture defined elsewhere).
    """
    original = load_dataset(str(text_dir))
    builder = Cache(dataset_name=text_dir.name, version="auto", hash="auto")
    restored = builder.as_dataset()
    # Compare both the top-level iteration and the rows of the "train" split.
    assert list(original) == list(restored)
    assert list(original["train"]) == list(restored["train"])
36

37

38
def test_cache_auto_hash_with_custom_config(text_dir: Path):
    """Check that "auto" caches with and without ``sample_by="paragraph"`` stay distinct.

    The same directory is loaded twice, once with ``sample_by="paragraph"`` and
    once with default arguments. Two ``Cache`` objects are then built with the
    matching arguments; their ``config_id`` values must differ in the
    "paragraph" suffix, and each must reload the dataset it corresponds to.

    Args:
        text_dir: Path given to ``load_dataset``; its ``name`` is used as the
            dataset name (presumably a pytest fixture defined elsewhere).
    """
    dataset_name = text_dir.name

    # Populate the cache with both variants before building any Cache object.
    paragraph_ds = load_dataset(str(text_dir), sample_by="paragraph")
    default_ds = load_dataset(str(text_dir))

    paragraph_cache = Cache(dataset_name=dataset_name, version="auto", hash="auto", sample_by="paragraph")
    default_cache = Cache(dataset_name=dataset_name, version="auto", hash="auto")

    # Only the cache built with the custom argument carries the "paragraph" suffix.
    assert paragraph_cache.config_id.endswith("paragraph")
    assert not default_cache.config_id.endswith("paragraph")

    paragraph_reloaded = paragraph_cache.as_dataset()
    default_reloaded = default_cache.as_dataset()

    assert list(paragraph_ds) == list(paragraph_reloaded)
    assert list(paragraph_ds["train"]) == list(paragraph_reloaded["train"])
    assert list(default_ds) == list(default_reloaded)
    assert list(default_ds["train"]) == list(default_reloaded["train"])
51

52

53
def test_cache_missing(text_dir: Path):
    """Check that ``Cache`` raises ``ValueError`` for a dataset name, hash or config that is not cached.

    After loading the dataset once, ``download_and_prepare`` on a matching
    "auto" cache is expected to succeed. Three non-matching argument sets are
    then tried, each of which must raise ``ValueError`` either while the
    ``Cache`` is constructed or during ``download_and_prepare``.

    Args:
        text_dir: Path given to ``load_dataset``; its ``name`` is used as the
            dataset name (presumably a pytest fixture defined elsewhere).
    """
    load_dataset(str(text_dir))
    dataset_name = text_dir.name

    # Sanity check: the entry that was just loaded is found without error.
    Cache(dataset_name=dataset_name, version="auto", hash="auto").download_and_prepare()

    failing_kwargs = (
        # Unknown dataset name.
        {"dataset_name": "missing", "version": "auto", "hash": "auto"},
        # Known dataset, unknown hash (no version passed here).
        {"dataset_name": dataset_name, "hash": "missing"},
        # Known dataset, unknown config name.
        {"dataset_name": dataset_name, "config_name": "missing", "version": "auto", "hash": "auto"},
    )
    for kwargs in failing_kwargs:
        with pytest.raises(ValueError):
            Cache(**kwargs).download_and_prepare()
62

63

64
@pytest.mark.integration
def test_cache_multi_configs():
    """Reload one named config of a repository dataset and check the error for an unknown config.

    The "v1" config of ``SAMPLE_DATASET_TWO_CONFIG_IN_METADATA`` is loaded and
    then reloaded through an "auto" ``Cache``. Only the number of "train" rows
    is compared, not the rows themselves. Building a ``Cache`` for the config
    name "missing" must raise ``ValueError``, and the error message must
    mention "v1".
    """
    repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA
    known_config = "v1"
    # Arguments shared by the valid and the invalid Cache construction below.
    shared_kwargs = {
        "dataset_name": repo_id.split("/")[-1],
        "repo_id": repo_id,
        "version": "auto",
        "hash": "auto",
    }

    original = load_dataset(repo_id, known_config)
    restored = Cache(config_name=known_config, **shared_kwargs).as_dataset()
    assert list(original) == list(restored)
    assert len(original["train"]) == len(restored["train"])

    # Here the constructor itself is expected to raise, before any prepare step.
    with pytest.raises(ValueError) as excinfo:
        Cache(config_name="missing", **shared_kwargs)
    assert known_config in str(excinfo.value)
77

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.