# Tests for the `datasets` Cache packaged module.
1from pathlib import Path
2
3import pytest
4
5from datasets import load_dataset
6from datasets.packaged_modules.cache.cache import Cache
7
8
# Hub repo id of a sample dataset that declares two configs in its metadata
# (used by the integration test below).
SAMPLE_DATASET_TWO_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_two_configs_in_metadata"
10
11
def test_cache(text_dir: Path):
    """A prepared dataset can be reloaded from the cache via the Cache builder.

    Loads a text dataset, recovers the cache hash from the path of the
    produced Arrow file, and checks that ``Cache.as_dataset`` returns splits
    with identical contents.
    """
    ds = load_dataset(str(text_dir))
    # The parent directory of the cached Arrow file is named after the hash.
    # Renamed from `hash` to avoid shadowing the builtin of the same name.
    cache_hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2]
    cache = Cache(dataset_name=text_dir.name, hash=cache_hash)
    reloaded = cache.as_dataset()
    assert list(ds) == list(reloaded)
    assert list(ds["train"]) == list(reloaded["train"])
19
20
def test_cache_streaming(text_dir: Path):
    """A prepared dataset can be reloaded from the cache in streaming mode.

    Same setup as ``test_cache`` but reloads with
    ``Cache.as_streaming_dataset`` and compares split contents.
    """
    ds = load_dataset(str(text_dir))
    # The parent directory of the cached Arrow file is named after the hash.
    # Renamed from `hash` to avoid shadowing the builtin of the same name.
    cache_hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2]
    cache = Cache(dataset_name=text_dir.name, hash=cache_hash)
    reloaded = cache.as_streaming_dataset()
    assert list(ds) == list(reloaded)
    assert list(ds["train"]) == list(reloaded["train"])
28
29
def test_cache_auto_hash(text_dir: Path):
    """Cache resolves the version and hash itself when both are "auto"."""
    expected = load_dataset(str(text_dir))
    auto_cache = Cache(dataset_name=text_dir.name, version="auto", hash="auto")
    reloaded = auto_cache.as_dataset()
    assert list(expected) == list(reloaded)
    assert list(expected["train"]) == list(reloaded["train"])
36
37
def test_cache_auto_hash_with_custom_config(text_dir: Path):
    """Custom builder kwargs yield a distinct config id that still reloads.

    Prepares the same text dataset twice — once with ``sample_by="paragraph"``
    and once with defaults — then checks each Cache instance resolves to its
    own config and reloads the matching data.
    """
    ds_paragraph = load_dataset(str(text_dir), sample_by="paragraph")
    ds_default = load_dataset(str(text_dir))
    cache_paragraph = Cache(dataset_name=text_dir.name, version="auto", hash="auto", sample_by="paragraph")
    cache_default = Cache(dataset_name=text_dir.name, version="auto", hash="auto")
    # The custom kwarg is encoded into the config id; the default one is not.
    assert cache_paragraph.config_id.endswith("paragraph")
    assert not cache_default.config_id.endswith("paragraph")
    reloaded_paragraph = cache_paragraph.as_dataset()
    reloaded_default = cache_default.as_dataset()
    assert list(ds_paragraph) == list(reloaded_paragraph)
    assert list(ds_paragraph["train"]) == list(reloaded_paragraph["train"])
    assert list(ds_default) == list(reloaded_default)
    assert list(ds_default["train"]) == list(reloaded_default["train"])
51
52
def test_cache_missing(text_dir: Path):
    """Cache raises ValueError when no matching cached dataset exists."""
    load_dataset(str(text_dir))  # populate the cache first
    # A fully matching entry prepares without error.
    Cache(dataset_name=text_dir.name, version="auto", hash="auto").download_and_prepare()
    # Unknown dataset name, unknown hash, and unknown config name all fail.
    for bad_kwargs in (
        {"dataset_name": "missing", "version": "auto", "hash": "auto"},
        {"dataset_name": text_dir.name, "hash": "missing"},
        {"dataset_name": text_dir.name, "config_name": "missing", "version": "auto", "hash": "auto"},
    ):
        with pytest.raises(ValueError):
            Cache(**bad_kwargs).download_and_prepare()
62
63
@pytest.mark.integration
def test_cache_multi_configs():
    """Cache can reload a single config of a multi-config Hub dataset."""
    repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA
    dataset_name = repo_id.split("/")[-1]
    config_name = "v1"
    expected = load_dataset(repo_id, config_name)
    cache = Cache(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, version="auto", hash="auto")
    reloaded = cache.as_dataset()
    assert list(expected) == list(reloaded)
    assert len(expected["train"]) == len(reloaded["train"])
    # Asking for an unknown config fails, and the error names the valid configs.
    with pytest.raises(ValueError) as excinfo:
        Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
    assert config_name in str(excinfo.value)
77