1
from __future__ import annotations
5
from unittest.mock import Mock, patch
11
from mteb.abstasks import AbsTask
12
from mteb.abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval
13
from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
14
from mteb.abstasks.MultiSubsetLoader import MultiSubsetLoader
16
logging.basicConfig(level=logging.INFO)
19
@pytest.mark.parametrize("task", MTEB().tasks_cls)
@patch("datasets.load_dataset")
@patch("datasets.concatenate_datasets")
def test_load_data(
    mock_concatenate_datasets: Mock, mock_load_dataset: Mock, task: AbsTask
):
    """Smoke-test ``task.load_data()`` with the ``datasets`` library mocked out.

    ``mock_concatenate_datasets`` is injected by the outermost ``@patch`` and
    must stay in the signature even though it is not asserted on directly.

    NOTE(review): reconstructed from a garbled extraction — the original
    ``def`` line and several body lines were lost; verify the function name,
    the skip condition, and the exact assertions against upstream.
    """
    # Retrieval-style and multi-subset tasks override load_data with a
    # different loading path, so this generic check does not apply to them.
    if (
        isinstance(task, AbsTaskRetrieval)
        or isinstance(task, AbsTaskInstructionRetrieval)
        or isinstance(task, MultiSubsetLoader)
    ):
        pytest.skip()

    with patch.object(task, "dataset_transform") as mock_dataset_transform:
        task.load_data()
        mock_load_dataset.assert_called()
        # Single-subset tasks transform the dataset exactly once; cross-/multi-
        # lingual tasks may call it once per subset, so they are not checked.
        if not task.is_crosslingual and not task.is_multilingual:
            mock_dataset_transform.assert_called_once()
async def check_dataset_on_hf(
    session: aiohttp.ClientSession, dataset: str, revision: str
) -> bool:
    """Return True if ``dataset`` at ``revision`` exists on the Hugging Face Hub.

    Issues a single HEAD request against the dataset's ``tree/<revision>``
    URL; a 200 status means both the repository and the revision exist.
    """
    url = f"https://huggingface.co/datasets/{dataset}/tree/{revision}"
    async with session.head(url) as response:
        return response.status == 200
async def check_datasets_are_available_on_hf(tasks):
    """Assert that every task's (path, revision) dataset is reachable on the Hub.

    All HEAD requests are fired concurrently via ``asyncio.gather``; on
    failure the assertion message lists every missing (path, revision) pair.

    NOTE(review): reconstructed from a garbled extraction — the comprehension
    scaffolding and the ``if`` guards were lost; verify against upstream.
    """
    does_not_exist = []
    async with aiohttp.ClientSession() as session:
        # One availability check per task, run concurrently on one session.
        tasks_checks = [
            check_dataset_on_hf(
                session,
                task.metadata.dataset["path"],
                task.metadata.dataset["revision"],
            )
            for task in tasks
        ]
        datasets_exists = await asyncio.gather(*tasks_checks)

    for task, ds_exists in zip(tasks, datasets_exists):
        if not ds_exists:
            does_not_exist.append(
                (task.metadata.dataset["path"], task.metadata.dataset["revision"])
            )

    if does_not_exist:
        pretty_print = "\n".join(
            [f"{ds[0]} - revision {ds[1]}" for ds in does_not_exist]
        )
        assert False, f"Datasets not available on Hugging Face:\n{pretty_print}"
def test_dataset_availability():
    """Checks if the datasets are available on Hugging Face using both their name and revision."""
    # Gather every registered task, then verify all of their datasets
    # concurrently in one event loop run.
    tasks = MTEB().tasks_cls
    asyncio.run(check_datasets_are_available_on_hf(tasks))