datasets

Форк
0
150 строк · 4.6 Кб
1
import os
2
import time
3
import uuid
4
from contextlib import contextmanager
5
from typing import Optional
6

7
import pytest
8
import requests
9
from huggingface_hub.hf_api import HfApi, RepositoryNotFoundError
10

11

12
# Account used by the test-suite when talking to the CI Hub instance.
CI_HUB_USER = "__DUMMY_TRANSFORMERS_USER__"
CI_HUB_USER_FULL_NAME = "Dummy User"
CI_HUB_USER_TOKEN = "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"

# Base endpoint of the CI Hub and the URL templates derived from it.
# The `{repo_id}`, `{revision}`, `{path}` / `{filename}` placeholders stay
# literal in the resulting strings (hence the doubled braces).
CI_HUB_ENDPOINT = "https://hub-ci.huggingface.co"
CI_HUB_DATASETS_URL = f"{CI_HUB_ENDPOINT}/datasets/{{repo_id}}/resolve/{{revision}}/{{path}}"
CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE = f"{CI_HUB_ENDPOINT}/{{repo_id}}/resolve/{{revision}}/{{filename}}"
19

20

21
@pytest.fixture
def ci_hfh_hf_hub_url(monkeypatch):
    """Patch huggingface_hub's download URL template with the CI Hub one."""
    target = "huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE"
    monkeypatch.setattr(target, CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE)
26

27

28
@pytest.fixture
def ci_hub_config(monkeypatch):
    """Patch the `datasets.config` endpoint settings with the CI Hub values."""
    overrides = {
        "datasets.config.HF_ENDPOINT": CI_HUB_ENDPOINT,
        "datasets.config.HUB_DATASETS_URL": CI_HUB_DATASETS_URL,
    }
    for target, value in overrides.items():
        monkeypatch.setattr(target, value)
32

33

34
@pytest.fixture
def set_ci_hub_access_token(ci_hub_config):
    """Expose the CI Hub token through the ``HF_TOKEN`` environment variable.

    The whole environment is snapshotted beforehand and restored wholesale
    after the test, so any other variable changed meanwhile is reset as well.
    """
    saved_env = os.environ.copy()
    os.environ["HF_TOKEN"] = CI_HUB_USER_TOKEN
    yield
    # Teardown: wipe the current environment and put the snapshot back.
    os.environ.clear()
    os.environ.update(saved_env)
41

42

43
@pytest.fixture(scope="session")
def hf_api():
    """Return an ``HfApi`` client bound to the CI Hub endpoint (one per session)."""
    api = HfApi(endpoint=CI_HUB_ENDPOINT)
    return api
46

47

48
@pytest.fixture(scope="session")
def hf_token():
    """Yield the CI Hub user token for the duration of the session."""
    token = CI_HUB_USER_TOKEN
    yield token
51

52

53
@pytest.fixture
def cleanup_repo(hf_api):
    """Return a callable that deletes a dataset repo using the CI Hub token."""

    def _delete_dataset_repo(repo_id):
        # Always acts on dataset repos and authenticates as the CI user.
        hf_api.delete_repo(repo_id, token=CI_HUB_USER_TOKEN, repo_type="dataset")

    return _delete_dataset_repo
59

60

61
@pytest.fixture
def temporary_repo(cleanup_repo):
    """Return a context manager that yields a repo id and deletes the repo on exit.

    The context manager does not create the repo itself; it only guarantees
    that ``cleanup_repo`` is called for the id when the ``with`` block ends.
    """

    @contextmanager
    def _temporary_repo(repo_id: Optional[str] = None):
        if not repo_id:
            # No id supplied: build one from a random hex prefix and a timestamp.
            random_part = uuid.uuid4().hex[:6]
            time_part = int(time.time() * 10e3)
            repo_id = f"{CI_HUB_USER}/test-dataset-{random_part}-{time_part}"
        try:
            yield repo_id
        finally:
            try:
                cleanup_repo(repo_id)
            except RepositoryNotFoundError:
                # Nothing to delete, e.g. the repo was never created.
                pass

    return _temporary_repo
75

76

77
@pytest.fixture(scope="session")
def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file):
    """Session-scoped private dataset repo holding a single text file.

    Creates the repo, uploads ``text_file`` as ``data/text_data.txt``, yields
    the repo id, and deletes the repo at teardown. The repo is also deleted
    when the upload itself fails, so a failed setup does not leave a repo
    behind on the Hub.
    """
    # Time-derived suffix so the repo name differs from one session to the next.
    repo_name = f"repo_txt_data-{int(time.time() * 10e6)}"
    repo_id = f"{CI_HUB_USER}/{repo_name}"
    hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
    try:
        hf_api.upload_file(
            token=hf_token,
            path_or_fileobj=str(text_file),
            path_in_repo="data/text_data.txt",
            repo_id=repo_id,
            repo_type="dataset",
        )
        yield repo_id
    finally:
        # Best-effort cleanup: runs after the session and on a failed upload.
        try:
            hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
        except (requests.exceptions.HTTPError, ValueError):  # catch http error and token invalid error
            pass
94

95

96
@pytest.fixture()
def hf_private_dataset_repo_txt_data(hf_private_dataset_repo_txt_data_, ci_hub_config, ci_hfh_hf_hub_url):
    """Hand out the session-level text repo id with the CI Hub patches active.

    ``ci_hub_config`` and ``ci_hfh_hf_hub_url`` are requested only for their
    monkeypatching side effects.
    """
    repo_id = hf_private_dataset_repo_txt_data_
    return repo_id
99

100

101
@pytest.fixture(scope="session")
def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_with_dir_path):
    """Session-scoped private dataset repo holding a single ``data.zip`` archive.

    Creates the repo, uploads ``zip_csv_with_dir_path`` as ``data.zip``, yields
    the repo id, and deletes the repo at teardown. The repo is also deleted
    when the upload itself fails, so a failed setup does not leave a repo
    behind on the Hub.
    """
    # Time-derived suffix so the repo name differs from one session to the next.
    repo_name = f"repo_zipped_txt_data-{int(time.time() * 10e6)}"
    repo_id = f"{CI_HUB_USER}/{repo_name}"
    hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
    try:
        hf_api.upload_file(
            token=hf_token,
            path_or_fileobj=str(zip_csv_with_dir_path),
            path_in_repo="data.zip",
            repo_id=repo_id,
            repo_type="dataset",
        )
        yield repo_id
    finally:
        # Best-effort cleanup: runs after the session and on a failed upload.
        try:
            hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
        except (requests.exceptions.HTTPError, ValueError):  # catch http error and token invalid error
            pass
118

119

120
@pytest.fixture()
def hf_private_dataset_repo_zipped_txt_data(
    hf_private_dataset_repo_zipped_txt_data_, ci_hub_config, ci_hfh_hf_hub_url
):
    """Hand out the session-level zipped-text repo id with the CI Hub patches active.

    ``ci_hub_config`` and ``ci_hfh_hf_hub_url`` are requested only for their
    monkeypatching side effects.
    """
    repo_id = hf_private_dataset_repo_zipped_txt_data_
    return repo_id
125

126

127
@pytest.fixture(scope="session")
def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_path):
    """Session-scoped private dataset repo holding a single ``data.zip`` archive.

    Creates the repo, uploads ``zip_image_path`` as ``data.zip``, yields the
    repo id, and deletes the repo at teardown. The repo is also deleted when
    the upload itself fails, so a failed setup does not leave a repo behind
    on the Hub.
    """
    # Time-derived suffix so the repo name differs from one session to the next.
    repo_name = f"repo_zipped_img_data-{int(time.time() * 10e6)}"
    repo_id = f"{CI_HUB_USER}/{repo_name}"
    hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
    try:
        hf_api.upload_file(
            token=hf_token,
            path_or_fileobj=str(zip_image_path),
            path_in_repo="data.zip",
            repo_id=repo_id,
            repo_type="dataset",
        )
        yield repo_id
    finally:
        # Best-effort cleanup: runs after the session and on a failed upload.
        try:
            hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
        except (requests.exceptions.HTTPError, ValueError):  # catch http error and token invalid error
            pass
144

145

146
@pytest.fixture()
def hf_private_dataset_repo_zipped_img_data(
    hf_private_dataset_repo_zipped_img_data_, ci_hub_config, ci_hfh_hf_hub_url
):
    """Hand out the session-level zipped-image repo id with the CI Hub patches active.

    ``ci_hub_config`` and ``ci_hfh_hf_hub_url`` are requested only for their
    monkeypatching side effects.
    """
    repo_id = hf_private_dataset_repo_zipped_img_data_
    return repo_id
151

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.