12
import pyarrow.parquet as pq
@pytest.fixture(scope="session")
def dataset():
    """Session-scoped in-memory `datasets.Dataset` with sequence, class-label and nested features."""
    n = 10  # NOTE(review): reconstructed — `n` is used in the column literals below; confirm original value
    features = datasets.Features(
        {
            "tokens": datasets.Sequence(datasets.Value("string")),
            "labels": datasets.Sequence(datasets.ClassLabel(names=["negative", "positive"])),
            "answers": datasets.Sequence(
                {
                    "text": datasets.Value("string"),
                    "answer_start": datasets.Value("int32"),
                }
            ),
            "id": datasets.Value("int64"),
        }
    )
    dataset = datasets.Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            "answers": [{"answer_start": [97], "text": ["1976"]}] * 10,
            # NOTE(review): "id" column reconstructed to satisfy the declared "id" feature — confirm
            "id": list(range(n)),
        },
        features=features,
    )
    return dataset
@pytest.fixture(scope="session")
def arrow_file(tmp_path_factory, dataset):
    """Materialize the `dataset` fixture into an Arrow cache file and return its path."""
    filename = str(tmp_path_factory.mktemp("data") / "file.arrow")
    # map() with no function is a no-op transform that forces the cache file to be written
    dataset.map(cache_file_name=filename)
    return filename
# Sample multi-line text payload shared by the compressed-file fixtures below.
# NOTE(review): the first line of the literal was lost in extraction and is reconstructed — confirm.
FILE_CONTENT = """\
    Text data.
    Second line of data."""
@pytest.fixture(scope="session")
def text_file(tmp_path_factory):
    """Write FILE_CONTENT to a plain-text file and return its path."""
    filename = tmp_path_factory.mktemp("data") / "file.txt"
    data = FILE_CONTENT
    with open(filename, "w") as f:
        f.write(data)
    return filename
@pytest.fixture(scope="session")
def bz2_file(tmp_path_factory):
    """Write FILE_CONTENT to a bz2-compressed file and return its path."""
    import bz2

    path = tmp_path_factory.mktemp("data") / "file.txt.bz2"
    data = bytes(FILE_CONTENT, "utf-8")
    with bz2.open(path, "wb") as f:
        f.write(data)
    return path
@pytest.fixture(scope="session")
def gz_file(tmp_path_factory):
    """Write FILE_CONTENT to a gzip-compressed file and return its path."""
    import gzip

    path = str(tmp_path_factory.mktemp("data") / "file.txt.gz")
    data = bytes(FILE_CONTENT, "utf-8")
    with gzip.open(path, "wb") as f:
        f.write(data)
    return path
@pytest.fixture(scope="session")
def lz4_file(tmp_path_factory):
    """Write FILE_CONTENT to an lz4-compressed file; yields None when lz4 is unavailable."""
    if datasets.config.LZ4_AVAILABLE:
        import lz4.frame

        path = tmp_path_factory.mktemp("data") / "file.txt.lz4"
        data = bytes(FILE_CONTENT, "utf-8")
        with lz4.frame.open(path, "wb") as f:
            f.write(data)
        return path
@pytest.fixture(scope="session")
def seven_zip_file(tmp_path_factory, text_file):
    """Pack the `text_file` fixture into a .7z archive; yields None when py7zr is unavailable."""
    if datasets.config.PY7ZR_AVAILABLE:
        import py7zr

        path = tmp_path_factory.mktemp("data") / "file.txt.7z"
        with py7zr.SevenZipFile(path, "w") as archive:
            archive.write(text_file, arcname=os.path.basename(text_file))
        return path
@pytest.fixture(scope="session")
def tar_file(tmp_path_factory, text_file):
    """Pack the `text_file` fixture into an uncompressed tar archive and return its path."""
    path = tmp_path_factory.mktemp("data") / "file.txt.tar"
    with tarfile.TarFile(path, "w") as f:
        f.add(text_file, arcname=os.path.basename(text_file))
    return path
@pytest.fixture(scope="session")
def xz_file(tmp_path_factory):
    """Write FILE_CONTENT to an xz (LZMA) compressed file and return its path."""
    import lzma

    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
    data = bytes(FILE_CONTENT, "utf-8")
    with lzma.open(path, "wb") as f:
        f.write(data)
    return path
@pytest.fixture(scope="session")
def zip_file(tmp_path_factory, text_file):
    """Pack the `text_file` fixture into a zip archive and return its path."""
    path = tmp_path_factory.mktemp("data") / "file.txt.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(text_file, arcname=os.path.basename(text_file))
    return path
@pytest.fixture(scope="session")
def zstd_file(tmp_path_factory):
    """Write FILE_CONTENT to a zstd-compressed file; yields None when zstandard is unavailable."""
    if datasets.config.ZSTANDARD_AVAILABLE:
        import zstandard as zstd

        path = tmp_path_factory.mktemp("data") / "file.txt.zst"
        data = bytes(FILE_CONTENT, "utf-8")
        with zstd.open(path, "wb") as f:
            f.write(data)
        return path
@pytest.fixture(scope="session")
def xml_file(tmp_path_factory):
    """Write a small TMX (translation-memory XML) document and return its path.

    NOTE(review): the <tmx>/<body>/<tu> wrapper lines were lost in extraction and are
    reconstructed around the five visible <tuv> pairs — confirm against the original.
    """
    filename = tmp_path_factory.mktemp("data") / "file.xml"
    data = textwrap.dedent(
        """\
    <?xml version="1.0" encoding="UTF-8" ?>
    <tmx version="1.4">
      <header segtype="sentence" srclang="ca" />
      <body>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 1</seg></tuv>
          <tuv xml:lang="en"><seg>Content 1</seg></tuv>
        </tu>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 2</seg></tuv>
          <tuv xml:lang="en"><seg>Content 2</seg></tuv>
        </tu>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 3</seg></tuv>
          <tuv xml:lang="en"><seg>Content 3</seg></tuv>
        </tu>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 4</seg></tuv>
          <tuv xml:lang="en"><seg>Content 4</seg></tuv>
        </tu>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 5</seg></tuv>
          <tuv xml:lang="en"><seg>Content 5</seg></tuv>
        </tu>
      </body>
    </tmx>"""
    )
    with open(filename, "w") as f:
        f.write(data)
    return filename
# Canonical three-column sample rows used by the csv/json/parquet/sqlite fixtures below.
DATA = [
    {"col_1": "0", "col_2": 0, "col_3": 0.0},
    {"col_1": "1", "col_2": 1, "col_3": 1.0},
    {"col_1": "2", "col_2": 2, "col_3": 2.0},
    {"col_1": "3", "col_2": 3, "col_3": 3.0},
]
# NOTE(review): name of this second row set reconstructed as DATA2 — confirm against callers.
DATA2 = [
    {"col_1": "4", "col_2": 4, "col_3": 4.0},
    {"col_1": "5", "col_2": 5, "col_3": 5.0},
]
# Same data as DATA, transposed into column lists.
DATA_DICT_OF_LISTS = {
    "col_1": ["0", "1", "2", "3"],
    "col_2": [0, 1, 2, 3],
    "col_3": [0.0, 1.0, 2.0, 3.0],
}
# Rows with keys in non-canonical order (col_3 first).
DATA_312 = [
    {"col_3": 0.0, "col_1": "0", "col_2": 0},
    {"col_3": 1.0, "col_1": "1", "col_2": 1},
]
# Rows whose col_1 values are non-numeric strings.
DATA_STR = [
    {"col_1": "s0", "col_2": 0, "col_3": 0.0},
    {"col_1": "s1", "col_2": 1, "col_3": 1.0},
    {"col_1": "s2", "col_2": 2, "col_3": 2.0},
    {"col_1": "s3", "col_2": 3, "col_3": 3.0},
]
@pytest.fixture(scope="session")
def dataset_dict():
    # NOTE(review): the `def` line was lost in extraction; name reconstructed from the return value — confirm.
    """Expose the module-level DATA_DICT_OF_LISTS mapping as a fixture."""
    return DATA_DICT_OF_LISTS
@pytest.fixture(scope="session")
def arrow_path(tmp_path_factory):
    """Write DATA_DICT_OF_LISTS as an Arrow cache file and return its path."""
    dataset = datasets.Dataset.from_dict(DATA_DICT_OF_LISTS)
    path = str(tmp_path_factory.mktemp("data") / "dataset.arrow")
    # map() with no function just forces the cache file to be written
    dataset.map(cache_file_name=path)
    return path
@pytest.fixture(scope="session")
def sqlite_path(tmp_path_factory):
    """Create a SQLite database with a single `dataset` table filled from DATA."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.sqlite")
    with contextlib.closing(sqlite3.connect(path)) as con:
        cur = con.cursor()
        cur.execute("CREATE TABLE dataset(col_1 text, col_2 int, col_3 real)")
        # Parameterized inserts; relies on each DATA row having keys in col_1/col_2/col_3 order
        for item in DATA:
            cur.execute("INSERT INTO dataset(col_1, col_2, col_3) VALUES (?, ?, ?)", tuple(item.values()))
        con.commit()
    return path
@pytest.fixture(scope="session")
def csv_path(tmp_path_factory):
    """Write DATA to dataset.csv (with header) and return its path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.csv")
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
        writer.writeheader()
        for item in DATA:
            writer.writerow(item)
    return path
@pytest.fixture(scope="session")
def csv2_path(tmp_path_factory):
    """Write DATA to a second CSV file (dataset2.csv) and return its path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset2.csv")
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
        writer.writeheader()
        # NOTE(review): row source reconstructed as DATA (same as csv_path) — confirm
        for item in DATA:
            writer.writerow(item)
    return path
@pytest.fixture(scope="session")
def bz2_csv_path(csv_path, tmp_path_factory):
    """Bz2-compress the `csv_path` fixture file and return the new path."""
    import bz2

    path = tmp_path_factory.mktemp("data") / "dataset.csv.bz2"
    with open(csv_path, "rb") as f:
        data = f.read()
    with bz2.open(path, "wb") as f:
        f.write(data)
    return path
@pytest.fixture(scope="session")
def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
    """Zip both CSV fixture files at the archive root and return the archive path."""
    path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(csv_path, arcname=os.path.basename(csv_path))
        f.write(csv2_path, arcname=os.path.basename(csv2_path))
    return path
@pytest.fixture(scope="session")
def zip_uppercase_csv_path(csv_path, csv2_path, tmp_path_factory):
    """Zip both CSV fixtures with uppercase .CSV member extensions and return the archive path."""
    path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(csv_path, arcname=os.path.basename(csv_path.replace(".csv", ".CSV")))
        f.write(csv2_path, arcname=os.path.basename(csv2_path.replace(".csv", ".CSV")))
    return path
@pytest.fixture(scope="session")
def zip_csv_with_dir_path(csv_path, csv2_path, tmp_path_factory):
    """Zip both CSV fixtures under a main_dir/ prefix and return the archive path."""
    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.csv.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(csv_path, arcname=os.path.join("main_dir", os.path.basename(csv_path)))
        f.write(csv2_path, arcname=os.path.join("main_dir", os.path.basename(csv2_path)))
    return path
@pytest.fixture(scope="session")
def parquet_path(tmp_path_factory):
    """Write DATA to a Parquet file with an explicit string/int64/float64 schema and return its path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.parquet")
    schema = pa.schema(
        {
            "col_1": pa.string(),
            # NOTE(review): col_2 type line lost in extraction; reconstructed as int64 — confirm
            "col_2": pa.int64(),
            "col_3": pa.float64(),
        }
    )
    with open(path, "wb") as f:
        writer = pq.ParquetWriter(f, schema=schema)
        # Transpose the list-of-dict rows into columnar form expected by from_pydict
        pa_table = pa.Table.from_pydict({k: [DATA[i][k] for i in range(len(DATA))] for k in DATA[0]}, schema=schema)
        writer.write_table(pa_table)
        writer.close()  # flush the Parquet footer before the file handle closes
    return path
@pytest.fixture(scope="session")
def geoparquet_path(tmp_path_factory):
    """Download the reference GeoParquet example and re-save it locally; returns the local path.

    NOTE(review): performs a network download at fixture setup time.
    """
    df = pd.read_parquet(path="https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet")
    path = str(tmp_path_factory.mktemp("data") / "dataset.geoparquet")
    df.to_parquet(path=path)
    return path
@pytest.fixture(scope="session")
def json_list_of_dicts_path(tmp_path_factory):
    """Write {"data": DATA} (list-of-dict rows) as JSON and return the file path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
    data = {"data": DATA}
    with open(path, "w") as f:
        json.dump(data, f)
    return path
@pytest.fixture(scope="session")
def json_dict_of_lists_path(tmp_path_factory):
    """Write {"data": DATA_DICT_OF_LISTS} (columnar form) as JSON and return the file path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
    data = {"data": DATA_DICT_OF_LISTS}
    with open(path, "w") as f:
        json.dump(data, f)
    return path
@pytest.fixture(scope="session")
def jsonl_path(tmp_path_factory):
    """Write DATA as JSON Lines (one object per line) and return the file path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
    with open(path, "w") as f:
        for item in DATA:
            f.write(json.dumps(item) + "\n")
    return path
@pytest.fixture(scope="session")
def jsonl2_path(tmp_path_factory):
    """Write DATA to a second JSON Lines file (dataset2.jsonl) and return its path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset2.jsonl")
    with open(path, "w") as f:
        # NOTE(review): row source reconstructed as DATA (same as jsonl_path) — confirm
        for item in DATA:
            f.write(json.dumps(item) + "\n")
    return path
@pytest.fixture(scope="session")
def jsonl_312_path(tmp_path_factory):
    """Write DATA_312 (rows with col_3 first) as JSON Lines and return the file path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl")
    with open(path, "w") as f:
        for item in DATA_312:
            f.write(json.dumps(item) + "\n")
    return path
@pytest.fixture(scope="session")
def jsonl_str_path(tmp_path_factory):
    """Write DATA_STR (string col_1 values) as JSON Lines and return the file path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl")
    with open(path, "w") as f:
        for item in DATA_STR:
            f.write(json.dumps(item) + "\n")
    return path
@pytest.fixture(scope="session")
def text_gz_path(tmp_path_factory, text_path):
    """Gzip-compress the `text_path` fixture file and return the new path."""
    import gzip

    path = str(tmp_path_factory.mktemp("data") / "dataset.txt.gz")
    with open(text_path, "rb") as orig_file:
        with gzip.open(path, "wb") as zipped_file:
            zipped_file.writelines(orig_file)
    return path
@pytest.fixture(scope="session")
def jsonl_gz_path(tmp_path_factory, jsonl_path):
    """Gzip-compress the `jsonl_path` fixture file and return the new path."""
    import gzip

    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz")
    with open(jsonl_path, "rb") as orig_file:
        with gzip.open(path, "wb") as zipped_file:
            zipped_file.writelines(orig_file)
    return path
@pytest.fixture(scope="session")
def zip_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
    """Zip both JSONL fixture files at the archive root and return the archive path."""
    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(jsonl_path, arcname=os.path.basename(jsonl_path))
        f.write(jsonl2_path, arcname=os.path.basename(jsonl2_path))
    return path
@pytest.fixture(scope="session")
def zip_nested_jsonl_path(zip_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
    """Zip the `zip_jsonl_path` archive under a nested/ prefix (zip inside a zip) and return the path."""
    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(zip_jsonl_path, arcname=os.path.join("nested", os.path.basename(zip_jsonl_path)))
    return path
@pytest.fixture(scope="session")
def zip_jsonl_with_dir_path(jsonl_path, jsonl2_path, tmp_path_factory):
    """Zip both JSONL fixtures under a main_dir/ prefix and return the archive path."""
    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.jsonl.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(jsonl_path, arcname=os.path.join("main_dir", os.path.basename(jsonl_path)))
        f.write(jsonl2_path, arcname=os.path.join("main_dir", os.path.basename(jsonl2_path)))
    return path
@pytest.fixture(scope="session")
def tar_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
    """Pack both JSONL fixtures into a tar archive and return the archive path."""
    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.tar"
    with tarfile.TarFile(path, "w") as f:
        f.add(jsonl_path, arcname=os.path.basename(jsonl_path))
        f.add(jsonl2_path, arcname=os.path.basename(jsonl2_path))
    return path
@pytest.fixture(scope="session")
def tar_nested_jsonl_path(tar_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
    """Pack the `tar_jsonl_path` archive under a nested/ prefix (tar inside a tar) and return the path."""
    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.tar"
    with tarfile.TarFile(path, "w") as f:
        f.add(tar_jsonl_path, arcname=os.path.join("nested", os.path.basename(tar_jsonl_path)))
    return path
@pytest.fixture(scope="session")
def text_path(tmp_path_factory):
    """Write one digit per line to dataset.txt and return its path."""
    data = ["0", "1", "2", "3"]
    path = str(tmp_path_factory.mktemp("data") / "dataset.txt")
    with open(path, "w") as f:
        for item in data:
            f.write(item + "\n")
    return path
@pytest.fixture(scope="session")
def text2_path(tmp_path_factory):
    """Write one digit per line to a second text file (dataset2.txt) and return its path."""
    data = ["0", "1", "2", "3"]
    path = str(tmp_path_factory.mktemp("data") / "dataset2.txt")
    with open(path, "w") as f:
        for item in data:
            f.write(item + "\n")
    return path
@pytest.fixture(scope="session")
def text_dir(tmp_path_factory):
    """Create a directory containing one text file and return the directory path."""
    data = ["0", "1", "2", "3"]
    path = tmp_path_factory.mktemp("data_text_dir") / "dataset.txt"
    with open(path, "w") as f:
        for item in data:
            f.write(item + "\n")
    # NOTE(review): return reconstructed as the containing directory (fixture is named *_dir) — confirm
    return path.parent
@pytest.fixture(scope="session")
def text_dir_with_unsupported_extension(tmp_path_factory):
    """Create a directory containing a file with an unsupported (.abc) extension; return the directory."""
    data = ["0", "1", "2", "3"]
    path = tmp_path_factory.mktemp("data") / "dataset.abc"
    with open(path, "w") as f:
        for item in data:
            f.write(item + "\n")
    # NOTE(review): return reconstructed as the containing directory (fixture is named *_dir_*) — confirm
    return path.parent
@pytest.fixture(scope="session")
def zip_text_path(text_path, text2_path, tmp_path_factory):
    """Zip both text fixture files at the archive root and return the archive path."""
    path = tmp_path_factory.mktemp("data") / "dataset.text.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(text_path, arcname=os.path.basename(text_path))
        f.write(text2_path, arcname=os.path.basename(text2_path))
    return path
@pytest.fixture(scope="session")
def zip_text_with_dir_path(text_path, text2_path, tmp_path_factory):
    """Zip both text fixtures under a main_dir/ prefix and return the archive path."""
    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.text.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(text_path, arcname=os.path.join("main_dir", os.path.basename(text_path)))
        f.write(text2_path, arcname=os.path.join("main_dir", os.path.basename(text2_path)))
    return path
@pytest.fixture(scope="session")
def zip_unsupported_ext_path(text_path, text2_path, tmp_path_factory):
    """Zip the text fixtures under member names with an unsupported .ext extension; return the path."""
    path = tmp_path_factory.mktemp("data") / "dataset.ext.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(text_path, arcname=os.path.basename("unsupported.ext"))
        f.write(text2_path, arcname=os.path.basename("unsupported_2.ext"))
    return path
@pytest.fixture(scope="session")
def text_path_with_unicode_new_lines(tmp_path_factory):
    """Write text containing a Unicode paragraph separator (U+2029) and return the file path."""
    text = "\n".join(["First", "Second\u2029with Unicode new line", "Third"])
    path = str(tmp_path_factory.mktemp("data") / "dataset_with_unicode_new_lines.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    return path
@pytest.fixture(scope="session")
def image_file():
    # NOTE(review): `def` line lost in extraction; name `image_file` inferred from the
    # `zip_image_path(image_file, ...)` fixture below — confirm.
    """Repo-relative path of a small RGB JPEG test asset."""
    return os.path.join("tests", "features", "data", "test_image_rgb.jpg")
@pytest.fixture(scope="session")
def audio_file():
    # NOTE(review): `def` line lost in extraction; name reconstructed from the asset filename — confirm.
    """Repo-relative path of a small 44.1 kHz WAV test asset."""
    return os.path.join("tests", "features", "data", "test_audio_44100.wav")
@pytest.fixture(scope="session")
def zip_image_path(image_file, tmp_path_factory):
    """Zip the image asset twice (second copy renamed *2.jpg) and return the archive path."""
    path = tmp_path_factory.mktemp("data") / "dataset.img.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(image_file, arcname=os.path.basename(image_file))
        f.write(image_file, arcname=os.path.basename(image_file).replace(".jpg", "2.jpg"))
    return path
@pytest.fixture(scope="session")
def data_dir_with_hidden_files(tmp_path_factory):
    """Create a data directory with visible train/test files plus a hidden file and a hidden subdir."""
    data_dir = tmp_path_factory.mktemp("data_dir")

    (data_dir / "subdir").mkdir()
    with open(data_dir / "subdir" / "train.txt", "w") as f:
        f.write("foo\n" * 10)
    with open(data_dir / "subdir" / "test.txt", "w") as f:
        f.write("bar\n" * 10)
    # hidden file (dot-prefixed) inside the visible subdir
    with open(data_dir / "subdir" / ".test.txt", "w") as f:
        f.write("bar\n" * 10)
    # hidden directory (dot-prefixed) with the same train/test layout
    (data_dir / ".subdir").mkdir()
    with open(data_dir / ".subdir" / "train.txt", "w") as f:
        f.write("foo\n" * 10)
    with open(data_dir / ".subdir" / "test.txt", "w") as f:
        f.write("bar\n" * 10)

    # NOTE(review): the chunk ends here in the extraction; returning the root dir reconstructed — confirm
    return data_dir