datasets
98 lines · 3.7 KB
1import json
2import os
3import tempfile
4
5import datasets
6from utils import generate_example_dataset, get_duration
7
8
# Benchmark sizes: the full run, and a smaller run for the slower access patterns.
SPEED_TEST_N_EXAMPLES = 50_000
SMALL_TEST = 5_000

# Timings are written next to this script, under results/<script-name>.json.
RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
_results_json_name = RESULTS_FILENAME.replace(".py", ".json")
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", _results_json_name)
14
15
@get_duration
def read(dataset: datasets.Dataset, length):
    """Time sequential single-row access over the first `length` examples."""
    idx = 0
    while idx < length:
        _ = dataset[idx]
        idx += 1
20
21
@get_duration
def read_batch(dataset: datasets.Dataset, length, batch_size):
    """Time slice access in chunks of `batch_size` rows.

    Fix: iterate over the first `length` rows as the parameter promises
    (and as read_formatted_batch already does) instead of len(dataset),
    which silently ignored the `length` argument. Every caller in this
    file passes length == len(dataset), so timings are unaffected.
    """
    for i in range(0, length, batch_size):
        _ = dataset[i : i + batch_size]
26
27
@get_duration
def read_formatted(dataset: datasets.Dataset, length, type):
    """Time sequential single-row access with an output format applied."""
    with dataset.formatted_as(type=type):
        idx = 0
        while idx < length:
            _ = dataset[idx]
            idx += 1
33
34
@get_duration
def read_formatted_batch(dataset: datasets.Dataset, length, batch_size, type):
    """Time slice access in chunks of `batch_size` rows with a format applied."""
    with dataset.formatted_as(type=type):
        starts = range(0, length, batch_size)
        for start in starts:
            _ = dataset[start : start + batch_size]
40
41
def benchmark_iterating():
    """Benchmark row/batch/formatted access speed on a generated dataset.

    Builds a 50k-example Arrow dataset (one float column, one length-100
    float sequence column) in a temporary directory, times a series of
    read patterns before and after shuffling, and writes the timings as
    JSON to RESULTS_FILE_PATH.
    """
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    # (function, kwargs) pairs timed on the contiguous (unshuffled) dataset.
    functions = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted, {"type": "pandas", "length": SMALL_TEST}),
        (read_formatted, {"type": "torch", "length": SMALL_TEST}),
        (read_formatted, {"type": "tensorflow", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]

    # A smaller subset re-timed after shuffling, where row access is non-contiguous.
    functions_shuffled = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features(
            {"list": datasets.Sequence(datasets.Value("float32")), "numbers": datasets.Value("float32")}
        )
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100,)},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            # Key is e.g. "read_batch 50000 10" so each configuration is distinct.
            times[func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        # Fix: the original message had an unbalanced parenthesis.
        print("Second set of iterations (after shuffling)")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(
                dataset, **kwargs
            )

    # Robustness: make sure the results directory exists before writing.
    os.makedirs(os.path.dirname(RESULTS_FILE_PATH), exist_ok=True)
    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
95
96
if __name__ == "__main__":
    # Direct entry point — handy when running under a profiler.
    benchmark_iterating()
99