# Benchmark script for the `datasets` library (map / filter speed tests).
1import json2import os3import tempfile4
5import transformers6
7import datasets8from utils import generate_example_dataset, get_duration9
10
# Number of rows in the synthetic dataset every benchmark below runs over.
SPEED_TEST_N_EXAMPLES = 500_000

# Timings are written next to this script, under ./results/<script-name>.json.
RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))
16
@get_duration
def map(dataset: datasets.Dataset, **kwargs):
    """Time a ``Dataset.map(**kwargs)`` call; the decorator returns the duration.

    NOTE: intentionally shadows the builtin ``map`` — the benchmark keys are
    derived from this short name.
    """
    dataset.map(**kwargs)
21
@get_duration
def filter(dataset: datasets.Dataset, **kwargs):
    """Time a ``Dataset.filter(**kwargs)`` call; the decorator returns the duration.

    NOTE: intentionally shadows the builtin ``filter`` — kept for the short
    benchmark-key name, mirroring ``map`` above.
    """
    dataset.filter(**kwargs)
26
def benchmark_map_filter():
    """Benchmark ``Dataset.map`` and ``Dataset.filter`` and dump timings as JSON.

    Builds a synthetic dataset of ``SPEED_TEST_N_EXAMPLES`` rows, times identity,
    no-op and fast-tokenizer ``map`` calls under the plain, numpy, pandas, torch
    and tensorflow output formats, plus one plain ``filter``, then writes the
    collected durations (seconds) to ``RESULTS_FILE_PATH``.
    """
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    with tempfile.TemporaryDirectory() as tmp_dir:
        features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")})
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
        )

        tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

        def tokenize(examples):
            # Tokenize a batch of texts with the fast (Rust) tokenizer.
            return tokenizer(examples["text"])

        def noop(batch):
            # Shared no-op mapper: isolates the per-format iteration overhead.
            return None

        times["map identity"] = map(dataset)

        times["map identity batched"] = map(dataset, batched=True)

        times["map no-op batched"] = map(dataset, function=noop, batched=True)

        with dataset.formatted_as(type="numpy"):
            times["map no-op batched numpy"] = map(dataset, function=noop, batched=True)

        with dataset.formatted_as(type="pandas"):
            times["map no-op batched pandas"] = map(dataset, function=noop, batched=True)

        with dataset.formatted_as(type="torch", columns="numbers"):
            times["map no-op batched pytorch"] = map(dataset, function=noop, batched=True)

        with dataset.formatted_as(type="tensorflow", columns="numbers"):
            times["map no-op batched tensorflow"] = map(dataset, function=noop, batched=True)

        times["map fast-tokenizer batched"] = map(dataset, function=tokenize, batched=True)

        times["filter"] = filter(dataset)

        # TODO: also benchmark the fast tokenizer under numpy formatting once
        # tokenizers accept batched numpy inputs.

    # The "results" directory is not checked in; create it so the write below
    # does not fail with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(RESULTS_FILE_PATH), exist_ok=True)
    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
69
if __name__ == "__main__":  # useful to run the profiler (e.g. python -m cProfile)
    benchmark_map_filter()