# Benchmark script for the `datasets` library (map / filter speed tests).
1import json2import os3import tempfile4
5import transformers6
7import datasets8from utils import generate_example_dataset, get_duration9
10
# Number of rows in the synthetic dataset every benchmark below runs over.
SPEED_TEST_N_EXAMPLES = 500_000

# Timings are written next to this script, under ./results/<script-name>.json.
RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))
16
@get_duration
def map(dataset: datasets.Dataset, **kwargs):
    """Time a ``Dataset.map(**kwargs)`` call; the decorator returns the duration.

    NOTE: intentionally shadows the builtin ``map`` — the benchmark keys are
    derived from this short name.
    """
    dataset.map(**kwargs)
21
@get_duration
def filter(dataset: datasets.Dataset, **kwargs):
    """Time a ``Dataset.filter(**kwargs)`` call; the decorator returns the duration.

    NOTE: intentionally shadows the builtin ``filter`` — kept for the short
    benchmark-key name, mirroring ``map`` above.
    """
    dataset.filter(**kwargs)
26
def benchmark_map_filter():
    """Benchmark ``Dataset.map`` and ``Dataset.filter`` and dump timings as JSON.

    Builds a synthetic dataset of ``SPEED_TEST_N_EXAMPLES`` rows, times identity,
    no-op and fast-tokenizer ``map`` calls under the plain, numpy, pandas, torch
    and tensorflow output formats, plus one plain ``filter``, then writes the
    collected durations (seconds) to ``RESULTS_FILE_PATH``.
    """
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    with tempfile.TemporaryDirectory() as tmp_dir:
        features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")})
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
        )

        tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

        def tokenize(examples):
            # Tokenize a batch of texts with the fast (Rust) tokenizer.
            return tokenizer(examples["text"])

        def noop(batch):
            # Shared no-op mapper: isolates the per-format iteration overhead.
            return None

        times["map identity"] = map(dataset)

        times["map identity batched"] = map(dataset, batched=True)

        times["map no-op batched"] = map(dataset, function=noop, batched=True)

        with dataset.formatted_as(type="numpy"):
            times["map no-op batched numpy"] = map(dataset, function=noop, batched=True)

        with dataset.formatted_as(type="pandas"):
            times["map no-op batched pandas"] = map(dataset, function=noop, batched=True)

        with dataset.formatted_as(type="torch", columns="numbers"):
            times["map no-op batched pytorch"] = map(dataset, function=noop, batched=True)

        with dataset.formatted_as(type="tensorflow", columns="numbers"):
            times["map no-op batched tensorflow"] = map(dataset, function=noop, batched=True)

        times["map fast-tokenizer batched"] = map(dataset, function=tokenize, batched=True)

        times["filter"] = filter(dataset)

        # TODO: also benchmark the fast tokenizer under numpy formatting once
        # tokenizers accept batched numpy inputs.

    # The "results" directory is not checked in; create it so the write below
    # does not fail with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(RESULTS_FILE_PATH), exist_ok=True)
    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
69
if __name__ == "__main__":  # useful to run the profiler (e.g. python -m cProfile)
    benchmark_map_filter()