---
# Some models have large dataset that doesn't fit in memory. Lower the batch
# size to test the accuracy.
batch_size:
  training:
    demucs: 4
    dlrm: 1024
    densenet121: 4
    hf_Reformer: 4
    hf_T5_base: 4
    timm_efficientdet: 1
    llama_v2_7b_16h: 1
    # reduced from 16 due to cudagraphs OOM in TorchInductor dashboard
    yolov3: 8

  inference:
    timm_efficientdet: 32


# Models whose batch size must never be altered by the harness.
dont_change_batch_size:
  - demucs
  - pytorch_struct
  - pyhpc_turbulent_kinetic_energy
  # https://github.com/pytorch/benchmark/pull/1656
  - vision_maskrcnn


# Accuracy-comparison tolerances, grouped by how loose the check must be.
tolerance:
  # Need lower tolerance on GPU. GPU kernels have non-deterministic kernels for these models.
  higher:
    - alexnet
    - attention_is_all_you_need_pytorch
    - densenet121
    - hf_Albert
    - vgg16
    - mobilenet_v3_large
    - nvidia_deeprecommender
    - timm_efficientdet

  # These models need >1e-3 tolerance
  even_higher:
    - soft_actor_critic
    - tacotron2

  higher_fp16:
    - doctr_reco_predictor
    - drq
    - hf_Whisper

  higher_bf16:
    - doctr_reco_predictor
    - drq
    - hf_Whisper

  # Models compared with cosine similarity instead of elementwise tolerance.
  cosine: []


# These benchmarks took >600s on an i9-11900K CPU
very_slow: &VERY_SLOW_MODELS
  # 3339s
  - hf_BigBird
  # 3062s
  - hf_Longformer
  # 930s
  - hf_T5


# These benchmarks took >60s on an i9-11900K CPU
# (the alias pulls in every very_slow model as well)
slow:
  - *VERY_SLOW_MODELS
  # 137s
  - BERT_pytorch
  # 116s
  - demucs
  # 242s
  - fastNLP_Bert
  # 221s
  - hf_Albert
  # 400s
  - hf_Bart
  # 334s
  - hf_Bert
  # 187s
  - hf_DistilBert
  # 470s
  - hf_GPT2
  # 141s
  - hf_Reformer
  # 317s
  - speech_transformer
  # 99s
  - vision_maskrcnn

non_deterministic:
  # https://github.com/pytorch/pytorch/issues/98355
  - mobilenet_v3_large

# Precision overrides applied when running under reduced-precision configs.
dtype:
  force_amp_for_fp16_bf16_models:
    - DALLE2_pytorch
    - doctr_det_predictor
    - doctr_reco_predictor
    - Super_SloMo
    - tts_angular
    - pyhpc_turbulent_kinetic_energy
    - detectron2_fcos_r_50_fpn

  force_fp16_for_bf16_models:
    - vision_maskrcnn

# models in canary_models that we should run anyway
canary_models:
  - torchrec_dlrm

# Anchor reused below (only_training, skip.test.training).
detectron2_models: &DETECTRON2_MODELS
  - detectron2_fasterrcnn_r_101_c4
  - detectron2_fasterrcnn_r_101_dc5
  - detectron2_fasterrcnn_r_101_fpn
  - detectron2_fasterrcnn_r_50_c4
  - detectron2_fasterrcnn_r_50_dc5
  - detectron2_fasterrcnn_r_50_fpn
  - detectron2_maskrcnn_r_101_c4
  - detectron2_maskrcnn_r_101_fpn
  - detectron2_maskrcnn_r_50_fpn

# These models support only train mode. So accuracy checking can't be done in
# eval mode.
only_training:
  - *DETECTRON2_MODELS
  - tts_angular
  - tacotron2
  - demucs
  - hf_Reformer
  - pytorch_struct
  - yolov3

# Models not yet supported on the TensorRT backend.
trt_not_yet_working:
  - alexnet
  - resnet18
  - resnet50
  - mobilenet_v2
  - mnasnet1_0
  - squeezenet1_1
  - shufflenetv2_x1_0
  - vgg16
  - resnext50_32x4d

# Models excluded from runs; sub-keys scope the exclusion (everywhere, per
# device, per test mode, etc.).
skip:
  all:
    # OOMs (A100 40G)
    - detectron2_maskrcnn
    # TIMEOUT, https://github.com/pytorch/pytorch/issues/98467
    - tacotron2
    # Failing in eager mode
    - hf_clip
    # multi gpu not always available in benchmark runners
    - simple_gpt_tp_manual

  device:
    cpu:
      # OOMs
      - hf_T5_generate
      # model is CUDA only
      - cm3leon_generate
      # timeout
      - nanogpt
      # timeout
      - sam
      # model is CUDA only
      - llama_v2_7b_16h
      # flaky
      - stable_diffusion
      # requires FBGEMM, CUDA only
      - torchrec_dlrm
      - simple_gpt
      # works on cuda, accuracy failure on cpu
      - hf_Whisper
      - stable_diffusion_text_encoder

    cuda: []

  test:
    training:
      - *DETECTRON2_MODELS
      # not designed for training
      - pyhpc_equation_of_state
      - pyhpc_isoneutral_mixing
      - pyhpc_turbulent_kinetic_energy
      - maml
      - llama
      - llama_v2_7b_16h
      - simple_gpt
      # Model's DEFAULT_TRAIN_BSIZE is not implemented
      - cm3leon_generate
      - hf_T5_generate
      - doctr_det_predictor
      - doctr_reco_predictor
      # doesn't fit in memory
      - phi_1_5
      - detectron2_fcos_r_50_fpn

    control_flow:
      - cm3leon_generate
      - detectron2_fcos_r_50_fpn
      - fastNLP_Bert
      - hf_Longformer
      - hf_Reformer
      - hf_T5_generate
      - opacus_cifar10
      - speech_transformer

  # Models that should only run in --multiprocess mode
  multiprocess:
    - simple_gpt

  # for these models, conv-batchnorm fusing causes big numerical churn.
  # Skip them
  freezing:
    - mnasnet1_0
    - moco
    - shufflenet_v2_x1_0

# Settings that apply only to accuracy-checking runs.
accuracy:
  skip:
    large_models:
      # Models too large to have eager, dynamo and fp64_numbers simultaneously
      # even for 40 GB machine. We have tested accuracy for smaller version of
      # these models
      - hf_GPT2_large
      - hf_T5_large
      - timm_vision_transformer_large
      # accuracy https://github.com/pytorch/pytorch/issues/93847
      - maml
      - llama_v2_7b_16h
      - Background_Matting
      - stable_diffusion_unet
    eager_not_deterministic:
      # Models that deterministic algorithms can not be turned on for eager mode.
      - Background_Matting

  max_batch_size:
    hf_GPT2: 2
    pytorch_unet: 2