20
#include <emscripten.h>
25
#include "datareader.h"
33
class DataReaderFromEmpty : public ncnn::DataReader
36
virtual int scan(const char* format, void* p) const
40
virtual size_t read(void* buf, size_t size) const
47
static int g_warmup_loop_count = 8;
48
static int g_loop_count = 4;
49
static bool g_enable_cooling_down = true;
51
static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
52
static ncnn::PoolAllocator g_workspace_pool_allocator;
55
static ncnn::VulkanDevice* g_vkdev = 0;
56
static ncnn::VkAllocator* g_blob_vkallocator = 0;
57
static ncnn::VkAllocator* g_staging_vkallocator = 0;
60
// Load the model named `comment` (param file from MODEL_DIR when fixed_path
// is true, otherwise `comment` is used as the literal param path), run it on
// the given input tensors and print min/max/avg latency in milliseconds over
// g_loop_count timed runs, after g_warmup_loop_count untimed warmup runs.
void benchmark(const char* comment, const std::vector<ncnn::Mat>& _in, const ncnn::Option& opt, bool fixed_path = true)
{
    ncnn::Net net;

    net.opt = opt;

    // Reset pooled buffers so each model starts from a clean slate.
    g_blob_pool_allocator.clear();
    g_workspace_pool_allocator.clear();

    if (opt.use_vulkan_compute)
    {
        g_blob_vkallocator->clear();
        g_staging_vkallocator->clear();
    }

    if (net.opt.use_vulkan_compute)
    {
        net.set_vulkan_device(g_vkdev);
    }

// Model files are exposed to the wasm runtime under this mount point
// (mounted from the host working directory in main()).
#define MODEL_DIR "/working/"

    if (fixed_path)
    {
        char parampath[256];
        sprintf(parampath, MODEL_DIR "%s.param", comment);
        net.load_param(parampath);
    }
    else
    {
        net.load_param(comment);
    }

    // Weights are irrelevant for a speed measurement - feed zeros.
    DataReaderFromEmpty dr;
    net.load_model(dr);

    const std::vector<const char*>& input_names = net.input_names();
    const std::vector<const char*>& output_names = net.output_names();

    if (g_enable_cooling_down)
    {
        // Sleep 10 seconds so the SoC cools down between models.
        ncnn::sleep(10 * 1000);
    }

    if (input_names.size() > _in.size())
    {
        fprintf(stderr, "input %ld tensors while model has %ld inputs\n", _in.size(), input_names.size());
        return;
    }

    // Fill the inputs with a constant.  NOTE(review): ncnn::Mat copies are
    // refcounted shallow copies, so fill() presumably writes through to the
    // caller's buffers - confirm against ncnn::Mat semantics.
    for (size_t j = 0; j < input_names.size(); ++j)
    {
        ncnn::Mat in = _in[j];
        in.fill(0.01f);
    }

    // Warm up: untimed runs to populate caches/allocators.
    for (int i = 0; i < g_warmup_loop_count; i++)
    {
        ncnn::Extractor ex = net.create_extractor();
        for (size_t j = 0; j < input_names.size(); ++j)
        {
            ncnn::Mat in = _in[j];
            ex.input(input_names[j], in);
        }

        for (size_t j = 0; j < output_names.size(); ++j)
        {
            ncnn::Mat out;
            ex.extract(output_names[j], out);
        }
    }

    double time_min = DBL_MAX;
    double time_max = -DBL_MAX;
    double time_avg = 0;

    for (int i = 0; i < g_loop_count; i++)
    {
        double start = ncnn::get_current_time();

        // Scope the extractor so its teardown is inside the timed region,
        // matching one full inference round-trip.
        {
            ncnn::Extractor ex = net.create_extractor();
            for (size_t j = 0; j < input_names.size(); ++j)
            {
                ncnn::Mat in = _in[j];
                ex.input(input_names[j], in);
            }

            for (size_t j = 0; j < output_names.size(); ++j)
            {
                ncnn::Mat out;
                ex.extract(output_names[j], out);
            }
        }

        double end = ncnn::get_current_time();

        double time = end - start;

        time_min = std::min(time_min, time);
        time_max = std::max(time_max, time);
        time_avg += time;
    }

    time_avg /= g_loop_count;

    fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg);
}
// Convenience overload: benchmark a model that takes a single input tensor.
// Wraps the tensor in a vector and forwards to the multi-input overload.
void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& opt, bool fixed_path = true)
{
    std::vector<ncnn::Mat> inputs;
    inputs.push_back(_in);
    return benchmark(comment, inputs, opt, fixed_path);
}
// Print command-line help to stderr.
// NOTE(review): the definition line was lost in this mangled copy; the name
// `show_usage` is reconstructed - confirm against the -h handler in main().
static void show_usage()
{
    fprintf(stderr, "Usage: benchncnn [loop count] [num threads] [powersave] [gpu device] [cooling down] [(key=value)...]\n");
    fprintf(stderr, " param=model.param\n");
    fprintf(stderr, " shape=[227,227,3],...\n");
}
static std::vector<ncnn::Mat> parse_shape_list(char* s)
195
std::vector<std::vector<int> > shapes;
196
std::vector<ncnn::Mat> mats;
198
char* pch = strtok(s, "[]");
204
int nscan = sscanf(pch, "%d%n", &v, &nconsumed);
213
nscan = sscanf(pch, ",%d%n", &v, &nconsumed);
220
nscan = sscanf(pch, ",%d%n", &v, &nconsumed);
227
pch = strtok(NULL, "[]");
230
for (size_t i = 0; i < shapes.size(); ++i)
232
const std::vector<int>& shape = shapes[i];
233
switch (shape.size())
236
mats.push_back(ncnn::Mat(shape[0], shape[1], shape[2], shape[3]));
239
mats.push_back(ncnn::Mat(shape[0], shape[1], shape[2]));
242
mats.push_back(ncnn::Mat(shape[0], shape[1]));
245
mats.push_back(ncnn::Mat(shape[0]));
248
fprintf(stderr, "unsupported input shape size %ld\n", shape.size());
255
int main(int argc, char** argv)
258
int num_threads = ncnn::get_physical_big_cpu_count();
261
int cooling_down = 1;
263
std::vector<ncnn::Mat> inputs;
265
for (int i = 1; i < argc; i++)
267
if (argv[i][0] == '-' && argv[i][1] == 'h')
273
if (strcmp(argv[i], "--help") == 0)
282
loop_count = atoi(argv[1]);
286
num_threads = atoi(argv[2]);
290
powersave = atoi(argv[3]);
294
gpu_device = atoi(argv[4]);
298
cooling_down = atoi(argv[5]);
301
for (int i = 6; i < argc; i++)
306
char* eqs = strchr(kv, '=');
309
fprintf(stderr, "unrecognized arg %s\n", kv);
315
const char* key = kv;
316
char* value = eqs + 1;
318
if (strcmp(key, "param") == 0)
320
if (strcmp(key, "shape") == 0)
321
inputs = parse_shape_list(value);
324
if (model && inputs.empty())
326
fprintf(stderr, "input tensor shape empty!\n");
332
FS.mkdir('/working');
333
FS.mount(NODEFS, {root: '.'}, '/working'););
336
bool use_vulkan_compute = gpu_device != -1;
338
g_enable_cooling_down = cooling_down != 0;
340
g_loop_count = loop_count;
342
g_blob_pool_allocator.set_size_compare_ratio(0.f);
343
g_workspace_pool_allocator.set_size_compare_ratio(0.f);
346
if (use_vulkan_compute)
348
g_warmup_loop_count = 10;
350
g_vkdev = ncnn::get_gpu_device(gpu_device);
352
g_blob_vkallocator = new ncnn::VkBlobAllocator(g_vkdev);
353
g_staging_vkallocator = new ncnn::VkStagingAllocator(g_vkdev);
357
ncnn::set_cpu_powersave(powersave);
359
ncnn::set_omp_dynamic(0);
360
ncnn::set_omp_num_threads(num_threads);
364
opt.lightmode = true;
365
opt.num_threads = num_threads;
366
opt.blob_allocator = &g_blob_pool_allocator;
367
opt.workspace_allocator = &g_workspace_pool_allocator;
369
opt.blob_vkallocator = g_blob_vkallocator;
370
opt.workspace_vkallocator = g_blob_vkallocator;
371
opt.staging_vkallocator = g_staging_vkallocator;
373
opt.use_winograd_convolution = true;
374
opt.use_sgemm_convolution = true;
375
opt.use_int8_inference = true;
376
opt.use_vulkan_compute = use_vulkan_compute;
377
opt.use_fp16_packed = true;
378
opt.use_fp16_storage = true;
379
opt.use_fp16_arithmetic = true;
380
opt.use_int8_storage = true;
381
opt.use_int8_arithmetic = true;
382
opt.use_packing_layout = true;
383
opt.use_shader_pack8 = false;
384
opt.use_image_storage = false;
386
fprintf(stderr, "loop_count = %d\n", g_loop_count);
387
fprintf(stderr, "num_threads = %d\n", num_threads);
388
fprintf(stderr, "powersave = %d\n", ncnn::get_cpu_powersave());
389
fprintf(stderr, "gpu_device = %d\n", gpu_device);
390
fprintf(stderr, "cooling_down = %d\n", (int)g_enable_cooling_down);
395
benchmark(model, inputs, opt, false);
400
benchmark("squeezenet", ncnn::Mat(227, 227, 3), opt);
402
benchmark("squeezenet_int8", ncnn::Mat(227, 227, 3), opt);
404
benchmark("mobilenet", ncnn::Mat(224, 224, 3), opt);
406
benchmark("mobilenet_int8", ncnn::Mat(224, 224, 3), opt);
408
benchmark("mobilenet_v2", ncnn::Mat(224, 224, 3), opt);
412
benchmark("mobilenet_v3", ncnn::Mat(224, 224, 3), opt);
414
benchmark("shufflenet", ncnn::Mat(224, 224, 3), opt);
416
benchmark("shufflenet_v2", ncnn::Mat(224, 224, 3), opt);
418
benchmark("mnasnet", ncnn::Mat(224, 224, 3), opt);
420
benchmark("proxylessnasnet", ncnn::Mat(224, 224, 3), opt);
422
benchmark("efficientnet_b0", ncnn::Mat(224, 224, 3), opt);
424
benchmark("efficientnetv2_b0", ncnn::Mat(224, 224, 3), opt);
426
benchmark("regnety_400m", ncnn::Mat(224, 224, 3), opt);
428
benchmark("blazeface", ncnn::Mat(128, 128, 3), opt);
430
benchmark("googlenet", ncnn::Mat(224, 224, 3), opt);
432
benchmark("googlenet_int8", ncnn::Mat(224, 224, 3), opt);
434
benchmark("resnet18", ncnn::Mat(224, 224, 3), opt);
436
benchmark("resnet18_int8", ncnn::Mat(224, 224, 3), opt);
438
benchmark("alexnet", ncnn::Mat(227, 227, 3), opt);
440
benchmark("vgg16", ncnn::Mat(224, 224, 3), opt);
442
benchmark("vgg16_int8", ncnn::Mat(224, 224, 3), opt);
444
benchmark("resnet50", ncnn::Mat(224, 224, 3), opt);
446
benchmark("resnet50_int8", ncnn::Mat(224, 224, 3), opt);
448
benchmark("squeezenet_ssd", ncnn::Mat(300, 300, 3), opt);
450
benchmark("squeezenet_ssd_int8", ncnn::Mat(300, 300, 3), opt);
452
benchmark("mobilenet_ssd", ncnn::Mat(300, 300, 3), opt);
454
benchmark("mobilenet_ssd_int8", ncnn::Mat(300, 300, 3), opt);
456
benchmark("mobilenet_yolo", ncnn::Mat(416, 416, 3), opt);
458
benchmark("mobilenetv2_yolov3", ncnn::Mat(352, 352, 3), opt);
460
benchmark("yolov4-tiny", ncnn::Mat(416, 416, 3), opt);
462
benchmark("nanodet_m", ncnn::Mat(320, 320, 3), opt);
464
benchmark("yolo-fastest-1.1", ncnn::Mat(320, 320, 3), opt);
466
benchmark("yolo-fastestv2", ncnn::Mat(352, 352, 3), opt);
468
benchmark("vision_transformer", ncnn::Mat(384, 384, 3), opt);
470
benchmark("FastestDet", ncnn::Mat(352, 352, 3), opt);
473
delete g_blob_vkallocator;
474
delete g_staging_vkallocator;