// Tencent is pleased to support the open source community by making ncnn available.
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
// https://opensource.org/licenses/BSD-3-Clause
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
31
// File-scope PRNG state shared by the random helpers below: it is seeded via
// prng_srand(seed, &g_prng_rand_state) and consumed via prng_rand(&g_prng_rand_state),
// giving deterministic, reproducible random sequences for the tests.
static struct prng_rand_t g_prng_rand_state;
35
prng_srand(seed, &g_prng_rand_state);
40
return prng_rand(&g_prng_rand_state);
43
float RandomFloat(float a, float b)
45
float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX;
47
float r = random * diff;
49
// generate denormal as zero
50
if (v < 0.0001 && v > -0.0001)
55
int RandomInt(int a, int b)
57
float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX;
59
float r = random * diff;
65
return (signed char)RandomInt(-127, 127);
68
void Randomize(ncnn::Mat& m, float a, float b)
70
for (size_t i = 0; i < m.total(); i++)
72
m[i] = RandomFloat(a, b);
76
void RandomizeInt(ncnn::Mat& m, int a, int b)
78
for (size_t i = 0; i < m.total(); i++)
80
((int*)m)[i] = RandomInt(a, b);
84
void RandomizeS8(ncnn::Mat& m)
86
for (size_t i = 0; i < m.total(); i++)
88
((signed char*)m)[i] = RandomS8();
92
ncnn::Mat RandomMat(int w, float a, float b)
99
ncnn::Mat RandomMat(int w, int h, float a, float b)
106
ncnn::Mat RandomMat(int w, int h, int c, float a, float b)
108
ncnn::Mat m(w, h, c);
113
ncnn::Mat RandomMat(int w, int h, int d, int c, float a, float b)
115
ncnn::Mat m(w, h, d, c);
120
ncnn::Mat RandomIntMat(int w)
127
ncnn::Mat RandomIntMat(int w, int h)
134
ncnn::Mat RandomIntMat(int w, int h, int c)
136
ncnn::Mat m(w, h, c);
141
ncnn::Mat RandomIntMat(int w, int h, int d, int c)
143
ncnn::Mat m(w, h, d, c);
148
ncnn::Mat RandomS8Mat(int w)
150
ncnn::Mat m(w, (size_t)1u);
155
ncnn::Mat RandomS8Mat(int w, int h)
157
ncnn::Mat m(w, h, (size_t)1u);
162
ncnn::Mat RandomS8Mat(int w, int h, int c)
164
ncnn::Mat m(w, h, c, (size_t)1u);
169
ncnn::Mat RandomS8Mat(int w, int h, int d, int c)
171
ncnn::Mat m(w, h, d, c, (size_t)1u);
176
ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx)
178
ncnn::Mat weight_scales(m);
179
for (int i = 0; i < m; ++i)
181
float min = mat[0], _max = mat[0];
182
const float* ptr = (const float*)(mat.data) + i * ldx;
183
for (int j = 0; j < k; ++j)
194
const float abs_min = abs(min), abs_max = abs(_max);
195
weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max);
197
return weight_scales;
200
bool NearlyEqual(float a, float b, float epsilon)
205
float diff = (float)fabs(a - b);
210
return diff < epsilon * std::max(fabs(a), fabs(b));
213
int Compare(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon)
215
#define CHECK_MEMBER(m) \
218
fprintf(stderr, #m " not match expect %d but got %d\n", (int)a.m, (int)b.m); \
227
CHECK_MEMBER(elemsize)
228
CHECK_MEMBER(elempack)
232
for (int q = 0; q < a.c; q++)
234
const ncnn::Mat ma = a.channel(q);
235
const ncnn::Mat mb = b.channel(q);
236
for (int z = 0; z < a.d; z++)
238
const ncnn::Mat da = ma.depth(z);
239
const ncnn::Mat db = mb.depth(z);
240
for (int i = 0; i < a.h; i++)
242
const float* pa = da.row(i);
243
const float* pb = db.row(i);
244
for (int j = 0; j < a.w; j++)
246
if (!NearlyEqual(pa[j], pb[j], epsilon))
248
fprintf(stderr, "value not match at c:%d d:%d h:%d w:%d expect %f but got %f\n", q, z, i, j, pa[j], pb[j]);
259
int CompareMat(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon)
267
ncnn::convert_packing(a, a1, 1, opt);
268
return CompareMat(a1, b, epsilon);
274
ncnn::convert_packing(b, b1, 1, opt);
275
return CompareMat(a, b1, epsilon);
278
if (a.elemsize == 2u)
281
cast_float16_to_float32(a, a32, opt);
282
return CompareMat(a32, b, epsilon);
284
if (a.elemsize == 1u)
287
cast_int8_to_float32(a, a32, opt);
288
return CompareMat(a32, b, epsilon);
291
if (b.elemsize == 2u)
294
cast_float16_to_float32(b, b32, opt);
295
return CompareMat(a, b32, epsilon);
297
if (b.elemsize == 1u)
300
cast_int8_to_float32(b, b32, opt);
301
return CompareMat(a, b32, epsilon);
304
return Compare(a, b, epsilon);
307
int CompareMat(const std::vector<ncnn::Mat>& a, const std::vector<ncnn::Mat>& b, float epsilon)
309
if (a.size() != b.size())
311
fprintf(stderr, "output blob count not match %zu %zu\n", a.size(), b.size());
315
for (size_t i = 0; i < a.size(); i++)
317
if (CompareMat(a[i], b[i], epsilon))
319
fprintf(stderr, "output blob %zu not match\n", i);
327
static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const ncnn::Option& opt, const ncnn::Layer* op, int flag)
332
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
334
ncnn::cast_float32_to_float16(a, a4, opt);
339
if (opt.use_fp16_storage && !opt.use_bf16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
341
ncnn::cast_float32_to_float16(a, a4, opt);
346
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
348
ncnn::cast_float32_to_float16(a, a4, opt);
353
if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
355
ncnn::cast_float32_to_bfloat16(a, a4, opt);
359
if (opt.use_fp16_storage && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
361
ncnn::cast_float32_to_float16(a, a4, opt);
370
if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING))
372
// resolve dst_elempack
375
if (dims == 1) elemcount = a4.elempack * a4.w;
376
if (dims == 2) elemcount = a4.elempack * a4.h;
377
if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c;
379
int elembits = a4.elembits();
381
int dst_elempack = 1;
386
if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512())
388
else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
390
else if (elemcount % 4 == 0)
393
if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
395
else if (elemcount % 4 == 0)
398
const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8);
399
if (elemcount % packn == 0)
400
dst_elempack = packn;
402
if (elemcount % 4 == 0)
409
if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
411
else if (elemcount % 4 == 0)
414
const int packn = ncnn::cpu_riscv_vlenb() / 2;
415
if (elemcount % packn == 0)
416
dst_elempack = packn;
418
if (elemcount % 4 == 0)
425
const int packn = ncnn::cpu_riscv_vlenb() / 1;
426
if (elemcount % packn == 0)
427
dst_elempack = packn;
429
if (elemcount % 8 == 0)
434
if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8)
438
ncnn::convert_packing(a4, a4_packed, dst_elempack, opt);
445
static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const ncnn::Option& opt, const ncnn::Layer* op, int flag)
447
ncnn::Mat c4_unpacked;
448
if (c4.elempack != 1)
450
ncnn::convert_packing(c4, c4_unpacked, 1, opt);
460
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
462
ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
467
if (opt.use_fp16_storage && !opt.use_bf16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
469
ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
474
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
476
ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
481
if (opt.use_bf16_storage && op->support_bf16_storage && c4_unpacked.elembits() == 16)
483
ncnn::cast_bfloat16_to_float32(c4_unpacked, c, opt);
487
if (opt.use_fp16_storage && op->support_fp16_storage && c4_unpacked.elembits() == 16)
489
ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
501
int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& b, void (*func)(ncnn::Layer*), int flag)
503
ncnn::Layer* op = ncnn::create_layer_naive(typeindex);
507
(*func)((ncnn::Layer*)op);
512
if (op->one_blob_only && a.size() != 1)
514
fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
519
ncnn::ModelBinFromMatArray mb(weights.data());
525
opt.lightmode = false;
526
opt.use_packing_layout = false;
527
opt.use_fp16_packed = false;
528
opt.use_fp16_storage = false;
529
opt.use_fp16_arithmetic = false;
530
opt.use_shader_pack8 = false;
531
opt.use_image_storage = false;
532
opt.use_bf16_storage = false;
533
opt.use_vulkan_compute = false;
535
op->create_pipeline(opt);
537
b.resize(top_blob_count);
539
if (op->support_inplace)
541
for (size_t i = 0; i < a.size(); i++)
546
op->forward_inplace(b, opt);
550
op->forward(a, b, opt);
553
op->destroy_pipeline(opt);
560
int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& c, const std::vector<ncnn::Mat>& top_shapes, void (*func)(ncnn::Layer*), int flag)
562
ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
564
if (!op->support_packing && _opt.use_packing_layout)
569
if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
577
(*func)((ncnn::Layer*)op);
580
if (!top_shapes.empty())
582
op->bottom_shapes = a;
583
op->top_shapes = top_shapes;
588
if (op->one_blob_only && a.size() != 1)
590
fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
595
ncnn::ModelBinFromMatArray mb(weights.data());
599
ncnn::Option opt = _opt;
601
opt.use_vulkan_compute = false;
603
op->create_pipeline(opt);
605
if (!op->support_packing && _opt.use_packing_layout)
607
op->destroy_pipeline(opt);
611
if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
613
op->destroy_pipeline(opt);
618
std::vector<ncnn::Mat> a4(a.size());
620
for (size_t i = 0; i < a4.size(); i++)
622
convert_to_optimal_layout(a[i], a4[i], opt, op, flag);
625
c.resize(top_blob_count);
627
if (op->support_inplace)
629
for (size_t i = 0; i < a4.size(); i++)
631
c[i] = a4[i].clone();
634
op->forward_inplace(c, opt);
638
op->forward(a4, c, opt);
641
for (size_t i = 0; i < c.size(); i++)
643
convert_to_vanilla_layout(c[i], c[i], opt, op, flag);
646
op->destroy_pipeline(opt);
654
int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& d, const std::vector<ncnn::Mat>& top_shapes, void (*func)(ncnn::Layer*), int flag)
656
if (!_opt.use_packing_layout)
658
// pack1 test is useless for gpu
662
ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex);
670
if (!op->support_vulkan)
676
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
682
(*func)((ncnn::Layer*)op);
685
if (!top_shapes.empty())
687
op->bottom_shapes = a;
688
op->top_shapes = top_shapes;
691
if (op->one_blob_only && a.size() != 1)
693
fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
698
ncnn::ModelBinFromMatArray mb(weights.data());
702
ncnn::VkWeightAllocator g_weight_vkallocator(vkdev);
703
ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev);
705
ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
706
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
708
ncnn::Option opt = _opt;
710
opt.use_vulkan_compute = true;
713
opt.use_image_storage = false;
716
opt.blob_vkallocator = blob_vkallocator;
717
opt.workspace_vkallocator = blob_vkallocator;
718
opt.staging_vkallocator = staging_vkallocator;
720
if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
721
if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
722
if (!vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
723
if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
724
if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
725
if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
726
if (!vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
727
if (!vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
728
if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
730
// FIXME fp16a may produce large error
731
opt.use_fp16_arithmetic = false;
733
op->create_pipeline(opt);
735
if (!op->support_vulkan)
737
op->destroy_pipeline(opt);
743
ncnn::VkTransfer cmd(vkdev);
745
ncnn::Option opt_upload = opt;
746
opt_upload.blob_vkallocator = &g_weight_vkallocator;
747
opt_upload.workspace_vkallocator = &g_weight_vkallocator;
748
opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;
750
op->upload_model(cmd, opt_upload);
752
cmd.submit_and_wait();
755
d.resize(top_blob_count);
759
ncnn::VkCompute cmd(vkdev);
761
if (op->support_image_storage && opt.use_image_storage)
764
std::vector<ncnn::VkImageMat> a_gpu(a.size());
765
for (size_t i = 0; i < a_gpu.size(); i++)
767
cmd.record_upload(a[i], a_gpu[i], opt);
770
std::vector<ncnn::VkImageMat> d_gpu(top_blob_count);
771
if (op->support_inplace)
773
op->forward_inplace(a_gpu, cmd, opt);
779
op->forward(a_gpu, d_gpu, cmd, opt);
783
for (size_t i = 0; i < d_gpu.size(); i++)
785
cmd.record_download(d_gpu[i], d[i], opt);
791
std::vector<ncnn::VkMat> a_gpu(a.size());
792
for (size_t i = 0; i < a_gpu.size(); i++)
794
cmd.record_upload(a[i], a_gpu[i], opt);
797
std::vector<ncnn::VkMat> d_gpu(top_blob_count);
798
if (op->support_inplace)
800
op->forward_inplace(a_gpu, cmd, opt);
806
op->forward(a_gpu, d_gpu, cmd, opt);
810
for (size_t i = 0; i < d_gpu.size(); i++)
812
cmd.record_download(d_gpu[i], d[i], opt);
816
cmd.submit_and_wait();
819
op->destroy_pipeline(opt);
823
vkdev->reclaim_blob_allocator(blob_vkallocator);
824
vkdev->reclaim_staging_allocator(staging_vkallocator);
825
g_weight_vkallocator.clear();
826
g_weight_staging_vkallocator.clear();
832
int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, const std::vector<ncnn::Mat>& top_shapes, float epsilon, void (*func)(ncnn::Layer*), int flag)
835
std::vector<ncnn::Mat> b;
837
int ret = test_layer_naive(typeindex, pd, weights, a, top_blob_count, b, func, flag);
838
if (ret != 233 && ret != 0)
840
fprintf(stderr, "test_layer_naive failed\n");
847
std::vector<ncnn::Mat> c;
848
int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, std::vector<ncnn::Mat>(), func, flag);
849
if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
851
fprintf(stderr, "test_layer_cpu failed\n");
858
std::vector<ncnn::Mat> c;
859
int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, b, func, flag);
860
if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
862
fprintf(stderr, "test_layer_cpu failed with shape hint\n");
869
if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
871
std::vector<ncnn::Mat> d;
872
int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, std::vector<ncnn::Mat>(), func, flag);
873
if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
875
fprintf(stderr, "test_layer_gpu failed\n");
881
if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
883
std::vector<ncnn::Mat> d;
884
int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, b, func, flag);
885
if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
887
fprintf(stderr, "test_layer_gpu failed with shape hint\n");
896
int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(ncnn::Layer*), int flag)
898
ncnn::Layer* op = ncnn::create_layer_naive(typeindex);
902
(*func)((ncnn::Layer*)op);
907
ncnn::ModelBinFromMatArray mb(weights.data());
913
opt.lightmode = false;
914
opt.use_packing_layout = false;
915
opt.use_fp16_packed = false;
916
opt.use_fp16_storage = false;
917
opt.use_fp16_arithmetic = false;
918
opt.use_shader_pack8 = false;
919
opt.use_image_storage = false;
920
opt.use_bf16_storage = false;
921
opt.use_vulkan_compute = false;
923
op->create_pipeline(opt);
925
if (op->support_inplace)
928
op->forward_inplace(b, opt);
932
op->forward(a, b, opt);
935
op->destroy_pipeline(opt);
942
int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag)
944
ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
946
if (!op->support_packing && _opt.use_packing_layout)
951
if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
959
(*func)((ncnn::Layer*)op);
964
op->bottom_shapes.resize(1);
965
op->top_shapes.resize(1);
966
op->bottom_shapes[0] = a;
967
op->top_shapes[0] = top_shape;
972
ncnn::ModelBinFromMatArray mb(weights.data());
976
ncnn::Option opt = _opt;
978
opt.use_vulkan_compute = false;
980
op->create_pipeline(opt);
982
if (!op->support_packing && _opt.use_packing_layout)
984
op->destroy_pipeline(opt);
988
if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
990
op->destroy_pipeline(opt);
996
convert_to_optimal_layout(a, a4, opt, op, flag);
998
if (op->support_inplace)
1001
op->forward_inplace(c, opt);
1005
op->forward(a4, c, opt);
1008
convert_to_vanilla_layout(c, c, opt, op, flag);
1010
op->destroy_pipeline(opt);
1018
int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag)
1020
if (!_opt.use_packing_layout)
1022
// pack1 test is useless for gpu
1026
ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex);
1034
if (!op->support_vulkan)
1040
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
1046
(*func)((ncnn::Layer*)op);
1051
op->bottom_shapes.resize(1);
1052
op->top_shapes.resize(1);
1053
op->bottom_shapes[0] = a;
1054
op->top_shapes[0] = top_shape;
1057
ncnn::ModelBinFromMatArray mb(weights.data());
1061
ncnn::VkWeightAllocator g_weight_vkallocator(vkdev);
1062
ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev);
1064
ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
1065
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
1067
ncnn::Option opt = _opt;
1068
opt.num_threads = 1;
1069
opt.use_vulkan_compute = true;
1072
opt.use_image_storage = false;
1075
opt.blob_vkallocator = blob_vkallocator;
1076
opt.workspace_vkallocator = blob_vkallocator;
1077
opt.staging_vkallocator = staging_vkallocator;
1079
if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
1080
if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
1081
if (!vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
1082
if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
1083
if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
1084
if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
1085
if (!vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
1086
if (!vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
1087
if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
1089
// FIXME fp16a may produce large error
1090
opt.use_fp16_arithmetic = false;
1092
op->create_pipeline(opt);
1094
if (!op->support_vulkan)
1096
op->destroy_pipeline(opt);
1102
ncnn::VkTransfer cmd(vkdev);
1104
ncnn::Option opt_upload = opt;
1105
opt_upload.blob_vkallocator = &g_weight_vkallocator;
1106
opt_upload.workspace_vkallocator = &g_weight_vkallocator;
1107
opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;
1109
op->upload_model(cmd, opt_upload);
1111
cmd.submit_and_wait();
1116
ncnn::VkCompute cmd(vkdev);
1118
if (op->support_image_storage && opt.use_image_storage)
1121
ncnn::VkImageMat a_gpu;
1122
cmd.record_upload(a, a_gpu, opt);
1124
ncnn::VkImageMat d_gpu;
1125
if (op->support_inplace)
1127
op->forward_inplace(a_gpu, cmd, opt);
1133
op->forward(a_gpu, d_gpu, cmd, opt);
1137
cmd.record_download(d_gpu, d, opt);
1143
cmd.record_upload(a, a_gpu, opt);
1146
if (op->support_inplace)
1148
op->forward_inplace(a_gpu, cmd, opt);
1154
op->forward(a_gpu, d_gpu, cmd, opt);
1158
cmd.record_download(d_gpu, d, opt);
1161
cmd.submit_and_wait();
1164
op->destroy_pipeline(opt);
1168
vkdev->reclaim_blob_allocator(blob_vkallocator);
1169
vkdev->reclaim_staging_allocator(staging_vkallocator);
1170
g_weight_vkallocator.clear();
1171
g_weight_staging_vkallocator.clear();
1175
#endif // NCNN_VULKAN
1177
int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, const ncnn::Mat& top_shape, float epsilon, void (*func)(ncnn::Layer*), int flag)
1182
int ret = test_layer_naive(typeindex, pd, weights, a, b, func, flag);
1183
if (ret != 233 && ret != 0)
1185
fprintf(stderr, "test_layer_naive failed\n");
1193
int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, ncnn::Mat(), func, flag);
1194
if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
1196
fprintf(stderr, "test_layer_cpu failed\n");
1204
int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, b, func, flag);
1205
if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
1207
fprintf(stderr, "test_layer_cpu failed with shape hint\n");
1214
if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
1217
int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, ncnn::Mat(), func, flag);
1218
if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
1220
fprintf(stderr, "test_layer_gpu failed\n");
1226
if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
1229
int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, b, func, flag);
1230
if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
1232
fprintf(stderr, "test_layer_gpu failed with shape hint\n");
1236
#endif // NCNN_VULKAN
1241
int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& opt, const std::vector<ncnn::Mat>& a, int top_blob_count, float epsilon, void (*func)(ncnn::Layer*), int flag)
1243
// fp16 representation
1244
std::vector<ncnn::Mat> a_fp16;
1245
if (opt.use_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
1247
a_fp16.resize(a.size());
1248
for (size_t j = 0; j < a.size(); j++)
1251
ncnn::cast_float32_to_bfloat16(a[j], tmp, opt);
1252
ncnn::cast_bfloat16_to_float32(tmp, a_fp16[j], opt);
1255
else if ((opt.use_fp16_packed || opt.use_fp16_storage) && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
1257
a_fp16.resize(a.size());
1258
for (size_t j = 0; j < a.size(); j++)
1261
ncnn::cast_float32_to_float16(a[j], tmp, opt);
1262
ncnn::cast_float16_to_float32(tmp, a_fp16[j], opt);
1270
std::vector<ncnn::Mat> weights_fp16;
1272
if (opt.use_bf16_storage)
1274
weights_fp16.resize(weights.size());
1275
for (size_t j = 0; j < weights.size(); j++)
1277
if (weights[j].elembits() != 32)
1279
weights_fp16[j] = weights[j];
1284
ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
1285
ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
1287
epsilon_fp16 = epsilon * 100; // 0.1
1289
else if (opt.use_fp16_packed || opt.use_fp16_storage)
1291
weights_fp16.resize(weights.size());
1292
for (size_t j = 0; j < weights.size(); j++)
1294
if (weights[j].elembits() != 32)
1296
weights_fp16[j] = weights[j];
1301
ncnn::cast_float32_to_float16(weights[j], tmp, opt);
1302
ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
1304
epsilon_fp16 = epsilon * 100; // 0.1
1308
weights_fp16 = weights;
1309
epsilon_fp16 = epsilon;
1312
if (opt.use_fp16_arithmetic)
1314
epsilon_fp16 = epsilon * 1000; // 1.0
1317
std::vector<ncnn::Mat> top_shapes;
1318
int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_blob_count, top_shapes, epsilon_fp16, func, flag);
1321
fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution);
1328
int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& opt, const ncnn::Mat& a, float epsilon, void (*func)(ncnn::Layer*), int flag)
1330
// fp16 representation
1332
if (opt.use_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
1335
ncnn::cast_float32_to_bfloat16(a, tmp, opt);
1336
ncnn::cast_bfloat16_to_float32(tmp, a_fp16, opt);
1338
else if ((opt.use_fp16_packed || opt.use_fp16_storage) && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
1341
ncnn::cast_float32_to_float16(a, tmp, opt);
1342
ncnn::cast_float16_to_float32(tmp, a_fp16, opt);
1349
std::vector<ncnn::Mat> weights_fp16;
1351
if (opt.use_bf16_storage)
1353
weights_fp16.resize(weights.size());
1354
for (size_t j = 0; j < weights.size(); j++)
1356
if (weights[j].elembits() != 32)
1358
weights_fp16[j] = weights[j];
1363
ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
1364
ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
1366
epsilon_fp16 = epsilon * 100; // 0.1
1368
else if (opt.use_fp16_packed || opt.use_fp16_storage)
1370
weights_fp16.resize(weights.size());
1371
for (size_t j = 0; j < weights.size(); j++)
1373
if (weights[j].elembits() != 32)
1375
weights_fp16[j] = weights[j];
1380
ncnn::cast_float32_to_float16(weights[j], tmp, opt);
1381
ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
1383
epsilon_fp16 = epsilon * 100; // 0.1
1387
weights_fp16 = weights;
1388
epsilon_fp16 = epsilon;
1391
if (opt.use_fp16_arithmetic)
1393
epsilon_fp16 = epsilon * 1000; // 1.0
1396
ncnn::Mat top_shape;
1397
int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_shape, epsilon_fp16, func, flag);
1400
fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution);
1407
int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, float epsilon, void (*func)(ncnn::Layer*), int flag)
1409
// pack fp16p fp16s fp16a bf16s shader8 image
1410
const int options[][7] = {
1411
{0, 0, 0, 0, 0, 0, 0},
1412
{0, 0, 1, 0, 0, 0, 0},
1413
{0, 0, 1, 1, 1, 0, 0},
1414
{1, 0, 0, 0, 0, 0, 0},
1415
{1, 1, 0, 0, 1, 0, 0},
1416
{1, 0, 1, 0, 0, 1, 0},
1417
{1, 1, 1, 1, 0, 0, 0},
1418
{1, 1, 1, 1, 1, 1, 1},
1421
const int opt_count = sizeof(options) / sizeof(options[0]);
1423
for (int i = 0; i < opt_count; i++)
1426
opt.num_threads = 1;
1427
opt.use_packing_layout = options[i][0];
1428
opt.use_fp16_packed = options[i][1];
1429
opt.use_fp16_storage = options[i][2];
1430
opt.use_fp16_arithmetic = options[i][3];
1431
opt.use_bf16_storage = options[i][4];
1432
opt.use_shader_pack8 = options[i][5];
1433
opt.use_image_storage = options[i][6];
1435
int ret = test_layer_opt(layer_type, pd, weights, opt, a, top_blob_count, epsilon, func, flag);
1443
int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, float epsilon, void (*func)(ncnn::Layer*), int flag)
1445
// pack fp16p fp16s fp16a bf16s shader8 image
1446
const int options[][7] = {
1447
{0, 0, 0, 0, 0, 0, 0},
1448
{0, 0, 1, 0, 0, 0, 0},
1449
{0, 0, 1, 1, 1, 0, 0},
1450
{1, 0, 0, 0, 0, 0, 0},
1451
{1, 1, 0, 0, 1, 0, 0},
1452
{1, 0, 1, 0, 0, 1, 0},
1453
{1, 1, 1, 1, 0, 0, 0},
1454
{1, 1, 1, 1, 1, 1, 1},
1457
const int opt_count = sizeof(options) / sizeof(options[0]);
1459
for (int i = 0; i < opt_count; i++)
1462
opt.num_threads = 1;
1463
opt.use_packing_layout = options[i][0];
1464
opt.use_fp16_packed = options[i][1];
1465
opt.use_fp16_storage = options[i][2];
1466
opt.use_fp16_arithmetic = options[i][3];
1467
opt.use_bf16_storage = options[i][4];
1468
opt.use_shader_pack8 = options[i][5];
1469
opt.use_image_storage = options[i][6];
1471
int ret = test_layer_opt(layer_type, pd, weights, opt, a, epsilon, func, flag);
1479
class TestOOMAllocator : public ncnn::UnlockedPoolAllocator
1483
virtual void* fastMalloc(size_t size);
1484
virtual void fastFree(void* ptr);
1491
TestOOMAllocator::TestOOMAllocator()
1497
void* TestOOMAllocator::fastMalloc(size_t size)
1502
if (counter == failid)
1508
ptr = ncnn::UnlockedPoolAllocator::fastMalloc(size);
1517
void TestOOMAllocator::fastFree(void* ptr)
1521
ncnn::UnlockedPoolAllocator::fastFree(ptr);
1526
int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, int flag)
1528
int typeindex = ncnn::layer_to_index(layer_type);
1529
if (typeindex == -1)
1532
ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
1534
if (!op->support_packing && _opt.use_packing_layout)
1539
if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
1547
if (op->one_blob_only && a.size() != 1)
1549
fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
1554
ncnn::ModelBinFromMatArray mb(weights.data());
1558
ncnn::Option opt = _opt;
1559
opt.num_threads = 1;
1560
opt.use_vulkan_compute = false;
1562
op->create_pipeline(opt);
1564
if (!op->support_packing && _opt.use_packing_layout)
1566
op->destroy_pipeline(opt);
1570
if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
1572
op->destroy_pipeline(opt);
1577
std::vector<ncnn::Mat> a4(a.size());
1579
for (size_t i = 0; i < a4.size(); i++)
1581
convert_to_optimal_layout(a[i], a4[i], opt, op, flag);
1584
TestOOMAllocator test_oom_allocator;
1585
opt.blob_allocator = &test_oom_allocator;
1586
opt.workspace_allocator = &test_oom_allocator;
1588
std::vector<ncnn::Mat> c;
1589
c.resize(top_blob_count);
1591
if (op->support_inplace)
1593
for (size_t i = 0; i < a4.size(); i++)
1595
c[i] = a4[i].clone();
1598
op->forward_inplace(c, opt);
1602
op->forward(a4, c, opt);
1605
for (int i = 0; i < top_blob_count; i++)
1610
const int alloc_count = test_oom_allocator.counter;
1611
for (int i = 0; i < alloc_count; i++)
1613
test_oom_allocator.counter = 0;
1614
test_oom_allocator.failid = i;
1617
if (op->support_inplace)
1619
for (size_t i = 0; i < a4.size(); i++)
1621
c[i] = a4[i].clone();
1624
ret = op->forward_inplace(c, opt);
1628
ret = op->forward(a4, c, opt);
1631
for (int i = 0; i < top_blob_count; i++)
1638
fprintf(stderr, "oom not catched %d/%d\n", i, alloc_count);
1640
op->destroy_pipeline(opt);
1648
op->destroy_pipeline(opt);
1655
int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, int flag)
1657
int typeindex = ncnn::layer_to_index(layer_type);
1658
if (typeindex == -1)
1661
ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
1663
if (!op->support_packing && _opt.use_packing_layout)
1668
if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
1676
ncnn::ModelBinFromMatArray mb(weights.data());
1680
ncnn::Option opt = _opt;
1681
opt.num_threads = 1;
1682
opt.use_vulkan_compute = false;
1684
op->create_pipeline(opt);
1686
if (!op->support_packing && _opt.use_packing_layout)
1688
op->destroy_pipeline(opt);
1692
if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
1694
op->destroy_pipeline(opt);
1700
convert_to_optimal_layout(a, a4, opt, op, flag);
1702
TestOOMAllocator test_oom_allocator;
1703
opt.blob_allocator = &test_oom_allocator;
1704
opt.workspace_allocator = &test_oom_allocator;
1708
if (op->support_inplace)
1711
op->forward_inplace(c, opt);
1715
op->forward(a4, c, opt);
1720
const int alloc_count = test_oom_allocator.counter;
1721
for (int i = 0; i < alloc_count; i++)
1723
test_oom_allocator.counter = 0;
1724
test_oom_allocator.failid = i;
1727
if (op->support_inplace)
1730
ret = op->forward_inplace(c, opt);
1734
ret = op->forward(a4, c, opt);
1741
fprintf(stderr, "oom not catched %d/%d\n", i, alloc_count);
1743
op->destroy_pipeline(opt);
1751
op->destroy_pipeline(opt);
1758
int test_layer_oom(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, int flag)
1760
// pack fp16p fp16s fp16a bf16s shader8 image
1761
const int options[][7] = {
1762
{0, 0, 0, 0, 0, 0, 0},
1763
{0, 0, 1, 0, 0, 0, 0},
1764
{0, 0, 1, 1, 1, 0, 0},
1765
{1, 0, 0, 0, 0, 0, 0},
1766
{1, 1, 0, 0, 1, 0, 0},
1767
{1, 0, 1, 0, 0, 1, 0},
1768
{1, 1, 1, 1, 0, 0, 0},
1769
{1, 1, 1, 1, 1, 1, 1},
1772
const int opt_count = sizeof(options) / sizeof(options[0]);
1774
for (int i = 0; i < opt_count; i++)
1777
opt.num_threads = 1;
1778
opt.use_packing_layout = options[i][0];
1779
opt.use_fp16_packed = options[i][1];
1780
opt.use_fp16_storage = options[i][2];
1781
opt.use_fp16_arithmetic = options[i][3];
1782
opt.use_bf16_storage = options[i][4];
1783
opt.use_shader_pack8 = options[i][5];
1784
opt.use_image_storage = options[i][6];
1786
int ret = test_layer_oom_opt(layer_type, pd, weights, opt, a, top_blob_count, flag);
1787
if (ret != 233 && ret != 0)
1794
int test_layer_oom(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, int flag)
1796
// pack fp16p fp16s fp16a bf16s shader8 image
1797
const int options[][7] = {
1798
{0, 0, 0, 0, 0, 0, 0},
1799
{0, 0, 1, 0, 0, 0, 0},
1800
{0, 0, 1, 1, 1, 0, 0},
1801
{1, 0, 0, 0, 0, 0, 0},
1802
{1, 1, 0, 0, 1, 0, 0},
1803
{1, 0, 1, 0, 0, 1, 0},
1804
{1, 1, 1, 1, 0, 0, 0},
1805
{1, 1, 1, 1, 1, 1, 1},
1808
const int opt_count = sizeof(options) / sizeof(options[0]);
1810
for (int i = 0; i < opt_count; i++)
1813
opt.num_threads = 1;
1814
opt.use_packing_layout = options[i][0];
1815
opt.use_fp16_packed = options[i][1];
1816
opt.use_fp16_storage = options[i][2];
1817
opt.use_fp16_arithmetic = options[i][3];
1818
opt.use_bf16_storage = options[i][4];
1819
opt.use_shader_pack8 = options[i][5];
1820
opt.use_image_storage = options[i][6];
1822
int ret = test_layer_oom_opt(layer_type, pd, weights, opt, a, flag);
1823
if (ret != 233 && ret != 0)