// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Run the naive (reference) Cast layer: convert mat `a` stored as `type_from`
// into mat `b` stored as `type_to`.
// Type codes follow the Cast layer param convention (1=fp32, 2=fp16; the tests
// below also use 4 — presumably bfloat16, confirm against the Cast layer docs).
// Returns 0.
// NOTE(review): body reconstructed from a garbled extraction — braces, param
// setup, load and cleanup lines restored; confirm against upstream test_cast.cpp.
static int cast_cpu_naive(const ncnn::Mat& a, ncnn::Mat& b, int type_from, int type_to)
{
    ncnn::ParamDict pd;
    pd.set(0, type_from); // type_from
    pd.set(1, type_to);   // type_to

    std::vector<ncnn::Mat> weights(0); // Cast has no weights

    ncnn::Option opt;
    opt.num_threads = 1;

    ncnn::Layer* op = ncnn::create_layer_naive("Cast");

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    op->forward(a, b, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}
// Test the optimized cpu Cast layer (unpacked layout) against the naive
// reference. Returns 0 on success, -1 on mismatch.
// NOTE(review): body reconstructed from a garbled extraction; confirm against
// upstream test_cast.cpp.
static int test_cast_cpu(const ncnn::Mat& a, int type_from, int type_to)
{
    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = false;
    opt.use_int8_inference = false;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer_cpu("Cast");

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    // bring the fp32 input into the source type first
    ncnn::Mat a_fp16;
    cast_cpu_naive(a, a_fp16, 1, type_from);

    // naive reference result
    ncnn::Mat b;
    cast_cpu_naive(a_fp16, b, type_from, type_to);

    // optimized result
    ncnn::Mat c;
    op->forward(a_fp16, c, opt);

    op->destroy_pipeline(opt);

    delete op;

    if (CompareMat(b, c, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_cpu failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}
// Test the optimized cpu Cast layer on a pack4 input against the naive
// unpacked reference. Returns 0 on success, -1 on mismatch.
// NOTE(review): body reconstructed from a garbled extraction; confirm against
// upstream test_cast.cpp.
static int test_cast_cpu_packed(const ncnn::Mat& a, int type_from, int type_to)
{
    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = false;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer_cpu("Cast");

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    // naive reference result in unpacked layout
    ncnn::Mat a_fp16;
    cast_cpu_naive(a, a_fp16, 1, type_from);

    ncnn::Mat b;
    cast_cpu_naive(a_fp16, b, type_from, type_to);

    // packed input for the layer under test
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    cast_cpu_naive(a4, a4_fp16, 1, type_from);

    ncnn::Mat c;
    op->forward(a4_fp16, c, opt);

    op->destroy_pipeline(opt);

    delete op;

    if (CompareMat(b, c, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_cpu_packed failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}
#if NCNN_VULKAN
// Test the vulkan Cast layer with fp16 packed storage (pack4 buffer path)
// against the naive cpu reference. Returns 0 on success or skip, -1 on mismatch.
// NOTE(review): body reconstructed from a garbled extraction; the NCNN_VULKAN
// guard and some glue lines are inferred — confirm against upstream test_cast.cpp.
static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to)
{
    if (type_to == 4 || type_from == 4)
        return 0; // type 4 is skipped on gpu — presumably bfloat16 is unsupported there

    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_image_storage = false;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    // downgrade the requested fp16 modes if the device lacks support
    if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
    if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;

    ncnn::Layer* op = ncnn::create_layer_vulkan("Cast");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    // cpu-side input in the source type
    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    // naive reference result
    ncnn::Mat b;
    cast_cpu_naive(a_fp16, b, type_from, type_to);

    // packed gpu input
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2 && a4.elempack == 4)
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    ncnn::Mat d;

    // gpu forward
    {
        ncnn::VkCompute cmd(vkdev);

        ncnn::VkMat a4_gpu;
        cmd.record_clone(a4_fp16, a4_gpu, opt);

        ncnn::VkMat d4_gpu;
        if (op->support_inplace)
        {
            op->forward_inplace(a4_gpu, cmd, opt);

            d4_gpu = a4_gpu;
        }
        else
        {
            op->forward(a4_gpu, d4_gpu, cmd, opt);
        }

        // download result
        cmd.record_clone(d4_gpu, d, opt);

        cmd.submit_and_wait();
    }

    op->destroy_pipeline(opt);

    delete op;

    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_gpu_fp16p failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}
#endif // NCNN_VULKAN
#if NCNN_VULKAN
// Test the vulkan Cast layer with fp16 packed storage and pack8 shaders
// (buffer path) against the naive cpu reference. Returns 0 on success or skip,
// -1 on mismatch.
// NOTE(review): body reconstructed from a garbled extraction; the NCNN_VULKAN
// guard and some glue lines are inferred — confirm against upstream test_cast.cpp.
static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type_to)
{
    if (type_to == 4 || type_from == 4)
        return 0; // type 4 is skipped on gpu — presumably bfloat16 is unsupported there

    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_shader_pack8 = true;
    opt.use_image_storage = false;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    // downgrade the requested fp16 modes if the device lacks support
    if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
    if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;

    ncnn::Layer* op = ncnn::create_layer_vulkan("Cast");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    // cpu-side input in the source type
    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    // naive reference result
    ncnn::Mat b;
    cast_cpu_naive(a_fp16, b, type_from, type_to);

    // prefer pack8; fall back to pack4 when the channel count cannot pack to 8
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 8, opt);
    if (a4.elempack != 8)
        ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2 && (a4.elempack == 4 || a4.elempack == 8))
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    ncnn::Mat d;

    // gpu forward
    {
        ncnn::VkCompute cmd(vkdev);

        ncnn::VkMat a4_gpu;
        cmd.record_clone(a4_fp16, a4_gpu, opt);

        ncnn::VkMat d4_gpu;
        if (op->support_inplace)
        {
            op->forward_inplace(a4_gpu, cmd, opt);

            d4_gpu = a4_gpu;
        }
        else
        {
            op->forward(a4_gpu, d4_gpu, cmd, opt);
        }

        // download result
        cmd.record_clone(d4_gpu, d, opt);

        cmd.submit_and_wait();
    }

    op->destroy_pipeline(opt);

    delete op;

    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_gpu_fp16p_pack8 failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}
#endif // NCNN_VULKAN
#if NCNN_VULKAN
// Test the vulkan Cast layer with fp16 packed storage using image storage
// (VkImageMat path) against the naive cpu reference. Returns 0 on success or
// skip, -1 on mismatch.
// NOTE(review): body reconstructed from a garbled extraction; the NCNN_VULKAN
// guard and some glue lines are inferred — confirm against upstream test_cast.cpp.
static int test_cast_gpu_image_fp16p(const ncnn::Mat& a, int type_from, int type_to)
{
    if (type_to == 4 || type_from == 4)
        return 0; // type 4 is skipped on gpu — presumably bfloat16 is unsupported there

    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_image_storage = true;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    // downgrade the requested fp16 modes if the device lacks support
    if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
    if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;

    ncnn::Layer* op = ncnn::create_layer_vulkan("Cast");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    // cpu-side input in the source type
    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    // naive reference result
    ncnn::Mat b;
    cast_cpu_naive(a_fp16, b, type_from, type_to);

    // packed gpu input
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2 && a4.elempack == 4)
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    ncnn::Mat d;

    // gpu forward on image storage
    {
        ncnn::VkCompute cmd(vkdev);

        ncnn::VkImageMat a4_gpu;
        cmd.record_clone(a4_fp16, a4_gpu, opt);

        ncnn::VkImageMat d4_gpu;
        if (op->support_inplace)
        {
            op->forward_inplace(a4_gpu, cmd, opt);

            d4_gpu = a4_gpu;
        }
        else
        {
            op->forward(a4_gpu, d4_gpu, cmd, opt);
        }

        // download result
        cmd.record_clone(d4_gpu, d, opt);

        cmd.submit_and_wait();
    }

    op->destroy_pipeline(opt);

    delete op;

    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_gpu_image_fp16p failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}
#endif // NCNN_VULKAN
#if NCNN_VULKAN
// Test the vulkan Cast layer with fp16 packed storage, pack8 shaders and
// image storage (VkImageMat path) against the naive cpu reference. Returns 0
// on success or skip, -1 on mismatch.
// NOTE(review): body reconstructed from a garbled extraction; the NCNN_VULKAN
// guard and some glue lines are inferred — confirm against upstream test_cast.cpp.
static int test_cast_gpu_image_fp16p_pack8(const ncnn::Mat& a, int type_from, int type_to)
{
    if (type_to == 4 || type_from == 4)
        return 0; // type 4 is skipped on gpu — presumably bfloat16 is unsupported there

    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_shader_pack8 = true;
    opt.use_image_storage = true;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    // downgrade the requested fp16 modes if the device lacks support
    if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
    if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;

    ncnn::Layer* op = ncnn::create_layer_vulkan("Cast");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    // cpu-side input in the source type
    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    // naive reference result
    ncnn::Mat b;
    cast_cpu_naive(a_fp16, b, type_from, type_to);

    // prefer pack8; fall back to pack4 when the channel count cannot pack to 8
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 8, opt);
    if (a4.elempack != 8)
        ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2 && (a4.elempack == 4 || a4.elempack == 8))
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    ncnn::Mat d;

    // gpu forward on image storage
    {
        ncnn::VkCompute cmd(vkdev);

        ncnn::VkImageMat a4_gpu;
        cmd.record_clone(a4_fp16, a4_gpu, opt);

        ncnn::VkImageMat d4_gpu;
        if (op->support_inplace)
        {
            op->forward_inplace(a4_gpu, cmd, opt);

            d4_gpu = a4_gpu;
        }
        else
        {
            op->forward(a4_gpu, d4_gpu, cmd, opt);
        }

        // download result
        cmd.record_clone(d4_gpu, d, opt);

        cmd.submit_and_wait();
    }

    op->destroy_pipeline(opt);

    delete op;

    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_gpu_image_fp16p_pack8 failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}
#endif // NCNN_VULKAN
// Run all Cast layer variants on one input; non-zero on the first failure.
// NOTE(review): reconstructed from a garbled extraction; the NCNN_VULKAN guard
// around the gpu variants is inferred — confirm against upstream test_cast.cpp.
static int test_cast(const ncnn::Mat& a, int type_from, int type_to)
{
    return 0
           || test_cast_cpu(a, type_from, type_to)
           || test_cast_cpu_packed(a, type_from, type_to)
#if NCNN_VULKAN
           || test_cast_gpu_fp16p(a, type_from, type_to)
           || test_cast_gpu_fp16p_pack8(a, type_from, type_to)
           || test_cast_gpu_image_fp16p(a, type_from, type_to)
           || test_cast_gpu_image_fp16p_pack8(a, type_from, type_to)
#endif // NCNN_VULKAN
           ;
}
static int test_cast_0()
629
|| test_cast(RandomMat(5, 6, 7, 16), 1, 2)
630
|| test_cast(RandomMat(3, 4, 5, 13), 1, 2)
631
|| test_cast(RandomMat(5, 6, 7, 16), 2, 1)
632
|| test_cast(RandomMat(3, 4, 5, 13), 2, 1)
633
|| test_cast(RandomMat(5, 6, 7, 16), 1, 4)
634
|| test_cast(RandomMat(3, 4, 5, 13), 1, 4)
635
|| test_cast(RandomMat(5, 6, 7, 16), 4, 1)
636
|| test_cast(RandomMat(3, 4, 5, 13), 4, 1);
639
static int test_cast_1()
642
|| test_cast(RandomMat(5, 7, 16), 1, 2)
643
|| test_cast(RandomMat(3, 5, 13), 1, 2)
644
|| test_cast(RandomMat(5, 7, 16), 2, 1)
645
|| test_cast(RandomMat(3, 5, 13), 2, 1)
646
|| test_cast(RandomMat(5, 7, 16), 1, 4)
647
|| test_cast(RandomMat(3, 5, 13), 1, 4)
648
|| test_cast(RandomMat(5, 7, 16), 4, 1)
649
|| test_cast(RandomMat(3, 5, 13), 4, 1);
652
static int test_cast_2()
655
|| test_cast(RandomMat(6, 16), 1, 2)
656
|| test_cast(RandomMat(7, 15), 1, 2)
657
|| test_cast(RandomMat(6, 16), 2, 1)
658
|| test_cast(RandomMat(7, 15), 2, 1)
659
|| test_cast(RandomMat(6, 16), 1, 4)
660
|| test_cast(RandomMat(7, 15), 1, 4)
661
|| test_cast(RandomMat(6, 16), 4, 1)
662
|| test_cast(RandomMat(7, 15), 4, 1);
665
static int test_cast_3()
668
|| test_cast(RandomMat(128), 1, 2)
669
|| test_cast(RandomMat(127), 1, 2)
670
|| test_cast(RandomMat(128), 2, 1)
671
|| test_cast(RandomMat(127), 2, 1)
672
|| test_cast(RandomMat(128), 1, 4)
673
|| test_cast(RandomMat(127), 1, 4)
674
|| test_cast(RandomMat(128), 4, 1)
675
|| test_cast(RandomMat(127), 4, 1);