// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "deconvolutiondepthwise_vulkan.h"
#include "layer_shader_type.h"
#include "layer_type.h"
// Default-construct the Vulkan depth-wise deconvolution layer.
// Declares Vulkan support and zero-initializes every owned pipeline and
// helper layer pointer so destroy_pipeline() is safe even if
// create_pipeline() was never called or failed part-way.
DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan()
{
    support_vulkan = true;
    support_image_storage = true;

    crop = 0;
    output_crop = 0;

    // depth-wise pipelines (channels == group == num_output)
    pipeline_deconvolutiondepthwise = 0;
    pipeline_deconvolutiondepthwise_pack4 = 0;
    pipeline_deconvolutiondepthwise_pack8 = 0;

    // generic group deconvolution pipelines, one per in/out packing combo
    pipeline_deconvolutiondepthwise_group = 0;
    pipeline_deconvolutiondepthwise_group_pack4 = 0;
    pipeline_deconvolutiondepthwise_group_pack1to4 = 0;
    pipeline_deconvolutiondepthwise_group_pack4to1 = 0;
    pipeline_deconvolutiondepthwise_group_pack8 = 0;
    pipeline_deconvolutiondepthwise_group_pack1to8 = 0;
    pipeline_deconvolutiondepthwise_group_pack4to8 = 0;
    pipeline_deconvolutiondepthwise_group_pack8to4 = 0;
    pipeline_deconvolutiondepthwise_group_pack8to1 = 0;
}
45
int DeconvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
47
int ret = DeconvolutionDepthWise::load_param(pd);
51
support_vulkan = false;
52
support_image_storage = false;
58
int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
61
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
62
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
64
// the shape before unpadding
65
Mat out_shape_bordered;
68
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
69
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
71
int outw = (shape.w - 1) * stride_w + kernel_extent_w + output_pad_right;
72
int outh = (shape.h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
74
out_shape_bordered = Mat(outw, outh, out_shape.c, (void*)0);
77
const int maxk = kernel_w * kernel_h;
78
int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
80
int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
81
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
85
if (opt.use_fp16_storage)
87
elemsize = elempack * 2u;
88
out_elemsize = out_elempack * 2u;
90
else if (opt.use_fp16_packed)
92
elemsize = elempack == 1 ? 4u : elempack * 2u;
93
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
97
elemsize = elempack * 4u;
98
out_elemsize = out_elempack * 4u;
102
if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
103
if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
104
if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
106
Mat out_shape_bordered_packed;
107
if (out_shape_bordered.dims == 1) out_shape_bordered_packed = Mat(out_shape_bordered.w / out_elempack, (void*)0, out_elemsize, out_elempack);
108
if (out_shape_bordered.dims == 2) out_shape_bordered_packed = Mat(out_shape_bordered.w, out_shape_bordered.h / out_elempack, (void*)0, out_elemsize, out_elempack);
109
if (out_shape_bordered.dims == 3) out_shape_bordered_packed = Mat(out_shape_bordered.w, out_shape_bordered.h, out_shape_bordered.c / out_elempack, (void*)0, out_elemsize, out_elempack);
111
// group deconvolution
112
const int channels_g = channels / group;
113
const int num_output_g = num_output / group;
115
int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
116
int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
119
size_t out_elemsize_g;
120
if (opt.use_fp16_storage)
122
elemsize_g = elempack_g * 2u;
123
out_elemsize_g = out_elempack_g * 2u;
125
else if (opt.use_fp16_packed)
127
elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u;
128
out_elemsize_g = out_elempack_g == 1 ? 4u : out_elempack_g * 2u;
132
elemsize_g = elempack_g * 4u;
133
out_elemsize_g = out_elempack_g * 4u;
137
if (shape.dims == 3) shape_g_packed = Mat(shape.w, shape.h, shape.c / elempack_g, (void*)0, elemsize_g, elempack_g);
139
Mat out_shape_bordered_g_packed;
140
if (out_shape_bordered.dims == 3) out_shape_bordered_g_packed = Mat(out_shape_bordered.w, out_shape_bordered.h, out_shape_bordered.c / out_elempack_g, (void*)0, out_elemsize_g, out_elempack_g);
143
if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_bordered_packed))
145
support_image_storage = false;
146
opt.use_image_storage = false;
149
// check weight shape
150
if (channels == group && group == num_output)
152
Mat weight_data_packed(maxk, group / elempack, (void*)0, (size_t)4 * elempack, elempack);
153
if (!vkdev->shape_support_image_storage(weight_data_packed))
155
support_image_storage = false;
156
opt.use_image_storage = false;
162
if (!vkdev->shape_support_image_storage(shape_g_packed) || !vkdev->shape_support_image_storage(out_shape_bordered_g_packed))
164
support_image_storage = false;
165
opt.use_image_storage = false;
168
Mat weight_data_packed_groups(maxk, channels_g / elempack_g, num_output_g / out_elempack_g * group, (size_t)4 * elempack_g * out_elempack_g, elempack_g * out_elempack_g);
169
if (!vkdev->shape_support_image_storage(weight_data_packed_groups))
171
support_image_storage = false;
172
opt.use_image_storage = false;
177
crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
180
crop->bottom_shapes.resize(1);
181
crop->bottom_shapes[0] = out_shape_bordered;
182
crop->top_shapes.resize(1);
183
crop->top_shapes[0] = out_shape;
190
crop->load_param(pd);
192
crop->create_pipeline(opt);
196
output_crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
197
output_crop->vkdev = vkdev;
199
output_crop->bottom_shapes.resize(1);
200
output_crop->bottom_shapes[0] = out_shape_bordered;
201
output_crop->top_shapes.resize(1);
202
output_crop->top_shapes[0] = out_shape;
209
output_crop->load_param(pd);
211
output_crop->create_pipeline(opt);
214
Mat weight_data_transposed(weight_data.w);
216
float* pt = weight_data_transposed;
217
const float* p = weight_data;
219
for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
221
for (int k = 0; k < maxk; k++)
223
pt[maxk - 1 - k] = p[k];
231
std::vector<vk_specialization_type> specializations(11 + 10);
232
specializations[0].i = kernel_w;
233
specializations[1].i = kernel_h;
234
specializations[2].i = dilation_w;
235
specializations[3].i = dilation_h;
236
specializations[4].i = stride_w;
237
specializations[5].i = stride_h;
238
specializations[6].i = bias_term;
239
specializations[7].i = group;
240
specializations[8].i = activation_type;
241
specializations[9].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
242
specializations[10].f = activation_params.w == 2 ? activation_params[1] : 0.f;
245
if (channels == group && group == num_output)
247
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
248
convert_packing(weight_data_r2, weight_data_packed, elempack, opt);
252
convert_packing(bias_data, bias_data_packed, out_elempack, opt);
255
specializations[11 + 0].i = shape_packed.dims;
256
specializations[11 + 1].i = shape_packed.w;
257
specializations[11 + 2].i = shape_packed.h;
258
specializations[11 + 3].i = shape_packed.c;
259
specializations[11 + 4].i = shape_packed.cstep;
260
specializations[11 + 5].i = out_shape_bordered_packed.dims;
261
specializations[11 + 6].i = out_shape_bordered_packed.w;
262
specializations[11 + 7].i = out_shape_bordered_packed.h;
263
specializations[11 + 8].i = out_shape_bordered_packed.c;
264
specializations[11 + 9].i = out_shape_bordered_packed.cstep;
266
Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack), (void*)0);
267
if (out_shape_bordered_packed.dims != 0)
269
local_size_xyz.w = std::min(8, out_shape_bordered_packed.w);
270
local_size_xyz.h = std::min(8, out_shape_bordered_packed.h);
271
local_size_xyz.c = std::min(4, out_shape_bordered_packed.c);
277
pipeline_deconvolutiondepthwise = new Pipeline(vkdev);
278
pipeline_deconvolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz);
279
pipeline_deconvolutiondepthwise->create(LayerShaderType::deconvolutiondepthwise, opt, specializations);
285
pipeline_deconvolutiondepthwise_pack4 = new Pipeline(vkdev);
286
pipeline_deconvolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz);
287
pipeline_deconvolutiondepthwise_pack4->create(LayerShaderType::deconvolutiondepthwise_pack4, opt, specializations);
293
pipeline_deconvolutiondepthwise_pack8 = new Pipeline(vkdev);
294
pipeline_deconvolutiondepthwise_pack8->set_optimal_local_size_xyz(local_size_xyz);
295
pipeline_deconvolutiondepthwise_pack8->create(LayerShaderType::deconvolutiondepthwise_pack8, opt, specializations);
300
weight_data.release();
307
// src = kw-kh-inch-outch
308
// dst = pa-pb-kw-kh-inch/pa-outch/pb
310
Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);
312
weight_data_packed.create(maxk, channels_g / elempack_g, num_output_g / out_elempack_g * group, (size_t)4 * elempack_g * out_elempack_g, elempack_g * out_elempack_g);
314
for (int g = 0; g < group; g++)
316
const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
318
Mat weight_data_pack4 = weight_data_packed.channel_range(num_output_g / out_elempack_g * g, num_output_g / out_elempack_g);
320
for (int q = 0; q + (out_elempack_g - 1) < num_output_g; q += out_elempack_g)
322
float* g00 = weight_data_pack4.channel(q / out_elempack_g);
324
for (int p = 0; p + (elempack_g - 1) < channels_g; p += elempack_g)
326
for (int k = 0; k < maxk; k++)
328
for (int i = 0; i < out_elempack_g; i++)
330
const Mat k0 = weight_data_r2.channel(q + i);
332
for (int j = 0; j < elempack_g; j++)
334
const float* k00 = k0.row(p + j);
349
convert_packing(bias_data, bias_data_packed, out_elempack_g, opt);
352
specializations[11 + 0].i = shape_g_packed.dims;
353
specializations[11 + 1].i = shape_g_packed.w;
354
specializations[11 + 2].i = shape_g_packed.h;
355
specializations[11 + 3].i = shape_g_packed.c;
356
specializations[11 + 4].i = shape_g_packed.cstep;
357
specializations[11 + 5].i = out_shape_bordered_g_packed.dims;
358
specializations[11 + 6].i = out_shape_bordered_g_packed.w;
359
specializations[11 + 7].i = out_shape_bordered_g_packed.h;
360
specializations[11 + 8].i = out_shape_bordered_g_packed.c;
361
specializations[11 + 9].i = out_shape_bordered_g_packed.cstep;
363
Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack_g), (void*)0);
364
if (out_shape_bordered_g_packed.dims != 0)
366
local_size_xyz.w = std::min(8, out_shape_bordered_g_packed.w);
367
local_size_xyz.h = std::min(8, out_shape_bordered_g_packed.h);
368
local_size_xyz.c = std::min(4, out_shape_bordered_g_packed.c);
372
if (elempack_g == 1 && out_elempack_g == 1)
374
pipeline_deconvolutiondepthwise_group = new Pipeline(vkdev);
375
pipeline_deconvolutiondepthwise_group->set_optimal_local_size_xyz(local_size_xyz);
376
pipeline_deconvolutiondepthwise_group->create(LayerShaderType::deconvolutiondepthwise_group, opt, specializations);
380
if (elempack_g == 4 && out_elempack_g == 4)
382
pipeline_deconvolutiondepthwise_group_pack4 = new Pipeline(vkdev);
383
pipeline_deconvolutiondepthwise_group_pack4->set_optimal_local_size_xyz(local_size_xyz);
384
pipeline_deconvolutiondepthwise_group_pack4->create(LayerShaderType::deconvolutiondepthwise_group_pack4, opt, specializations);
388
if (elempack_g == 1 && out_elempack_g == 4)
390
pipeline_deconvolutiondepthwise_group_pack1to4 = new Pipeline(vkdev);
391
pipeline_deconvolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(local_size_xyz);
392
pipeline_deconvolutiondepthwise_group_pack1to4->create(LayerShaderType::deconvolutiondepthwise_group_pack1to4, opt, specializations);
396
if (elempack_g == 4 && out_elempack_g == 1)
398
pipeline_deconvolutiondepthwise_group_pack4to1 = new Pipeline(vkdev);
399
pipeline_deconvolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(local_size_xyz);
400
pipeline_deconvolutiondepthwise_group_pack4to1->create(LayerShaderType::deconvolutiondepthwise_group_pack4to1, opt, specializations);
404
if (elempack_g == 8 && out_elempack_g == 8)
406
pipeline_deconvolutiondepthwise_group_pack8 = new Pipeline(vkdev);
407
pipeline_deconvolutiondepthwise_group_pack8->set_optimal_local_size_xyz(local_size_xyz);
408
pipeline_deconvolutiondepthwise_group_pack8->create(LayerShaderType::deconvolutiondepthwise_group_pack8, opt, specializations);
412
if (elempack_g == 1 && out_elempack_g == 8)
414
pipeline_deconvolutiondepthwise_group_pack1to8 = new Pipeline(vkdev);
415
pipeline_deconvolutiondepthwise_group_pack1to8->set_optimal_local_size_xyz(local_size_xyz);
416
pipeline_deconvolutiondepthwise_group_pack1to8->create(LayerShaderType::deconvolutiondepthwise_group_pack1to8, opt, specializations);
420
if (elempack_g == 4 && out_elempack_g == 8)
422
pipeline_deconvolutiondepthwise_group_pack4to8 = new Pipeline(vkdev);
423
pipeline_deconvolutiondepthwise_group_pack4to8->set_optimal_local_size_xyz(local_size_xyz);
424
pipeline_deconvolutiondepthwise_group_pack4to8->create(LayerShaderType::deconvolutiondepthwise_group_pack4to8, opt, specializations);
428
if (elempack_g == 8 && out_elempack_g == 4)
430
pipeline_deconvolutiondepthwise_group_pack8to4 = new Pipeline(vkdev);
431
pipeline_deconvolutiondepthwise_group_pack8to4->set_optimal_local_size_xyz(local_size_xyz);
432
pipeline_deconvolutiondepthwise_group_pack8to4->create(LayerShaderType::deconvolutiondepthwise_group_pack8to4, opt, specializations);
436
if (elempack_g == 8 && out_elempack_g == 1)
438
pipeline_deconvolutiondepthwise_group_pack8to1 = new Pipeline(vkdev);
439
pipeline_deconvolutiondepthwise_group_pack8to1->set_optimal_local_size_xyz(local_size_xyz);
440
pipeline_deconvolutiondepthwise_group_pack8to1->create(LayerShaderType::deconvolutiondepthwise_group_pack8to1, opt, specializations);
445
weight_data.release();
452
int DeconvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)
456
crop->destroy_pipeline(opt);
463
output_crop->destroy_pipeline(opt);
468
delete pipeline_deconvolutiondepthwise;
469
pipeline_deconvolutiondepthwise = 0;
471
delete pipeline_deconvolutiondepthwise_pack4;
472
pipeline_deconvolutiondepthwise_pack4 = 0;
474
delete pipeline_deconvolutiondepthwise_pack8;
475
pipeline_deconvolutiondepthwise_pack8 = 0;
477
delete pipeline_deconvolutiondepthwise_group;
478
pipeline_deconvolutiondepthwise_group = 0;
480
delete pipeline_deconvolutiondepthwise_group_pack4;
481
pipeline_deconvolutiondepthwise_group_pack4 = 0;
483
delete pipeline_deconvolutiondepthwise_group_pack1to4;
484
pipeline_deconvolutiondepthwise_group_pack1to4 = 0;
486
delete pipeline_deconvolutiondepthwise_group_pack4to1;
487
pipeline_deconvolutiondepthwise_group_pack4to1 = 0;
489
delete pipeline_deconvolutiondepthwise_group_pack8;
490
pipeline_deconvolutiondepthwise_group_pack8 = 0;
492
delete pipeline_deconvolutiondepthwise_group_pack1to8;
493
pipeline_deconvolutiondepthwise_group_pack1to8 = 0;
495
delete pipeline_deconvolutiondepthwise_group_pack4to8;
496
pipeline_deconvolutiondepthwise_group_pack4to8 = 0;
498
delete pipeline_deconvolutiondepthwise_group_pack8to4;
499
pipeline_deconvolutiondepthwise_group_pack8to4 = 0;
501
delete pipeline_deconvolutiondepthwise_group_pack8to1;
502
pipeline_deconvolutiondepthwise_group_pack8to1 = 0;
507
int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
511
crop->upload_model(cmd, opt);
516
output_crop->upload_model(cmd, opt);
519
if (support_image_storage && opt.use_image_storage)
521
cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);
525
cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
528
weight_data_packed.release();
532
if (support_image_storage && opt.use_image_storage)
534
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
538
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
541
bias_data_packed.release();
547
int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
549
int w = bottom_blob.w;
550
int h = bottom_blob.h;
551
int channels = bottom_blob.c;
552
size_t elemsize = bottom_blob.elemsize;
553
int elempack = bottom_blob.elempack;
555
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
556
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
558
int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
559
int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
560
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
561
size_t out_elemsize = elemsize / elempack * out_elempack;
563
if (opt.use_fp16_packed && !opt.use_fp16_storage)
565
if (out_elempack == 8) out_elemsize = 8 * 2u;
566
if (out_elempack == 4) out_elemsize = 4 * 2u;
567
if (out_elempack == 1) out_elemsize = 4u;
570
VkMat top_blob_bordered;
571
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
573
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
577
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
579
if (top_blob_bordered.empty())
583
if (channels == group / elempack && group / elempack == num_output / elempack)
585
std::vector<VkMat> bindings(4);
586
bindings[0] = bottom_blob;
587
bindings[1] = top_blob_bordered;
588
bindings[2] = weight_data_gpu;
589
bindings[3] = bias_data_gpu;
591
std::vector<vk_constant_type> constants(10);
592
constants[0].i = bottom_blob.dims;
593
constants[1].i = bottom_blob.w;
594
constants[2].i = bottom_blob.h;
595
constants[3].i = bottom_blob.c;
596
constants[4].i = bottom_blob.cstep;
597
constants[5].i = top_blob_bordered.dims;
598
constants[6].i = top_blob_bordered.w;
599
constants[7].i = top_blob_bordered.h;
600
constants[8].i = top_blob_bordered.c;
601
constants[9].i = top_blob_bordered.cstep;
603
const Pipeline* pipeline = elempack == 8 ? pipeline_deconvolutiondepthwise_pack8
604
: elempack == 4 ? pipeline_deconvolutiondepthwise_pack4
605
: pipeline_deconvolutiondepthwise;
608
cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered);
610
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
613
VkMat reference_blob;
614
reference_blob.dims = 2;
615
reference_blob.w = top_blob_bordered.w - pad_left - pad_right;
616
reference_blob.h = top_blob_bordered.h - pad_top - pad_bottom;
617
reference_blob.elempack = 1;
619
std::vector<VkMat> crop_bottom_blobs(2);
620
crop_bottom_blobs[0] = top_blob_bordered;
621
crop_bottom_blobs[1] = reference_blob;
622
std::vector<VkMat> crop_top_blobs(1);
623
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
624
top_blob = crop_top_blobs[0];
626
if (top_blob.empty())
632
else if (output_w > 0 && output_h > 0)
634
int wcut = top_blob_bordered.w - output_w;
635
int hcut = top_blob_bordered.h - output_h;
637
VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
638
int* crop_params = crop_param_blob.mapped();
640
if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
642
// onnx padding=SAME_UPPER
643
crop_params[0] = wcut / 2;
644
crop_params[1] = hcut / 2;
646
crop_params[3] = top_blob_bordered.w - wcut;
647
crop_params[4] = top_blob_bordered.h - hcut;
648
crop_params[5] = top_blob_bordered.c * out_elempack;
650
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
652
// onnx padding=SAME_LOWER
653
crop_params[0] = wcut - wcut / 2;
654
crop_params[1] = hcut - hcut / 2;
656
crop_params[3] = top_blob_bordered.w - wcut;
657
crop_params[4] = top_blob_bordered.h - hcut;
658
crop_params[5] = top_blob_bordered.c * out_elempack;
661
std::vector<VkMat> crop_inputs(2);
662
crop_inputs[0] = top_blob_bordered;
663
crop_inputs[1] = crop_param_blob;
665
std::vector<VkMat> crop_outputs(1);
666
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
667
top_blob = crop_outputs[0];
668
if (top_blob.empty())
676
top_blob = top_blob_bordered;
682
const int channels_g = channels * elempack / group;
683
const int num_output_g = num_output / group;
685
int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
686
int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
687
size_t out_elemsize_g = elemsize / elempack * out_elempack_g;
689
if (opt.use_fp16_packed && !opt.use_fp16_storage)
691
if (out_elempack_g == 8) out_elemsize_g = 8 * 2u;
692
if (out_elempack_g == 4) out_elemsize_g = 4 * 2u;
693
if (out_elempack_g == 1) out_elemsize_g = 4u;
697
VkMat bottom_blob_unpacked = bottom_blob;
698
if (elempack > elempack_g)
700
Option opt_pack1 = opt;
701
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
703
vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, elempack_g, cmd, opt_pack1);
706
VkMat top_blob_unpacked = top_blob_bordered;
707
if (out_elempack_g < out_elempack)
709
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
710
if (top_blob_unpacked.empty())
714
std::vector<VkMat> bindings(4);
715
bindings[0] = bottom_blob_unpacked;
716
bindings[1] = top_blob_unpacked;
717
bindings[2] = weight_data_gpu;
718
bindings[3] = bias_data_gpu;
720
std::vector<vk_constant_type> constants(10);
721
constants[0].i = bottom_blob_unpacked.dims;
722
constants[1].i = bottom_blob_unpacked.w;
723
constants[2].i = bottom_blob_unpacked.h;
724
constants[3].i = bottom_blob_unpacked.c;
725
constants[4].i = bottom_blob_unpacked.cstep;
726
constants[5].i = top_blob_unpacked.dims;
727
constants[6].i = top_blob_unpacked.w;
728
constants[7].i = top_blob_unpacked.h;
729
constants[8].i = top_blob_unpacked.c;
730
constants[9].i = top_blob_unpacked.cstep;
732
const Pipeline* pipeline = 0;
733
if (elempack_g == 1 && out_elempack_g == 1)
735
pipeline = pipeline_deconvolutiondepthwise_group;
737
else if (elempack_g == 4 && out_elempack_g == 4)
739
pipeline = pipeline_deconvolutiondepthwise_group_pack4;
741
else if (elempack_g == 1 && out_elempack_g == 4)
743
pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
745
else if (elempack_g == 4 && out_elempack_g == 1)
747
pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
749
else if (elempack_g == 8 && out_elempack_g == 8)
751
pipeline = pipeline_deconvolutiondepthwise_group_pack8;
753
else if (elempack_g == 1 && out_elempack_g == 8)
755
pipeline = pipeline_deconvolutiondepthwise_group_pack1to8;
757
else if (elempack_g == 4 && out_elempack_g == 8)
759
pipeline = pipeline_deconvolutiondepthwise_group_pack4to8;
761
else if (elempack_g == 8 && out_elempack_g == 4)
763
pipeline = pipeline_deconvolutiondepthwise_group_pack8to4;
765
else if (elempack_g == 8 && out_elempack_g == 1)
767
pipeline = pipeline_deconvolutiondepthwise_group_pack8to1;
770
cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
773
if (out_elempack_g < out_elempack)
775
vkdev->convert_packing(top_blob_unpacked, top_blob_bordered, out_elempack, cmd, opt);
779
top_blob_bordered = top_blob_unpacked;
782
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
785
VkMat reference_blob;
786
reference_blob.dims = 2;
787
reference_blob.w = top_blob_bordered.w - pad_left - pad_right;
788
reference_blob.h = top_blob_bordered.h - pad_top - pad_bottom;
789
reference_blob.elempack = 1;
791
std::vector<VkMat> crop_bottom_blobs(2);
792
crop_bottom_blobs[0] = top_blob_bordered;
793
crop_bottom_blobs[1] = reference_blob;
794
std::vector<VkMat> crop_top_blobs(1);
795
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
796
top_blob = crop_top_blobs[0];
798
if (top_blob.empty())
804
else if (output_w > 0 && output_h > 0)
806
int wcut = top_blob_bordered.w - output_w;
807
int hcut = top_blob_bordered.h - output_h;
809
VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
810
int* crop_params = crop_param_blob.mapped();
812
if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
814
// onnx padding=SAME_UPPER
815
crop_params[0] = wcut / 2;
816
crop_params[1] = hcut / 2;
818
crop_params[3] = top_blob_bordered.w - wcut;
819
crop_params[4] = top_blob_bordered.h - hcut;
820
crop_params[5] = top_blob_bordered.c * out_elempack;
822
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
824
// onnx padding=SAME_LOWER
825
crop_params[0] = wcut - wcut / 2;
826
crop_params[1] = hcut - hcut / 2;
828
crop_params[3] = top_blob_bordered.w - wcut;
829
crop_params[4] = top_blob_bordered.h - hcut;
830
crop_params[5] = top_blob_bordered.c * out_elempack;
833
std::vector<VkMat> crop_inputs(2);
834
crop_inputs[0] = top_blob_bordered;
835
crop_inputs[1] = crop_param_blob;
837
std::vector<VkMat> crop_outputs(1);
838
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
839
top_blob = crop_outputs[0];
840
if (top_blob.empty())
848
top_blob = top_blob_bordered;
854
int DeconvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
856
int w = bottom_blob.w;
857
int h = bottom_blob.h;
858
int channels = bottom_blob.c;
859
size_t elemsize = bottom_blob.elemsize;
860
int elempack = bottom_blob.elempack;
862
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
863
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
865
int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
866
int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
867
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
868
size_t out_elemsize = elemsize / elempack * out_elempack;
870
if (opt.use_fp16_packed && !opt.use_fp16_storage)
872
if (out_elempack == 8) out_elemsize = 8 * 2u;
873
if (out_elempack == 4) out_elemsize = 4 * 2u;
874
if (out_elempack == 1) out_elemsize = 4u;
877
VkImageMat top_blob_bordered;
878
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
880
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
884
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
886
if (top_blob_bordered.empty())
890
if (channels == group / elempack && group / elempack == num_output / elempack)
892
std::vector<VkImageMat> bindings(4);
893
bindings[0] = bottom_blob;
894
bindings[1] = top_blob_bordered;
895
bindings[2] = weight_data_gpu_image;
896
bindings[3] = bias_data_gpu_image;
898
std::vector<vk_constant_type> constants(10);
899
constants[0].i = bottom_blob.dims;
900
constants[1].i = bottom_blob.w;
901
constants[2].i = bottom_blob.h;
902
constants[3].i = bottom_blob.c;
903
constants[4].i = 0; //bottom_blob.cstep;
904
constants[5].i = top_blob_bordered.dims;
905
constants[6].i = top_blob_bordered.w;
906
constants[7].i = top_blob_bordered.h;
907
constants[8].i = top_blob_bordered.c;
908
constants[9].i = 0; //top_blob_bordered.cstep;
910
const Pipeline* pipeline = elempack == 8 ? pipeline_deconvolutiondepthwise_pack8
911
: elempack == 4 ? pipeline_deconvolutiondepthwise_pack4
912
: pipeline_deconvolutiondepthwise;
915
cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered);
917
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
920
VkImageMat reference_blob;
921
reference_blob.dims = 2;
922
reference_blob.w = top_blob_bordered.w - pad_left - pad_right;
923
reference_blob.h = top_blob_bordered.h - pad_top - pad_bottom;
924
reference_blob.elempack = 1;
926
std::vector<VkImageMat> crop_bottom_blobs(2);
927
crop_bottom_blobs[0] = top_blob_bordered;
928
crop_bottom_blobs[1] = reference_blob;
929
std::vector<VkImageMat> crop_top_blobs(1);
930
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
931
top_blob = crop_top_blobs[0];
933
if (top_blob.empty())
939
else if (output_w > 0 && output_h > 0)
941
int wcut = top_blob_bordered.w - output_w;
942
int hcut = top_blob_bordered.h - output_h;
944
VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
945
int* crop_params = crop_param_blob.mapped();
947
if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
949
// onnx padding=SAME_UPPER
950
crop_params[0] = wcut / 2;
951
crop_params[1] = hcut / 2;
953
crop_params[3] = top_blob_bordered.w - wcut;
954
crop_params[4] = top_blob_bordered.h - hcut;
955
crop_params[5] = top_blob_bordered.c * out_elempack;
957
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
959
// onnx padding=SAME_LOWER
960
crop_params[0] = wcut - wcut / 2;
961
crop_params[1] = hcut - hcut / 2;
963
crop_params[3] = top_blob_bordered.w - wcut;
964
crop_params[4] = top_blob_bordered.h - hcut;
965
crop_params[5] = top_blob_bordered.c * out_elempack;
968
std::vector<VkImageMat> crop_inputs(2);
969
crop_inputs[0] = top_blob_bordered;
970
crop_inputs[1] = crop_param_blob;
972
std::vector<VkImageMat> crop_outputs(1);
973
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
974
top_blob = crop_outputs[0];
975
if (top_blob.empty())
983
top_blob = top_blob_bordered;
989
const int channels_g = channels * elempack / group;
990
const int num_output_g = num_output / group;
992
int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
993
int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
994
size_t out_elemsize_g = elemsize / elempack * out_elempack_g;
996
if (opt.use_fp16_packed && !opt.use_fp16_storage)
998
if (out_elempack_g == 8) out_elemsize_g = 8 * 2u;
999
if (out_elempack_g == 4) out_elemsize_g = 4 * 2u;
1000
if (out_elempack_g == 1) out_elemsize_g = 4u;
1004
VkImageMat bottom_blob_unpacked = bottom_blob;
1005
if (elempack > elempack_g)
1007
Option opt_pack1 = opt;
1008
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
1010
vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, elempack_g, cmd, opt_pack1);
1013
VkImageMat top_blob_unpacked = top_blob_bordered;
1014
if (out_elempack_g < out_elempack)
1016
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
1017
if (top_blob_unpacked.empty())
1021
std::vector<VkImageMat> bindings(4);
1022
bindings[0] = bottom_blob_unpacked;
1023
bindings[1] = top_blob_unpacked;
1024
bindings[2] = weight_data_gpu_image;
1025
bindings[3] = bias_data_gpu_image;
1027
std::vector<vk_constant_type> constants(10);
1028
constants[0].i = bottom_blob_unpacked.dims;
1029
constants[1].i = bottom_blob_unpacked.w;
1030
constants[2].i = bottom_blob_unpacked.h;
1031
constants[3].i = bottom_blob_unpacked.c;
1032
constants[4].i = 0; //bottom_blob_unpacked.cstep;
1033
constants[5].i = top_blob_unpacked.dims;
1034
constants[6].i = top_blob_unpacked.w;
1035
constants[7].i = top_blob_unpacked.h;
1036
constants[8].i = top_blob_unpacked.c;
1037
constants[9].i = 0; //top_blob_unpacked.cstep;
1039
const Pipeline* pipeline = 0;
1040
if (elempack_g == 1 && out_elempack_g == 1)
1042
pipeline = pipeline_deconvolutiondepthwise_group;
1044
else if (elempack_g == 4 && out_elempack_g == 4)
1046
pipeline = pipeline_deconvolutiondepthwise_group_pack4;
1048
else if (elempack_g == 1 && out_elempack_g == 4)
1050
pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
1052
else if (elempack_g == 4 && out_elempack_g == 1)
1054
pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
1056
else if (elempack_g == 8 && out_elempack_g == 8)
1058
pipeline = pipeline_deconvolutiondepthwise_group_pack8;
1060
else if (elempack_g == 1 && out_elempack_g == 8)
1062
pipeline = pipeline_deconvolutiondepthwise_group_pack1to8;
1064
else if (elempack_g == 4 && out_elempack_g == 8)
1066
pipeline = pipeline_deconvolutiondepthwise_group_pack4to8;
1068
else if (elempack_g == 8 && out_elempack_g == 4)
1070
pipeline = pipeline_deconvolutiondepthwise_group_pack8to4;
1072
else if (elempack_g == 8 && out_elempack_g == 1)
1074
pipeline = pipeline_deconvolutiondepthwise_group_pack8to1;
1077
cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
1080
if (out_elempack_g < out_elempack)
1082
vkdev->convert_packing(top_blob_unpacked, top_blob_bordered, out_elempack, cmd, opt);
1086
top_blob_bordered = top_blob_unpacked;
1089
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
1092
VkImageMat reference_blob;
1093
reference_blob.dims = 2;
1094
reference_blob.w = top_blob_bordered.w - pad_left - pad_right;
1095
reference_blob.h = top_blob_bordered.h - pad_top - pad_bottom;
1096
reference_blob.elempack = 1;
1098
std::vector<VkImageMat> crop_bottom_blobs(2);
1099
crop_bottom_blobs[0] = top_blob_bordered;
1100
crop_bottom_blobs[1] = reference_blob;
1101
std::vector<VkImageMat> crop_top_blobs(1);
1102
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
1103
top_blob = crop_top_blobs[0];
1105
if (top_blob.empty())
1111
else if (output_w > 0 && output_h > 0)
1113
int wcut = top_blob_bordered.w - output_w;
1114
int hcut = top_blob_bordered.h - output_h;
1116
VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
1117
int* crop_params = crop_param_blob.mapped();
1119
if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
1121
// onnx padding=SAME_UPPER
1122
crop_params[0] = wcut / 2;
1123
crop_params[1] = hcut / 2;
1125
crop_params[3] = top_blob_bordered.w - wcut;
1126
crop_params[4] = top_blob_bordered.h - hcut;
1127
crop_params[5] = top_blob_bordered.c * out_elempack;
1129
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
1131
// onnx padding=SAME_LOWER
1132
crop_params[0] = wcut - wcut / 2;
1133
crop_params[1] = hcut - hcut / 2;
1135
crop_params[3] = top_blob_bordered.w - wcut;
1136
crop_params[4] = top_blob_bordered.h - hcut;
1137
crop_params[5] = top_blob_bordered.c * out_elempack;
1140
std::vector<VkImageMat> crop_inputs(2);
1141
crop_inputs[0] = top_blob_bordered;
1142
crop_inputs[1] = crop_param_blob;
1144
std::vector<VkImageMat> crop_outputs(1);
1145
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
1146
top_blob = crop_outputs[0];
1147
if (top_blob.empty())
1155
top_blob = top_blob_bordered;