// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "deconvolutiondepthwise_vulkan.h"
#include "layer_shader_type.h"
#include "layer_type.h"
// Default-construct the Vulkan depth-wise deconvolution layer.
// Declares Vulkan support and zero-initializes every owned pipeline and
// helper layer pointer so destroy_pipeline() is safe even if
// create_pipeline() was never called or failed part-way.
DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan()
{
    support_vulkan = true;
    support_image_storage = true;

    crop = 0;
    output_crop = 0;

    // depth-wise pipelines (channels == group == num_output)
    pipeline_deconvolutiondepthwise = 0;
    pipeline_deconvolutiondepthwise_pack4 = 0;
    pipeline_deconvolutiondepthwise_pack8 = 0;

    // generic group deconvolution pipelines, one per in/out packing combo
    pipeline_deconvolutiondepthwise_group = 0;
    pipeline_deconvolutiondepthwise_group_pack4 = 0;
    pipeline_deconvolutiondepthwise_group_pack1to4 = 0;
    pipeline_deconvolutiondepthwise_group_pack4to1 = 0;
    pipeline_deconvolutiondepthwise_group_pack8 = 0;
    pipeline_deconvolutiondepthwise_group_pack1to8 = 0;
    pipeline_deconvolutiondepthwise_group_pack4to8 = 0;
    pipeline_deconvolutiondepthwise_group_pack8to4 = 0;
    pipeline_deconvolutiondepthwise_group_pack8to1 = 0;
}
45
int DeconvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
47
int ret = DeconvolutionDepthWise::load_param(pd);
51
support_vulkan = false;
52
support_image_storage = false;
58
int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
61
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
62
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
64
// the shape before unpadding
65
Mat out_shape_bordered;
68
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
69
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
71
int outw = (shape.w - 1) * stride_w + kernel_extent_w + output_pad_right;
72
int outh = (shape.h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
74
out_shape_bordered = Mat(outw, outh, out_shape.c, (void*)0);
77
const int maxk = kernel_w * kernel_h;
78
int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
80
int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
81
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
85
if (opt.use_fp16_storage)
87
elemsize = elempack * 2u;
88
out_elemsize = out_elempack * 2u;
90
else if (opt.use_fp16_packed)
92
elemsize = elempack == 1 ? 4u : elempack * 2u;
93
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
97
elemsize = elempack * 4u;
98
out_elemsize = out_elempack * 4u;
102
if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
103
if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
104
if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
106
Mat out_shape_bordered_packed;
107
if (out_shape_bordered.dims == 1) out_shape_bordered_packed = Mat(out_shape_bordered.w / out_elempack, (void*)0, out_elemsize, out_elempack);
108
if (out_shape_bordered.dims == 2) out_shape_bordered_packed = Mat(out_shape_bordered.w, out_shape_bordered.h / out_elempack, (void*)0, out_elemsize, out_elempack);
109
if (out_shape_bordered.dims == 3) out_shape_bordered_packed = Mat(out_shape_bordered.w, out_shape_bordered.h, out_shape_bordered.c / out_elempack, (void*)0, out_elemsize, out_elempack);
111
// group deconvolution
112
const int channels_g = channels / group;
113
const int num_output_g = num_output / group;
115
int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
116
int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
119
size_t out_elemsize_g;
120
if (opt.use_fp16_storage)
122
elemsize_g = elempack_g * 2u;
123
out_elemsize_g = out_elempack_g * 2u;
125
else if (opt.use_fp16_packed)
127
elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u;
128
out_elemsize_g = out_elempack_g == 1 ? 4u : out_elempack_g * 2u;
132
elemsize_g = elempack_g * 4u;
133
out_elemsize_g = out_elempack_g * 4u;
137
if (shape.dims == 3) shape_g_packed = Mat(shape.w, shape.h, shape.c / elempack_g, (void*)0, elemsize_g, elempack_g);
139
Mat out_shape_bordered_g_packed;
140
if (out_shape_bordered.dims == 3) out_shape_bordered_g_packed = Mat(out_shape_bordered.w, out_shape_bordered.h, out_shape_bordered.c / out_elempack_g, (void*)0, out_elemsize_g, out_elempack_g);
143
if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_bordered_packed))
145
support_image_storage = false;
146
opt.use_image_storage = false;
149
// check weight shape
150
if (channels == group && group == num_output)
152
Mat weight_data_packed(maxk, group / elempack, (void*)0, (size_t)4 * elempack, elempack);
153
if (!vkdev->shape_support_image_storage(weight_data_packed))
155
support_image_storage = false;
156
opt.use_image_storage = false;
162
if (!vkdev->shape_support_image_storage(shape_g_packed) || !vkdev->shape_support_image_storage(out_shape_bordered_g_packed))
164
support_image_storage = false;
165
opt.use_image_storage = false;
168
Mat weight_data_packed_groups(maxk, channels_g / elempack_g, num_output_g / out_elempack_g * group, (size_t)4 * elempack_g * out_elempack_g, elempack_g * out_elempack_g);
169
if (!vkdev->shape_support_image_storage(weight_data_packed_groups))
171
support_image_storage = false;
172
opt.use_image_storage = false;
177
crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
180
crop->bottom_shapes.resize(1);
181
crop->bottom_shapes[0] = out_shape_bordered;
182
crop->top_shapes.resize(1);
183
crop->top_shapes[0] = out_shape;
190
crop->load_param(pd);
192
crop->create_pipeline(opt);
196
output_crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
197
output_crop->vkdev = vkdev;
199
output_crop->bottom_shapes.resize(1);
200
output_crop->bottom_shapes[0] = out_shape_bordered;
201
output_crop->top_shapes.resize(1);
202
output_crop->top_shapes[0] = out_shape;
209
output_crop->load_param(pd);
211
output_crop->create_pipeline(opt);
214
Mat weight_data_transposed(weight_data.w);
216
float* pt = weight_data_transposed;
217
const float* p = weight_data;
219
for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
221
for (int k = 0; k < maxk; k++)
223
pt[maxk - 1 - k] = p[k];
231
std::vector<vk_specialization_type> specializations(11 + 10);
232
specializations[0].i = kernel_w;
233
specializations[1].i = kernel_h;
234
specializations[2].i = dilation_w;
235
specializations[3].i = dilation_h;
236
specializations[4].i = stride_w;
237
specializations[5].i = stride_h;
238
specializations[6].i = bias_term;
239
specializations[7].i = group;
240
specializations[8].i = activation_type;
241
specializations[9].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
242
specializations[10].f = activation_params.w == 2 ? activation_params[1] : 0.f;
245
if (channels == group && group == num_output)
247
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
248
convert_packing(weight_data_r2, weight_data_packed, elempack, opt);
252
convert_packing(bias_data, bias_data_packed, out_elempack, opt);
255
specializations[11 + 0].i = shape_packed.dims;
256
specializations[11 + 1].i = shape_packed.w;
257
specializations[11 + 2].i = shape_packed.h;
258
specializations[11 + 3].i = shape_packed.c;
259
specializations[11 + 4].i = shape_packed.cstep;
260
specializations[11 + 5].i = out_shape_bordered_packed.dims;
261
specializations[11 + 6].i = out_shape_bordered_packed.w;
262
specializations[11 + 7].i = out_shape_bordered_packed.h;
263
specializations[11 + 8].i = out_shape_bordered_packed.c;
264
specializations[11 + 9].i = out_shape_bordered_packed.cstep;
266
Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack), (void*)0);
267
if (out_shape_bordered_packed.dims != 0)
269
local_size_xyz.w = std::min(8, out_shape_bordered_packed.w);
270
local_size_xyz.h = std::min(8, out_shape_bordered_packed.h);
271
local_size_xyz.c = std::min(4, out_shape_bordered_packed.c);
277
pipeline_deconvolutiondepthwise = new Pipeline(vkdev);
278
pipeline_deconvolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz);
279
pipeline_deconvolutiondepthwise->create(LayerShaderType::deconvolutiondepthwise, opt, specializations);
285
pipeline_deconvolutiondepthwise_pack4 = new Pipeline(vkdev);
286
pipeline_deconvolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz);
287
pipeline_deconvolutiondepthwise_pack4->create(LayerShaderType::deconvolutiondepthwise_pack4, opt, specializations);
293
pipeline_deconvolutiondepthwise_pack8 = new Pipeline(vkdev);
294
pipeline_deconvolutiondepthwise_pack8->set_optimal_local_size_xyz(local_size_xyz);
295
pipeline_deconvolutiondepthwise_pack8->create(LayerShaderType::deconvolutiondepthwise_pack8, opt, specializations);
300
weight_data.release();
307
// src = kw-kh-inch-outch
308
// dst = pa-pb-kw-kh-inch/pa-outch/pb
310
Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);
312
weight_data_packed.create(maxk, channels_g / elempack_g, num_output_g / out_elempack_g * group, (size_t)4 * elempack_g * out_elempack_g, elempack_g * out_elempack_g);
314
for (int g = 0; g < group; g++)
316
const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
318
Mat weight_data_pack4 = weight_data_packed.channel_range(num_output_g / out_elempack_g * g, num_output_g / out_elempack_g);
320
for (int q = 0; q + (out_elempack_g - 1) < num_output_g; q += out_elempack_g)
322
float* g00 = weight_data_pack4.channel(q / out_elempack_g);
324
for (int p = 0; p + (elempack_g - 1) < channels_g; p += elempack_g)
326
for (int k = 0; k < maxk; k++)
328
for (int i = 0; i < out_elempack_g; i++)
330
const Mat k0 = weight_data_r2.channel(q + i);
332
for (int j = 0; j < elempack_g; j++)
334
const float* k00 = k0.row(p + j);
349
convert_packing(bias_data, bias_data_packed, out_elempack_g, opt);
352
specializations[11 + 0].i = shape_g_packed.dims;
353
specializations[11 + 1].i = shape_g_packed.w;
354
specializations[11 + 2].i = shape_g_packed.h;
355
specializations[11 + 3].i = shape_g_packed.c;
356
specializations[11 + 4].i = shape_g_packed.cstep;
357
specializations[11 + 5].i = out_shape_bordered_g_packed.dims;
358
specializations[11 + 6].i = out_shape_bordered_g_packed.w;
359
specializations[11 + 7].i = out_shape_bordered_g_packed.h;
360
specializations[11 + 8].i = out_shape_bordered_g_packed.c;
361
specializations[11 + 9].i = out_shape_bordered_g_packed.cstep;
363
Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack_g), (void*)0);
364
if (out_shape_bordered_g_packed.dims != 0)
366
local_size_xyz.w = std::min(8, out_shape_bordered_g_packed.w);
367
local_size_xyz.h = std::min(8, out_shape_bordered_g_packed.h);
368
local_size_xyz.c = std::min(4, out_shape_bordered_g_packed.c);
372
if (elempack_g == 1 && out_elempack_g == 1)
374
pipeline_deconvolutiondepthwise_group = new Pipeline(vkdev);
375
pipeline_deconvolutiondepthwise_group->set_optimal_local_size_xyz(local_size_xyz);
376
pipeline_deconvolutiondepthwise_group->create(LayerShaderType::deconvolutiondepthwise_group, opt, specializations);
380
if (elempack_g == 4 && out_elempack_g == 4)
382
pipeline_deconvolutiondepthwise_group_pack4 = new Pipeline(vkdev);
383
pipeline_deconvolutiondepthwise_group_pack4->set_optimal_local_size_xyz(local_size_xyz);
384
pipeline_deconvolutiondepthwise_group_pack4->create(LayerShaderType::deconvolutiondepthwise_group_pack4, opt, specializations);
388
if (elempack_g == 1 && out_elempack_g == 4)
390
pipeline_deconvolutiondepthwise_group_pack1to4 = new Pipeline(vkdev);
391
pipeline_deconvolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(local_size_xyz);
392
pipeline_deconvolutiondepthwise_group_pack1to4->create(LayerShaderType::deconvolutiondepthwise_group_pack1to4, opt, specializations);
396
if (elempack_g == 4 && out_elempack_g == 1)
398
pipeline_deconvolutiondepthwise_group_pack4to1 = new Pipeline(vkdev);
399
pipeline_deconvolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(local_size_xyz);
400
pipeline_deconvolutiondepthwise_group_pack4to1->create(LayerShaderType::deconvolutiondepthwise_group_pack4to1, opt, specializations);
404
if (elempack_g == 8 && out_elempack_g == 8)
406
pipeline_deconvolutiondepthwise_group_pack8 = new Pipeline(vkdev);
407
pipeline_deconvolutiondepthwise_group_pack8->set_optimal_local_size_xyz(local_size_xyz);
408
pipeline_deconvolutiondepthwise_group_pack8->create(LayerShaderType::deconvolutiondepthwise_group_pack8, opt, specializations);
412
if (elempack_g == 1 && out_elempack_g == 8)
414
pipeline_deconvolutiondepthwise_group_pack1to8 = new Pipeline(vkdev);
415
pipeline_deconvolutiondepthwise_group_pack1to8->set_optimal_local_size_xyz(local_size_xyz);
416
pipeline_deconvolutiondepthwise_group_pack1to8->create(LayerShaderType::deconvolutiondepthwise_group_pack1to8, opt, specializations);
420
if (elempack_g == 4 && out_elempack_g == 8)
422
pipeline_deconvolutiondepthwise_group_pack4to8 = new Pipeline(vkdev);
423
pipeline_deconvolutiondepthwise_group_pack4to8->set_optimal_local_size_xyz(local_size_xyz);
424
pipeline_deconvolutiondepthwise_group_pack4to8->create(LayerShaderType::deconvolutiondepthwise_group_pack4to8, opt, specializations);
428
if (elempack_g == 8 && out_elempack_g == 4)
430
pipeline_deconvolutiondepthwise_group_pack8to4 = new Pipeline(vkdev);
431
pipeline_deconvolutiondepthwise_group_pack8to4->set_optimal_local_size_xyz(local_size_xyz);
432
pipeline_deconvolutiondepthwise_group_pack8to4->create(LayerShaderType::deconvolutiondepthwise_group_pack8to4, opt, specializations);
436
if (elempack_g == 8 && out_elempack_g == 1)
438
pipeline_deconvolutiondepthwise_group_pack8to1 = new Pipeline(vkdev);
439
pipeline_deconvolutiondepthwise_group_pack8to1->set_optimal_local_size_xyz(local_size_xyz);
440
pipeline_deconvolutiondepthwise_group_pack8to1->create(LayerShaderType::deconvolutiondepthwise_group_pack8to1, opt, specializations);
445
weight_data.release();
452
int DeconvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)
456
crop->destroy_pipeline(opt);
463
output_crop->destroy_pipeline(opt);
468
delete pipeline_deconvolutiondepthwise;
469
pipeline_deconvolutiondepthwise = 0;
471
delete pipeline_deconvolutiondepthwise_pack4;
472
pipeline_deconvolutiondepthwise_pack4 = 0;
474
delete pipeline_deconvolutiondepthwise_pack8;
475
pipeline_deconvolutiondepthwise_pack8 = 0;
477
delete pipeline_deconvolutiondepthwise_group;
478
pipeline_deconvolutiondepthwise_group = 0;
480
delete pipeline_deconvolutiondepthwise_group_pack4;
481
pipeline_deconvolutiondepthwise_group_pack4 = 0;
483
delete pipeline_deconvolutiondepthwise_group_pack1to4;
484
pipeline_deconvolutiondepthwise_group_pack1to4 = 0;
486
delete pipeline_deconvolutiondepthwise_group_pack4to1;
487
pipeline_deconvolutiondepthwise_group_pack4to1 = 0;
489
delete pipeline_deconvolutiondepthwise_group_pack8;
490
pipeline_deconvolutiondepthwise_group_pack8 = 0;
492
delete pipeline_deconvolutiondepthwise_group_pack1to8;
493
pipeline_deconvolutiondepthwise_group_pack1to8 = 0;
495
delete pipeline_deconvolutiondepthwise_group_pack4to8;
496
pipeline_deconvolutiondepthwise_group_pack4to8 = 0;
498
delete pipeline_deconvolutiondepthwise_group_pack8to4;
499
pipeline_deconvolutiondepthwise_group_pack8to4 = 0;
501
delete pipeline_deconvolutiondepthwise_group_pack8to1;
502
pipeline_deconvolutiondepthwise_group_pack8to1 = 0;
507
int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
511
crop->upload_model(cmd, opt);
516
output_crop->upload_model(cmd, opt);
519
if (support_image_storage && opt.use_image_storage)
521
cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);
525
cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
528
weight_data_packed.release();
532
if (support_image_storage && opt.use_image_storage)
534
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
538
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
541
bias_data_packed.release();
547
int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
549
int w = bottom_blob.w;
550
int h = bottom_blob.h;
551
int channels = bottom_blob.c;
552
size_t elemsize = bottom_blob.elemsize;
553
int elempack = bottom_blob.elempack;
555
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
556
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
558
int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
559
int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
560
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
561
size_t out_elemsize = elemsize / elempack * out_elempack;
563
if (opt.use_fp16_packed && !opt.use_fp16_storage)
565
if (out_elempack == 8) out_elemsize = 8 * 2u;
566
if (out_elempack == 4) out_elemsize = 4 * 2u;
567
if (out_elempack == 1) out_elemsize = 4u;
570
VkMat top_blob_bordered;
571
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
573
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
577
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
579
if (top_blob_bordered.empty())
583
if (channels == group / elempack && group / elempack == num_output / elempack)
585
std::vector<VkMat> bindings(4);
586
bindings[0] = bottom_blob;
587
bindings[1] = top_blob_bordered;
588
bindings[2] = weight_data_gpu;
589
bindings[3] = bias_data_gpu;
591
std::vector<vk_constant_type> constants(10);
592
constants[0].i = bottom_blob.dims;
593
constants[1].i = bottom_blob.w;
594
constants[2].i = bottom_blob.h;
595
constants[3].i = bottom_blob.c;
596
constants[4].i = bottom_blob.cstep;
597
constants[5].i = top_blob_bordered.dims;
598
constants[6].i = top_blob_bordered.w;
599
constants[7].i = top_blob_bordered.h;
600
constants[8].i = top_blob_bordered.c;
601
constants[9].i = top_blob_bordered.cstep;
603
const Pipeline* pipeline = elempack == 8 ? pipeline_deconvolutiondepthwise_pack8
604
: elempack == 4 ? pipeline_deconvolutiondepthwise_pack4
605
: pipeline_deconvolutiondepthwise;
608
cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered);
610
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
613
VkMat reference_blob;
614
reference_blob.dims = 2;
615
reference_blob.w = top_blob_bordered.w - pad_left - pad_right;
616
reference_blob.h = top_blob_bordered.h - pad_top - pad_bottom;
617
reference_blob.elempack = 1;
619
std::vector<VkMat> crop_bottom_blobs(2);
620
crop_bottom_blobs[0] = top_blob_bordered;
621
crop_bottom_blobs[1] = reference_blob;
622
std::vector<VkMat> crop_top_blobs(1);
623
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
624
top_blob = crop_top_blobs[0];
626
if (top_blob.empty())
632
else if (output_w > 0 && output_h > 0)
634
int wcut = top_blob_bordered.w - output_w;
635
int hcut = top_blob_bordered.h - output_h;
637
VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
638
int* crop_params = crop_param_blob.mapped();
640
if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
642
// onnx padding=SAME_UPPER
643
crop_params[0] = wcut / 2;
644
crop_params[1] = hcut / 2;
646
crop_params[3] = top_blob_bordered.w - wcut;
647
crop_params[4] = top_blob_bordered.h - hcut;
648
crop_params[5] = top_blob_bordered.c * out_elempack;
650
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
652
// onnx padding=SAME_LOWER
653
crop_params[0] = wcut - wcut / 2;
654
crop_params[1] = hcut - hcut / 2;
656
crop_params[3] = top_blob_bordered.w - wcut;
657
crop_params[4] = top_blob_bordered.h - hcut;
658
crop_params[5] = top_blob_bordered.c * out_elempack;
661
std::vector<VkMat> crop_inputs(2);
662
crop_inputs[0] = top_blob_bordered;
663
crop_inputs[1] = crop_param_blob;
665
std::vector<VkMat> crop_outputs(1);
666
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
667
top_blob = crop_outputs[0];
668
if (top_blob.empty())
676
top_blob = top_blob_bordered;
682
const int channels_g = channels * elempack / group;
683
const int num_output_g = num_output / group;
685
int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
686
int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
687
size_t out_elemsize_g = elemsize / elempack * out_elempack_g;
689
if (opt.use_fp16_packed && !opt.use_fp16_storage)
691
if (out_elempack_g == 8) out_elemsize_g = 8 * 2u;
692
if (out_elempack_g == 4) out_elemsize_g = 4 * 2u;
693
if (out_elempack_g == 1) out_elemsize_g = 4u;
697
VkMat bottom_blob_unpacked = bottom_blob;
698
if (elempack > elempack_g)
700
Option opt_pack1 = opt;
701
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
703
vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, elempack_g, cmd, opt_pack1);
706
VkMat top_blob_unpacked = top_blob_bordered;
707
if (out_elempack_g < out_elempack)
709
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
710
if (top_blob_unpacked.empty())
714
std::vector<VkMat> bindings(4);
715
bindings[0] = bottom_blob_unpacked;
716
bindings[1] = top_blob_unpacked;
717
bindings[2] = weight_data_gpu;
718
bindings[3] = bias_data_gpu;
720
std::vector<vk_constant_type> constants(10);
721
constants[0].i = bottom_blob_unpacked.dims;
722
constants[1].i = bottom_blob_unpacked.w;
723
constants[2].i = bottom_blob_unpacked.h;
724
constants[3].i = bottom_blob_unpacked.c;
725
constants[4].i = bottom_blob_unpacked.cstep;
726
constants[5].i = top_blob_unpacked.dims;
727
constants[6].i = top_blob_unpacked.w;
728
constants[7].i = top_blob_unpacked.h;
729
constants[8].i = top_blob_unpacked.c;
730
constants[9].i = top_blob_unpacked.cstep;
732
const Pipeline* pipeline = 0;
733
if (elempack_g == 1 && out_elempack_g == 1)
735
pipeline = pipeline_deconvolutiondepthwise_group;
737
else if (elempack_g == 4 && out_elempack_g == 4)
739
pipeline = pipeline_deconvolutiondepthwise_group_pack4;
741
else if (elempack_g == 1 && out_elempack_g == 4)
743
pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
745
else if (elempack_g == 4 && out_elempack_g == 1)
747
pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
749
else if (elempack_g == 8 && out_elempack_g == 8)
751
pipeline = pipeline_deconvolutiondepthwise_group_pack8;
753
else if (elempack_g == 1 && out_elempack_g == 8)
755
pipeline = pipeline_deconvolutiondepthwise_group_pack1to8;
757
else if (elempack_g == 4 && out_elempack_g == 8)
759
pipeline = pipeline_deconvolutiondepthwise_group_pack4to8;
761
else if (elempack_g == 8 && out_elempack_g == 4)
763
pipeline = pipeline_deconvolutiondepthwise_group_pack8to4;
765
else if (elempack_g == 8 && out_elempack_g == 1)
767
pipeline = pipeline_deconvolutiondepthwise_group_pack8to1;
770
cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
773
if (out_elempack_g < out_elempack)
775
vkdev->convert_packing(top_blob_unpacked, top_blob_bordered, out_elempack, cmd, opt);
779
top_blob_bordered = top_blob_unpacked;
782
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
785
VkMat reference_blob;
786
reference_blob.dims = 2;
787
reference_blob.w = top_blob_bordered.w - pad_left - pad_right;
788
reference_blob.h = top_blob_bordered.h - pad_top - pad_bottom;
789
reference_blob.elempack = 1;
791
std::vector<VkMat> crop_bottom_blobs(2);
792
crop_bottom_blobs[0] = top_blob_bordered;
793
crop_bottom_blobs[1] = reference_blob;
794
std::vector<VkMat> crop_top_blobs(1);
795
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
796
top_blob = crop_top_blobs[0];
798
if (top_blob.empty())
804
else if (output_w > 0 && output_h > 0)
806
int wcut = top_blob_bordered.w - output_w;
807
int hcut = top_blob_bordered.h - output_h;
809
VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
810
int* crop_params = crop_param_blob.mapped();
812
if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
814
// onnx padding=SAME_UPPER
815
crop_params[0] = wcut / 2;
816
crop_params[1] = hcut / 2;
818
crop_params[3] = top_blob_bordered.w - wcut;
819
crop_params[4] = top_blob_bordered.h - hcut;
820
crop_params[5] = top_blob_bordered.c * out_elempack;
822
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
824
// onnx padding=SAME_LOWER
825
crop_params[0] = wcut - wcut / 2;
826
crop_params[1] = hcut - hcut / 2;
828
crop_params[3] = top_blob_bordered.w - wcut;
829
crop_params[4] = top_blob_bordered.h - hcut;
830
crop_params[5] = top_blob_bordered.c * out_elempack;
833
std::vector<VkMat> crop_inputs(2);
834
crop_inputs[0] = top_blob_bordered;
835
crop_inputs[1] = crop_param_blob;
837
std::vector<VkMat> crop_outputs(1);
838
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
839
top_blob = crop_outputs[0];
840
if (top_blob.empty())
848
top_blob = top_blob_bordered;
854
int DeconvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
856
int w = bottom_blob.w;
857
int h = bottom_blob.h;
858
int channels = bottom_blob.c;
859
size_t elemsize = bottom_blob.elemsize;
860
int elempack = bottom_blob.elempack;
862
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
863
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
865
int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
866
int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
867
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
868
size_t out_elemsize = elemsize / elempack * out_elempack;
870
if (opt.use_fp16_packed && !opt.use_fp16_storage)
872
if (out_elempack == 8) out_elemsize = 8 * 2u;
873
if (out_elempack == 4) out_elemsize = 4 * 2u;
874
if (out_elempack == 1) out_elemsize = 4u;
877
VkImageMat top_blob_bordered;
878
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
880
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
884
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
886
if (top_blob_bordered.empty())
890
if (channels == group / elempack && group / elempack == num_output / elempack)
892
std::vector<VkImageMat> bindings(4);
893
bindings[0] = bottom_blob;
894
bindings[1] = top_blob_bordered;
895
bindings[2] = weight_data_gpu_image;
896
bindings[3] = bias_data_gpu_image;
898
std::vector<vk_constant_type> constants(10);
899
constants[0].i = bottom_blob.dims;
900
constants[1].i = bottom_blob.w;
901
constants[2].i = bottom_blob.h;
902
constants[3].i = bottom_blob.c;
903
constants[4].i = 0; //bottom_blob.cstep;
904
constants[5].i = top_blob_bordered.dims;
905
constants[6].i = top_blob_bordered.w;
906
constants[7].i = top_blob_bordered.h;
907
constants[8].i = top_blob_bordered.c;
908
constants[9].i = 0; //top_blob_bordered.cstep;
910
const Pipeline* pipeline = elempack == 8 ? pipeline_deconvolutiondepthwise_pack8
911
: elempack == 4 ? pipeline_deconvolutiondepthwise_pack4
912
: pipeline_deconvolutiondepthwise;
915
cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered);
917
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
920
VkImageMat reference_blob;
921
reference_blob.dims = 2;
922
reference_blob.w = top_blob_bordered.w - pad_left - pad_right;
923
reference_blob.h = top_blob_bordered.h - pad_top - pad_bottom;
924
reference_blob.elempack = 1;
926
std::vector<VkImageMat> crop_bottom_blobs(2);
927
crop_bottom_blobs[0] = top_blob_bordered;
928
crop_bottom_blobs[1] = reference_blob;
929
std::vector<VkImageMat> crop_top_blobs(1);
930
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
931
top_blob = crop_top_blobs[0];
933
if (top_blob.empty())
939
else if (output_w > 0 && output_h > 0)
941
int wcut = top_blob_bordered.w - output_w;
942
int hcut = top_blob_bordered.h - output_h;
944
VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
945
int* crop_params = crop_param_blob.mapped();
947
if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
949
// onnx padding=SAME_UPPER
950
crop_params[0] = wcut / 2;
951
crop_params[1] = hcut / 2;
953
crop_params[3] = top_blob_bordered.w - wcut;
954
crop_params[4] = top_blob_bordered.h - hcut;
955
crop_params[5] = top_blob_bordered.c * out_elempack;
957
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
959
// onnx padding=SAME_LOWER
960
crop_params[0] = wcut - wcut / 2;
961
crop_params[1] = hcut - hcut / 2;
963
crop_params[3] = top_blob_bordered.w - wcut;
964
crop_params[4] = top_blob_bordered.h - hcut;
965
crop_params[5] = top_blob_bordered.c * out_elempack;
968
std::vector<VkImageMat> crop_inputs(2);
969
crop_inputs[0] = top_blob_bordered;
970
crop_inputs[1] = crop_param_blob;
972
std::vector<VkImageMat> crop_outputs(1);
973
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
974
top_blob = crop_outputs[0];
975
if (top_blob.empty())
983
top_blob = top_blob_bordered;
989
const int channels_g = channels * elempack / group;
990
const int num_output_g = num_output / group;
992
int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
993
int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
994
size_t out_elemsize_g = elemsize / elempack * out_elempack_g;
996
if (opt.use_fp16_packed && !opt.use_fp16_storage)
998
if (out_elempack_g == 8) out_elemsize_g = 8 * 2u;
999
if (out_elempack_g == 4) out_elemsize_g = 4 * 2u;
1000
if (out_elempack_g == 1) out_elemsize_g = 4u;
1004
VkImageMat bottom_blob_unpacked = bottom_blob;
1005
if (elempack > elempack_g)
1007
Option opt_pack1 = opt;
1008
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
1010
vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, elempack_g, cmd, opt_pack1);
1013
VkImageMat top_blob_unpacked = top_blob_bordered;
1014
if (out_elempack_g < out_elempack)
1016
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
1017
if (top_blob_unpacked.empty())
1021
std::vector<VkImageMat> bindings(4);
1022
bindings[0] = bottom_blob_unpacked;
1023
bindings[1] = top_blob_unpacked;
1024
bindings[2] = weight_data_gpu_image;
1025
bindings[3] = bias_data_gpu_image;
1027
std::vector<vk_constant_type> constants(10);
1028
constants[0].i = bottom_blob_unpacked.dims;
1029
constants[1].i = bottom_blob_unpacked.w;
1030
constants[2].i = bottom_blob_unpacked.h;
1031
constants[3].i = bottom_blob_unpacked.c;
1032
constants[4].i = 0; //bottom_blob_unpacked.cstep;
1033
constants[5].i = top_blob_unpacked.dims;
1034
constants[6].i = top_blob_unpacked.w;
1035
constants[7].i = top_blob_unpacked.h;
1036
constants[8].i = top_blob_unpacked.c;
1037
constants[9].i = 0; //top_blob_unpacked.cstep;
1039
const Pipeline* pipeline = 0;
1040
if (elempack_g == 1 && out_elempack_g == 1)
1042
pipeline = pipeline_deconvolutiondepthwise_group;
1044
else if (elempack_g == 4 && out_elempack_g == 4)
1046
pipeline = pipeline_deconvolutiondepthwise_group_pack4;
1048
else if (elempack_g == 1 && out_elempack_g == 4)
1050
pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
1052
else if (elempack_g == 4 && out_elempack_g == 1)
1054
pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
1056
else if (elempack_g == 8 && out_elempack_g == 8)
1058
pipeline = pipeline_deconvolutiondepthwise_group_pack8;
1060
else if (elempack_g == 1 && out_elempack_g == 8)
1062
pipeline = pipeline_deconvolutiondepthwise_group_pack1to8;
1064
else if (elempack_g == 4 && out_elempack_g == 8)
1066
pipeline = pipeline_deconvolutiondepthwise_group_pack4to8;
1068
else if (elempack_g == 8 && out_elempack_g == 4)
1070
pipeline = pipeline_deconvolutiondepthwise_group_pack8to4;
1072
else if (elempack_g == 8 && out_elempack_g == 1)
1074
pipeline = pipeline_deconvolutiondepthwise_group_pack8to1;
1077
cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
1080
if (out_elempack_g < out_elempack)
1082
vkdev->convert_packing(top_blob_unpacked, top_blob_bordered, out_elempack, cmd, opt);
1086
top_blob_bordered = top_blob_unpacked;
1089
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
1092
VkImageMat reference_blob;
1093
reference_blob.dims = 2;
1094
reference_blob.w = top_blob_bordered.w - pad_left - pad_right;
1095
reference_blob.h = top_blob_bordered.h - pad_top - pad_bottom;
1096
reference_blob.elempack = 1;
1098
std::vector<VkImageMat> crop_bottom_blobs(2);
1099
crop_bottom_blobs[0] = top_blob_bordered;
1100
crop_bottom_blobs[1] = reference_blob;
1101
std::vector<VkImageMat> crop_top_blobs(1);
1102
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
1103
top_blob = crop_top_blobs[0];
1105
if (top_blob.empty())
1111
else if (output_w > 0 && output_h > 0)
1113
int wcut = top_blob_bordered.w - output_w;
1114
int hcut = top_blob_bordered.h - output_h;
1116
VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
1117
int* crop_params = crop_param_blob.mapped();
1119
if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
1121
// onnx padding=SAME_UPPER
1122
crop_params[0] = wcut / 2;
1123
crop_params[1] = hcut / 2;
1125
crop_params[3] = top_blob_bordered.w - wcut;
1126
crop_params[4] = top_blob_bordered.h - hcut;
1127
crop_params[5] = top_blob_bordered.c * out_elempack;
1129
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
1131
// onnx padding=SAME_LOWER
1132
crop_params[0] = wcut - wcut / 2;
1133
crop_params[1] = hcut - hcut / 2;
1135
crop_params[3] = top_blob_bordered.w - wcut;
1136
crop_params[4] = top_blob_bordered.h - hcut;
1137
crop_params[5] = top_blob_bordered.c * out_elempack;
1140
std::vector<VkImageMat> crop_inputs(2);
1141
crop_inputs[0] = top_blob_bordered;
1142
crop_inputs[1] = crop_param_blob;
1144
std::vector<VkImageMat> crop_outputs(1);
1145
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
1146
top_blob = crop_outputs[0];
1147
if (top_blob.empty())
1155
top_blob = top_blob_bordered;