ncnn

convolutiondepthwise_vulkan.cpp
1001 строка · 35.4 Кб
Перенос по словам
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#include "convolutiondepthwise_vulkan.h"
16

17
#include "layer_shader_type.h"
18
#include "layer_type.h"
19

20
namespace ncnn {
21

22
// Default state: vulkan and image storage are advertised as supported, and
// the padding helper plus every pipeline pointer starts out null.
// create_pipeline() later fills in the pointers it needs.
ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
{
    // capability flags
    support_image_storage = true;
    support_vulkan = true;

    // helper layer used for border padding
    padding = 0;

    // depth-wise pipelines (pack8 / pack4 / pack1)
    pipeline_convolutiondepthwise_pack8 = 0;
    pipeline_convolutiondepthwise_pack4 = 0;
    pipeline_convolutiondepthwise = 0;

    // grouped pipelines with identical input and output packing
    pipeline_convolutiondepthwise_group = 0;
    pipeline_convolutiondepthwise_group_pack4 = 0;
    pipeline_convolutiondepthwise_group_pack8 = 0;

    // grouped pipelines with differing input and output packing
    pipeline_convolutiondepthwise_group_pack1to4 = 0;
    pipeline_convolutiondepthwise_group_pack1to8 = 0;
    pipeline_convolutiondepthwise_group_pack4to1 = 0;
    pipeline_convolutiondepthwise_group_pack4to8 = 0;
    pipeline_convolutiondepthwise_group_pack8to1 = 0;
    pipeline_convolutiondepthwise_group_pack8to4 = 0;
}
43

44
int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
45
{
46
    int ret = ConvolutionDepthWise::load_param(pd);
47

48
    if (dynamic_weight)
49
    {
50
        support_vulkan = false;
51
        support_image_storage = false;
52
    }
53

54
    return ret;
55
}
56

57
int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
58
{
59
    Option opt = _opt;
60
    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
61
    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
62

63
    // the shape after padding
64
    Mat shape_bordered;
65
    if (shape.dims != 0)
66
    {
67
        if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
68
        {
69
            shape_bordered = Mat(shape.w + pad_left + pad_right, shape.h + pad_top + pad_bottom, shape.c, (void*)0);
70
        }
71
        else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
72
                 || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234))
73
        {
74
            const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
75
            const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
76

77
            int wpad = kernel_extent_w + (shape.w - 1) / stride_w * stride_w - shape.w;
78
            int hpad = kernel_extent_h + (shape.h - 1) / stride_h * stride_h - shape.h;
79
            if (wpad > 0 || hpad > 0)
80
            {
81
                shape_bordered = Mat(shape.w + wpad, shape.h + hpad, shape.c, (void*)0);
82
            }
83
        }
84
        else
85
        {
86
            shape_bordered = shape;
87
        }
88
    }
89

90
    const int maxk = kernel_w * kernel_h;
91
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
92

93
    int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
94
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
95

96
    size_t elemsize;
97
    size_t out_elemsize;
98
    if (opt.use_fp16_storage)
99
    {
100
        elemsize = elempack * 2u;
101
        out_elemsize = out_elempack * 2u;
102
    }
103
    else if (opt.use_fp16_packed)
104
    {
105
        elemsize = elempack == 1 ? 4u : elempack * 2u;
106
        out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
107
    }
108
    else
109
    {
110
        elemsize = elempack * 4u;
111
        out_elemsize = out_elempack * 4u;
112
    }
113

114
    Mat shape_bordered_packed;
115
    if (shape_bordered.dims == 3) shape_bordered_packed = Mat(shape_bordered.w, shape_bordered.h, shape_bordered.c / elempack, (void*)0, elemsize, elempack);
116

117
    Mat out_shape_packed;
118
    if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
119

120
    // group convolution
121
    const int channels_g = channels / group;
122
    const int num_output_g = num_output / group;
123

124
    int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
125
    int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
126

127
    size_t elemsize_g;
128
    size_t out_elemsize_g;
129
    if (opt.use_fp16_storage)
130
    {
131
        elemsize_g = elempack_g * 2u;
132
        out_elemsize_g = out_elempack_g * 2u;
133
    }
134
    else if (opt.use_fp16_packed)
135
    {
136
        elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u;
137
        out_elemsize_g = out_elempack_g == 1 ? 4u : out_elempack_g * 2u;
138
    }
139
    else
140
    {
141
        elemsize_g = elempack_g * 4u;
142
        out_elemsize_g = out_elempack_g * 4u;
143
    }
144

145
    Mat shape_bordered_g_packed;
146
    if (shape_bordered.dims == 3) shape_bordered_g_packed = Mat(shape_bordered.w, shape_bordered.h, shape_bordered.c / elempack_g, (void*)0, elemsize_g, elempack_g);
147

148
    Mat out_shape_g_packed;
149
    if (out_shape.dims == 3) out_shape_g_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack_g, (void*)0, out_elemsize_g, out_elempack_g);
150

151
    // check blob shape
152
    if (!vkdev->shape_support_image_storage(shape_bordered_packed) || !vkdev->shape_support_image_storage(out_shape_packed))
153
    {
154
        support_image_storage = false;
155
        opt.use_image_storage = false;
156
    }
157

158
    // check weight shape
159
    if (channels == group && group == num_output)
160
    {
161
        Mat weight_data_packed(maxk, group / elempack, (void*)0, (size_t)4 * elempack, elempack);
162
        if (!vkdev->shape_support_image_storage(weight_data_packed))
163
        {
164
            support_image_storage = false;
165
            opt.use_image_storage = false;
166
        }
167
    }
168
    else
169
    {
170
        // check blob shape
171
        if (!vkdev->shape_support_image_storage(shape_bordered_g_packed) || !vkdev->shape_support_image_storage(out_shape_g_packed))
172
        {
173
            support_image_storage = false;
174
            opt.use_image_storage = false;
175
        }
176

177
        Mat weight_data_packed_groups(maxk, channels_g / elempack_g, num_output_g / out_elempack_g * group, (size_t)4 * elempack_g * out_elempack_g, elempack_g * out_elempack_g);
178
        if (!vkdev->shape_support_image_storage(weight_data_packed_groups))
179
        {
180
            support_image_storage = false;
181
            opt.use_image_storage = false;
182
        }
183
    }
184

185
    {
186
        padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
187
        padding->vkdev = vkdev;
188

189
        padding->bottom_shapes.resize(1);
190
        padding->bottom_shapes[0] = shape;
191
        padding->top_shapes.resize(1);
192
        padding->top_shapes[0] = shape_bordered;
193

194
        ncnn::ParamDict pd;
195
        pd.set(0, pad_top);
196
        pd.set(1, pad_bottom);
197
        pd.set(2, pad_left);
198
        pd.set(3, pad_right);
199
        pd.set(4, 0);
200
        pd.set(5, pad_value);
201

202
        padding->load_param(pd);
203

204
        padding->create_pipeline(opt);
205
    }
206

207
    std::vector<vk_specialization_type> specializations(11 + 10);
208
    specializations[0].i = kernel_w;
209
    specializations[1].i = kernel_h;
210
    specializations[2].i = dilation_w;
211
    specializations[3].i = dilation_h;
212
    specializations[4].i = stride_w;
213
    specializations[5].i = stride_h;
214
    specializations[6].i = bias_term;
215
    specializations[7].i = group;
216
    specializations[8].i = activation_type;
217
    specializations[9].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
218
    specializations[10].f = activation_params.w == 2 ? activation_params[1] : 0.f;
219

220
    // depth-wise
221
    if (channels == group && group == num_output)
222
    {
223
        Mat weight_data_r2 = weight_data.reshape(maxk, group);
224
        convert_packing(weight_data_r2, weight_data_packed, elempack, opt);
225

226
        if (bias_term)
227
        {
228
            convert_packing(bias_data, bias_data_packed, out_elempack, opt);
229
        }
230

231
        specializations[11 + 0].i = shape_bordered_packed.dims;
232
        specializations[11 + 1].i = shape_bordered_packed.w;
233
        specializations[11 + 2].i = shape_bordered_packed.h;
234
        specializations[11 + 3].i = shape_bordered_packed.c;
235
        specializations[11 + 4].i = shape_bordered_packed.cstep;
236
        specializations[11 + 5].i = out_shape_packed.dims;
237
        specializations[11 + 6].i = out_shape_packed.w;
238
        specializations[11 + 7].i = out_shape_packed.h;
239
        specializations[11 + 8].i = out_shape_packed.c;
240
        specializations[11 + 9].i = out_shape_packed.cstep;
241

242
        Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack), (void*)0);
243
        if (out_shape_packed.dims != 0)
244
        {
245
            local_size_xyz.w = std::min(8, out_shape_packed.w);
246
            local_size_xyz.h = std::min(8, out_shape_packed.h);
247
            local_size_xyz.c = std::min(4, out_shape_packed.c);
248
        }
249

250
        // pack1
251
        if (elempack == 1)
252
        {
253
            pipeline_convolutiondepthwise = new Pipeline(vkdev);
254
            pipeline_convolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz);
255
            pipeline_convolutiondepthwise->create(LayerShaderType::convolutiondepthwise, opt, specializations);
256
        }
257

258
        // pack4
259
        if (elempack == 4)
260
        {
261
            pipeline_convolutiondepthwise_pack4 = new Pipeline(vkdev);
262
            pipeline_convolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz);
263
            pipeline_convolutiondepthwise_pack4->create(LayerShaderType::convolutiondepthwise_pack4, opt, specializations);
264
        }
265

266
        // pack8
267
        if (elempack == 8)
268
        {
269
            pipeline_convolutiondepthwise_pack8 = new Pipeline(vkdev);
270
            pipeline_convolutiondepthwise_pack8->set_optimal_local_size_xyz(local_size_xyz);
271
            pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations);
272
        }
273

274
        if (opt.lightmode)
275
        {
276
            weight_data.release();
277
            bias_data.release();
278
        }
279

280
        return 0;
281
    }
282

283
    // src = kw-kh-inch-outch
284
    // dst = pa-pb-kw-kh-inch/pa-outch/pb
285
    {
286
        Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);
287

288
        weight_data_packed_groups.create(maxk, channels_g / elempack_g, num_output_g / out_elempack_g * group, (size_t)4 * elempack_g * out_elempack_g, elempack_g * out_elempack_g);
289

290
        for (int g = 0; g < group; g++)
291
        {
292
            const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
293

294
            Mat weight_data_packed = weight_data_packed_groups.channel_range(num_output_g / out_elempack_g * g, num_output_g / out_elempack_g);
295

296
            for (int q = 0; q + (out_elempack_g - 1) < num_output_g; q += out_elempack_g)
297
            {
298
                float* g00 = weight_data_packed.channel(q / out_elempack_g);
299

300
                for (int p = 0; p + (elempack_g - 1) < channels_g; p += elempack_g)
301
                {
302
                    for (int k = 0; k < maxk; k++)
303
                    {
304
                        for (int i = 0; i < out_elempack_g; i++)
305
                        {
306
                            const Mat k0 = weight_data_r2.channel(q + i);
307

308
                            for (int j = 0; j < elempack_g; j++)
309
                            {
310
                                const float* k00 = k0.row(p + j);
311

312
                                g00[0] = k00[k];
313

314
                                g00++;
315
                            }
316
                        }
317
                    }
318
                }
319
            }
320
        }
321
    }
322

323
    if (bias_term)
324
    {
325
        convert_packing(bias_data, bias_data_packed, out_elempack_g, opt);
326
    }
327

328
    specializations[11 + 0].i = shape_bordered_g_packed.dims;
329
    specializations[11 + 1].i = shape_bordered_g_packed.w;
330
    specializations[11 + 2].i = shape_bordered_g_packed.h;
331
    specializations[11 + 3].i = shape_bordered_g_packed.c;
332
    specializations[11 + 4].i = shape_bordered_g_packed.cstep;
333
    specializations[11 + 5].i = out_shape_g_packed.dims;
334
    specializations[11 + 6].i = out_shape_g_packed.w;
335
    specializations[11 + 7].i = out_shape_g_packed.h;
336
    specializations[11 + 8].i = out_shape_g_packed.c;
337
    specializations[11 + 9].i = out_shape_g_packed.cstep;
338

339
    Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack_g), (void*)0);
340
    if (out_shape_g_packed.dims != 0)
341
    {
342
        local_size_xyz.w = std::min(8, out_shape_g_packed.w);
343
        local_size_xyz.h = std::min(8, out_shape_g_packed.h);
344
        local_size_xyz.c = std::min(4, out_shape_g_packed.c);
345
    }
346

347
    // pack1
348
    if (elempack_g == 1 && out_elempack_g == 1)
349
    {
350
        pipeline_convolutiondepthwise_group = new Pipeline(vkdev);
351
        pipeline_convolutiondepthwise_group->set_optimal_local_size_xyz(local_size_xyz);
352
        pipeline_convolutiondepthwise_group->create(LayerShaderType::convolutiondepthwise_group, opt, specializations);
353
    }
354

355
    // pack4
356
    if (elempack_g == 4 && out_elempack_g == 4)
357
    {
358
        pipeline_convolutiondepthwise_group_pack4 = new Pipeline(vkdev);
359
        pipeline_convolutiondepthwise_group_pack4->set_optimal_local_size_xyz(local_size_xyz);
360
        pipeline_convolutiondepthwise_group_pack4->create(LayerShaderType::convolutiondepthwise_group_pack4, opt, specializations);
361
    }
362

363
    // pack1to4
364
    if (elempack_g == 1 && out_elempack_g == 4)
365
    {
366
        pipeline_convolutiondepthwise_group_pack1to4 = new Pipeline(vkdev);
367
        pipeline_convolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(local_size_xyz);
368
        pipeline_convolutiondepthwise_group_pack1to4->create(LayerShaderType::convolutiondepthwise_group_pack1to4, opt, specializations);
369
    }
370

371
    // pack4to1
372
    if (elempack_g == 4 && out_elempack_g == 1)
373
    {
374
        pipeline_convolutiondepthwise_group_pack4to1 = new Pipeline(vkdev);
375
        pipeline_convolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(local_size_xyz);
376
        pipeline_convolutiondepthwise_group_pack4to1->create(LayerShaderType::convolutiondepthwise_group_pack4to1, opt, specializations);
377
    }
378

379
    // pack8
380
    if (elempack_g == 8 && out_elempack_g == 8)
381
    {
382
        pipeline_convolutiondepthwise_group_pack8 = new Pipeline(vkdev);
383
        pipeline_convolutiondepthwise_group_pack8->set_optimal_local_size_xyz(local_size_xyz);
384
        pipeline_convolutiondepthwise_group_pack8->create(LayerShaderType::convolutiondepthwise_group_pack8, opt, specializations);
385
    }
386

387
    // pack1to8
388
    if (elempack_g == 1 && out_elempack_g == 8)
389
    {
390
        pipeline_convolutiondepthwise_group_pack1to8 = new Pipeline(vkdev);
391
        pipeline_convolutiondepthwise_group_pack1to8->set_optimal_local_size_xyz(local_size_xyz);
392
        pipeline_convolutiondepthwise_group_pack1to8->create(LayerShaderType::convolutiondepthwise_group_pack1to8, opt, specializations);
393
    }
394

395
    // pack4to8
396
    if (elempack_g == 4 && out_elempack_g == 8)
397
    {
398
        pipeline_convolutiondepthwise_group_pack4to8 = new Pipeline(vkdev);
399
        pipeline_convolutiondepthwise_group_pack4to8->set_optimal_local_size_xyz(local_size_xyz);
400
        pipeline_convolutiondepthwise_group_pack4to8->create(LayerShaderType::convolutiondepthwise_group_pack4to8, opt, specializations);
401
    }
402

403
    // pack8to4
404
    if (elempack_g == 8 && out_elempack_g == 4)
405
    {
406
        pipeline_convolutiondepthwise_group_pack8to4 = new Pipeline(vkdev);
407
        pipeline_convolutiondepthwise_group_pack8to4->set_optimal_local_size_xyz(local_size_xyz);
408
        pipeline_convolutiondepthwise_group_pack8to4->create(LayerShaderType::convolutiondepthwise_group_pack8to4, opt, specializations);
409
    }
410

411
    // pack8to1
412
    if (elempack_g == 8 && out_elempack_g == 1)
413
    {
414
        pipeline_convolutiondepthwise_group_pack8to1 = new Pipeline(vkdev);
415
        pipeline_convolutiondepthwise_group_pack8to1->set_optimal_local_size_xyz(local_size_xyz);
416
        pipeline_convolutiondepthwise_group_pack8to1->create(LayerShaderType::convolutiondepthwise_group_pack8to1, opt, specializations);
417
    }
418

419
    if (opt.lightmode)
420
    {
421
        weight_data.release();
422
        bias_data.release();
423
    }
424

425
    return 0;
426
}
427

428
int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)
429
{
430
    if (padding)
431
    {
432
        padding->destroy_pipeline(opt);
433
        delete padding;
434
        padding = 0;
435
    }
436

437
    delete pipeline_convolutiondepthwise;
438
    pipeline_convolutiondepthwise = 0;
439

440
    delete pipeline_convolutiondepthwise_pack4;
441
    pipeline_convolutiondepthwise_pack4 = 0;
442

443
    delete pipeline_convolutiondepthwise_pack8;
444
    pipeline_convolutiondepthwise_pack8 = 0;
445

446
    delete pipeline_convolutiondepthwise_group;
447
    pipeline_convolutiondepthwise_group = 0;
448

449
    delete pipeline_convolutiondepthwise_group_pack4;
450
    pipeline_convolutiondepthwise_group_pack4 = 0;
451

452
    delete pipeline_convolutiondepthwise_group_pack1to4;
453
    pipeline_convolutiondepthwise_group_pack1to4 = 0;
454

455
    delete pipeline_convolutiondepthwise_group_pack4to1;
456
    pipeline_convolutiondepthwise_group_pack4to1 = 0;
457

458
    delete pipeline_convolutiondepthwise_group_pack8;
459
    pipeline_convolutiondepthwise_group_pack8 = 0;
460

461
    delete pipeline_convolutiondepthwise_group_pack1to8;
462
    pipeline_convolutiondepthwise_group_pack1to8 = 0;
463

464
    delete pipeline_convolutiondepthwise_group_pack4to8;
465
    pipeline_convolutiondepthwise_group_pack4to8 = 0;
466

467
    delete pipeline_convolutiondepthwise_group_pack8to4;
468
    pipeline_convolutiondepthwise_group_pack8to4 = 0;
469

470
    delete pipeline_convolutiondepthwise_group_pack8to1;
471
    pipeline_convolutiondepthwise_group_pack8to1 = 0;
472

473
    return 0;
474
}
475

476
// Record the upload of the packed weights (and bias, when bias_term is set)
// to the device, then release the host-side packed copies. The destination
// is the image object when image storage is both supported and requested,
// otherwise the buffer object. The padding helper layer uploads first.
// Always returns 0.
int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
    if (padding)
    {
        padding->upload_model(cmd, opt);
    }

    const int maxk = kernel_w * kernel_h;
    const int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // the depth-wise path packs into weight_data_packed, the grouped path
    // into weight_data_packed_groups
    const bool is_depthwise = channels == group && group == num_output;
    Mat& packed_weights = is_depthwise ? weight_data_packed : weight_data_packed_groups;

    const bool to_image = support_image_storage && opt.use_image_storage;

    if (to_image)
        cmd.record_upload(packed_weights, weight_data_gpu_image, opt);
    else
        cmd.record_upload(packed_weights, weight_data_gpu, opt);

    packed_weights.release();

    if (!bias_term)
        return 0;

    if (to_image)
        cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
    else
        cmd.record_upload(bias_data_packed, bias_data_gpu, opt);

    bias_data_packed.release();

    return 0;
}
544

545
int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
546
{
547
    int w = bottom_blob.w;
548
    int h = bottom_blob.h;
549
    int channels = bottom_blob.c;
550
    size_t elemsize = bottom_blob.elemsize;
551
    int elempack = bottom_blob.elempack;
552

553
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
554
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
555

556
    VkMat bottom_blob_bordered = bottom_blob;
557
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
558
    {
559
        Option opt_pad = opt;
560
        opt_pad.blob_vkallocator = opt.workspace_vkallocator;
561

562
        padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
563
    }
564
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
565
    {
566
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
567
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
568
        if (wpad > 0 || hpad > 0)
569
        {
570
            Option opt_pad = opt;
571
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
572

573
            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
574
            int* padding_params = padding_param_blob.mapped();
575

576
            padding_params[0] = hpad / 2;
577
            padding_params[1] = hpad - hpad / 2;
578
            padding_params[2] = wpad / 2;
579
            padding_params[3] = wpad - wpad / 2;
580
            padding_params[4] = 0;
581
            padding_params[5] = 0;
582

583
            std::vector<VkMat> padding_inputs(2);
584
            padding_inputs[0] = bottom_blob;
585
            padding_inputs[1] = padding_param_blob;
586

587
            std::vector<VkMat> padding_outputs(1);
588
            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
589
            bottom_blob_bordered = padding_outputs[0];
590
        }
591
    }
592
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
593
    {
594
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
595
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
596
        if (wpad > 0 || hpad > 0)
597
        {
598
            Option opt_pad = opt;
599
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
600

601
            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
602
            int* padding_params = padding_param_blob.mapped();
603

604
            padding_params[0] = hpad - hpad / 2;
605
            padding_params[1] = hpad / 2;
606
            padding_params[2] = wpad - wpad / 2;
607
            padding_params[3] = wpad / 2;
608
            padding_params[4] = 0;
609
            padding_params[5] = 0;
610

611
            std::vector<VkMat> padding_inputs(2);
612
            padding_inputs[0] = bottom_blob;
613
            padding_inputs[1] = padding_param_blob;
614

615
            std::vector<VkMat> padding_outputs(1);
616
            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
617
            bottom_blob_bordered = padding_outputs[0];
618
        }
619
    }
620

621
    w = bottom_blob_bordered.w;
622
    h = bottom_blob_bordered.h;
623

624
    int outw = (w - kernel_extent_w) / stride_w + 1;
625
    int outh = (h - kernel_extent_h) / stride_h + 1;
626
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
627
    size_t out_elemsize = elemsize / elempack * out_elempack;
628

629
    if (opt.use_fp16_packed && !opt.use_fp16_storage)
630
    {
631
        if (out_elempack == 8) out_elemsize = 8 * 2u;
632
        if (out_elempack == 4) out_elemsize = 4 * 2u;
633
        if (out_elempack == 1) out_elemsize = 4u;
634
    }
635

636
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
637
    if (top_blob.empty())
638
        return -100;
639

640
    // depth-wise
641
    if (channels == group / elempack && group / elempack == num_output / elempack)
642
    {
643
        std::vector<VkMat> bindings(4);
644
        bindings[0] = bottom_blob_bordered;
645
        bindings[1] = top_blob;
646
        bindings[2] = weight_data_gpu;
647
        bindings[3] = bias_data_gpu;
648

649
        std::vector<vk_constant_type> constants(10);
650
        constants[0].i = bottom_blob_bordered.dims;
651
        constants[1].i = bottom_blob_bordered.w;
652
        constants[2].i = bottom_blob_bordered.h;
653
        constants[3].i = bottom_blob_bordered.c;
654
        constants[4].i = bottom_blob_bordered.cstep;
655
        constants[5].i = top_blob.dims;
656
        constants[6].i = top_blob.w;
657
        constants[7].i = top_blob.h;
658
        constants[8].i = top_blob.c;
659
        constants[9].i = top_blob.cstep;
660

661
        const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8
662
                                   : elempack == 4 ? pipeline_convolutiondepthwise_pack4
663
                                   : pipeline_convolutiondepthwise;
664

665
        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
666

667
        return 0;
668
    }
669

670
    const int channels_g = channels * elempack / group;
671
    const int num_output_g = num_output / group;
672

673
    int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
674
    int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
675
    size_t out_elemsize_g = elemsize / elempack * out_elempack_g;
676

677
    if (opt.use_fp16_packed && !opt.use_fp16_storage)
678
    {
679
        if (out_elempack_g == 8) out_elemsize_g = 8 * 2u;
680
        if (out_elempack_g == 4) out_elemsize_g = 4 * 2u;
681
        if (out_elempack_g == 1) out_elemsize_g = 4u;
682
    }
683

684
    // unpacking
685
    VkMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
686
    if (elempack > elempack_g)
687
    {
688
        Option opt_pack1 = opt;
689
        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
690

691
        vkdev->convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, elempack_g, cmd, opt_pack1);
692
    }
693

694
    VkMat top_blob_unpacked = top_blob;
695
    if (out_elempack_g < out_elempack)
696
    {
697
        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
698
        if (top_blob_unpacked.empty())
699
            return -100;
700
    }
701

702
    std::vector<VkMat> bindings(4);
703
    bindings[0] = bottom_blob_bordered_unpacked;
704
    bindings[1] = top_blob_unpacked;
705
    bindings[2] = weight_data_gpu;
706
    bindings[3] = bias_data_gpu;
707

708
    std::vector<vk_constant_type> constants(10);
709
    constants[0].i = bottom_blob_bordered_unpacked.dims;
710
    constants[1].i = bottom_blob_bordered_unpacked.w;
711
    constants[2].i = bottom_blob_bordered_unpacked.h;
712
    constants[3].i = bottom_blob_bordered_unpacked.c;
713
    constants[4].i = bottom_blob_bordered_unpacked.cstep;
714
    constants[5].i = top_blob_unpacked.dims;
715
    constants[6].i = top_blob_unpacked.w;
716
    constants[7].i = top_blob_unpacked.h;
717
    constants[8].i = top_blob_unpacked.c;
718
    constants[9].i = top_blob_unpacked.cstep;
719

720
    const Pipeline* pipeline = 0;
721
    if (elempack_g == 1 && out_elempack_g == 1)
722
    {
723
        pipeline = pipeline_convolutiondepthwise_group;
724
    }
725
    else if (elempack_g == 4 && out_elempack_g == 4)
726
    {
727
        pipeline = pipeline_convolutiondepthwise_group_pack4;
728
    }
729
    else if (elempack_g == 1 && out_elempack_g == 4)
730
    {
731
        pipeline = pipeline_convolutiondepthwise_group_pack1to4;
732
    }
733
    else if (elempack_g == 4 && out_elempack_g == 1)
734
    {
735
        pipeline = pipeline_convolutiondepthwise_group_pack4to1;
736
    }
737
    else if (elempack_g == 8 && out_elempack_g == 8)
738
    {
739
        pipeline = pipeline_convolutiondepthwise_group_pack8;
740
    }
741
    else if (elempack_g == 1 && out_elempack_g == 8)
742
    {
743
        pipeline = pipeline_convolutiondepthwise_group_pack1to8;
744
    }
745
    else if (elempack_g == 4 && out_elempack_g == 8)
746
    {
747
        pipeline = pipeline_convolutiondepthwise_group_pack4to8;
748
    }
749
    else if (elempack_g == 8 && out_elempack_g == 4)
750
    {
751
        pipeline = pipeline_convolutiondepthwise_group_pack8to4;
752
    }
753
    else if (elempack_g == 8 && out_elempack_g == 1)
754
    {
755
        pipeline = pipeline_convolutiondepthwise_group_pack8to1;
756
    }
757

758
    cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
759

760
    // packing
761
    if (out_elempack_g < out_elempack)
762
    {
763
        vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt);
764
    }
765
    else
766
    {
767
        top_blob = top_blob_unpacked;
768
    }
769

770
    return 0;
771
}
772

773
int ConvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
774
{
775
    int w = bottom_blob.w;
776
    int h = bottom_blob.h;
777
    int channels = bottom_blob.c;
778
    size_t elemsize = bottom_blob.elemsize;
779
    int elempack = bottom_blob.elempack;
780

781
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
782
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
783

784
    VkImageMat bottom_blob_bordered = bottom_blob;
785
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
786
    {
787
        Option opt_pad = opt;
788
        opt_pad.blob_vkallocator = opt.workspace_vkallocator;
789

790
        padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
791
    }
792
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
793
    {
794
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
795
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
796
        if (wpad > 0 || hpad > 0)
797
        {
798
            Option opt_pad = opt;
799
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
800

801
            VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
802
            int* padding_params = padding_param_blob.mapped();
803

804
            padding_params[0] = hpad / 2;
805
            padding_params[1] = hpad - hpad / 2;
806
            padding_params[2] = wpad / 2;
807
            padding_params[3] = wpad - wpad / 2;
808
            padding_params[4] = 0;
809
            padding_params[5] = 0;
810

811
            std::vector<VkImageMat> padding_inputs(2);
812
            padding_inputs[0] = bottom_blob;
813
            padding_inputs[1] = padding_param_blob;
814

815
            std::vector<VkImageMat> padding_outputs(1);
816
            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
817
            bottom_blob_bordered = padding_outputs[0];
818
        }
819
    }
820
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
821
    {
822
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
823
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
824
        if (wpad > 0 || hpad > 0)
825
        {
826
            Option opt_pad = opt;
827
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
828

829
            VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
830
            int* padding_params = padding_param_blob.mapped();
831

832
            padding_params[0] = hpad - hpad / 2;
833
            padding_params[1] = hpad / 2;
834
            padding_params[2] = wpad - wpad / 2;
835
            padding_params[3] = wpad / 2;
836
            padding_params[4] = 0;
837
            padding_params[5] = 0;
838

839
            std::vector<VkImageMat> padding_inputs(2);
840
            padding_inputs[0] = bottom_blob;
841
            padding_inputs[1] = padding_param_blob;
842

843
            std::vector<VkImageMat> padding_outputs(1);
844
            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
845
            bottom_blob_bordered = padding_outputs[0];
846
        }
847
    }
848

849
    w = bottom_blob_bordered.w;
850
    h = bottom_blob_bordered.h;
851

852
    int outw = (w - kernel_extent_w) / stride_w + 1;
853
    int outh = (h - kernel_extent_h) / stride_h + 1;
854
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
855
    size_t out_elemsize = elemsize / elempack * out_elempack;
856

857
    if (opt.use_fp16_packed && !opt.use_fp16_storage)
858
    {
859
        if (out_elempack == 8) out_elemsize = 8 * 2u;
860
        if (out_elempack == 4) out_elemsize = 4 * 2u;
861
        if (out_elempack == 1) out_elemsize = 4u;
862
    }
863

864
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
865
    if (top_blob.empty())
866
        return -100;
867

868
    // depth-wise
869
    if (channels == group / elempack && group / elempack == num_output / elempack)
870
    {
871
        std::vector<VkImageMat> bindings(4);
872
        bindings[0] = bottom_blob_bordered;
873
        bindings[1] = top_blob;
874
        bindings[2] = weight_data_gpu_image;
875
        bindings[3] = bias_data_gpu_image;
876

877
        std::vector<vk_constant_type> constants(10);
878
        constants[0].i = bottom_blob_bordered.dims;
879
        constants[1].i = bottom_blob_bordered.w;
880
        constants[2].i = bottom_blob_bordered.h;
881
        constants[3].i = bottom_blob_bordered.c;
882
        constants[4].i = 0; //bottom_blob_bordered.cstep;
883
        constants[5].i = top_blob.dims;
884
        constants[6].i = top_blob.w;
885
        constants[7].i = top_blob.h;
886
        constants[8].i = top_blob.c;
887
        constants[9].i = 0; //top_blob.cstep;
888

889
        const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8
890
                                   : elempack == 4 ? pipeline_convolutiondepthwise_pack4
891
                                   : pipeline_convolutiondepthwise;
892

893
        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
894

895
        return 0;
896
    }
897

898
    const int channels_g = channels * elempack / group;
899
    const int num_output_g = num_output / group;
900

901
    int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
902
    int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
903
    size_t out_elemsize_g = elemsize / elempack * out_elempack_g;
904

905
    if (opt.use_fp16_packed && !opt.use_fp16_storage)
906
    {
907
        if (out_elempack_g == 8) out_elemsize_g = 8 * 2u;
908
        if (out_elempack_g == 4) out_elemsize_g = 4 * 2u;
909
        if (out_elempack_g == 1) out_elemsize_g = 4u;
910
    }
911

912
    // unpacking
913
    VkImageMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
914
    if (elempack > elempack_g)
915
    {
916
        Option opt_pack1 = opt;
917
        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
918

919
        vkdev->convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, elempack_g, cmd, opt_pack1);
920
    }
921

922
    VkImageMat top_blob_unpacked = top_blob;
923
    if (out_elempack_g < out_elempack)
924
    {
925
        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
926
        if (top_blob_unpacked.empty())
927
            return -100;
928
    }
929

930
    std::vector<VkImageMat> bindings(4);
931
    bindings[0] = bottom_blob_bordered_unpacked;
932
    bindings[1] = top_blob_unpacked;
933
    bindings[2] = weight_data_gpu_image;
934
    bindings[3] = bias_data_gpu_image;
935

936
    std::vector<vk_constant_type> constants(10);
937
    constants[0].i = bottom_blob_bordered_unpacked.dims;
938
    constants[1].i = bottom_blob_bordered_unpacked.w;
939
    constants[2].i = bottom_blob_bordered_unpacked.h;
940
    constants[3].i = bottom_blob_bordered_unpacked.c;
941
    constants[4].i = 0; //bottom_blob_bordered_unpacked.cstep;
942
    constants[5].i = top_blob_unpacked.dims;
943
    constants[6].i = top_blob_unpacked.w;
944
    constants[7].i = top_blob_unpacked.h;
945
    constants[8].i = top_blob_unpacked.c;
946
    constants[9].i = 0; //top_blob_unpacked.cstep;
947

948
    const Pipeline* pipeline = 0;
949
    if (elempack_g == 1 && out_elempack_g == 1)
950
    {
951
        pipeline = pipeline_convolutiondepthwise_group;
952
    }
953
    else if (elempack_g == 4 && out_elempack_g == 4)
954
    {
955
        pipeline = pipeline_convolutiondepthwise_group_pack4;
956
    }
957
    else if (elempack_g == 1 && out_elempack_g == 4)
958
    {
959
        pipeline = pipeline_convolutiondepthwise_group_pack1to4;
960
    }
961
    else if (elempack_g == 4 && out_elempack_g == 1)
962
    {
963
        pipeline = pipeline_convolutiondepthwise_group_pack4to1;
964
    }
965
    else if (elempack_g == 8 && out_elempack_g == 8)
966
    {
967
        pipeline = pipeline_convolutiondepthwise_group_pack8;
968
    }
969
    else if (elempack_g == 1 && out_elempack_g == 8)
970
    {
971
        pipeline = pipeline_convolutiondepthwise_group_pack1to8;
972
    }
973
    else if (elempack_g == 4 && out_elempack_g == 8)
974
    {
975
        pipeline = pipeline_convolutiondepthwise_group_pack4to8;
976
    }
977
    else if (elempack_g == 8 && out_elempack_g == 4)
978
    {
979
        pipeline = pipeline_convolutiondepthwise_group_pack8to4;
980
    }
981
    else if (elempack_g == 8 && out_elempack_g == 1)
982
    {
983
        pipeline = pipeline_convolutiondepthwise_group_pack8to1;
984
    }
985

986
    cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
987

988
    // packing
989
    if (out_elempack_g < out_elempack)
990
    {
991
        vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt);
992
    }
993
    else
994
    {
995
        top_blob = top_blob_unpacked;
996
    }
997

998
    return 0;
999
}
1000

1001
} // namespace ncnn
1002
ncnn

Использование cookies