1
// Tencent is pleased to support the open source community by making ncnn available.
3
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
8
// https://opensource.org/licenses/BSD-3-Clause
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
15
#include "instancenorm_vulkan.h"
17
#include "layer_shader_type.h"
21
// Constructor: flags the layer as supporting Vulkan and image storage, and
// clears every pipeline pointer to 0. The pipelines themselves are allocated
// in create_pipeline() and released in destroy_pipeline().
InstanceNorm_vulkan::InstanceNorm_vulkan()
23
support_vulkan = true;
24
support_image_storage = true;
26
// Pipelines for elempack == 1. The fp32 reduce_sum4 pipeline is kept as a
// pair of instances ([0] and [1]).
pipeline_instancenorm_reduce_sum4_fp16_to_fp32 = 0;
27
pipeline_instancenorm_reduce_sum4_fp32[0] = 0;
28
pipeline_instancenorm_reduce_sum4_fp32[1] = 0;
29
pipeline_instancenorm_reduce_mean = 0;
30
pipeline_instancenorm_sub_mean_square = 0;
31
pipeline_instancenorm_coeffs = 0;
32
pipeline_instancenorm_norm = 0;
34
// Pipelines for elempack == 4.
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 0;
35
pipeline_instancenorm_reduce_sum4_fp32_pack4[0] = 0;
36
pipeline_instancenorm_reduce_sum4_fp32_pack4[1] = 0;
37
pipeline_instancenorm_reduce_mean_pack4 = 0;
38
pipeline_instancenorm_sub_mean_square_pack4 = 0;
39
pipeline_instancenorm_coeffs_pack4 = 0;
40
pipeline_instancenorm_norm_pack4 = 0;
42
// Pipelines for elempack == 8.
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack8 = 0;
43
pipeline_instancenorm_reduce_sum4_fp32_pack8[0] = 0;
44
pipeline_instancenorm_reduce_sum4_fp32_pack8[1] = 0;
45
pipeline_instancenorm_reduce_mean_pack8 = 0;
46
pipeline_instancenorm_sub_mean_square_pack8 = 0;
47
pipeline_instancenorm_coeffs_pack8 = 0;
48
pipeline_instancenorm_norm_pack8 = 0;
51
// Creates the compute pipelines used by forward_inplace().
//
// Five groups of pipelines are built, each in up to three packing variants
// (elempack 1 / 4 / 8): reduce_sum4, reduce_mean, sub_mean_square, coeffs
// and norm. A variant is created when it matches the elempack chosen here,
// or for all variants when the channel count is 0 (unknown at this point).
//
// opt: supplies the packing / storage switches read below
//      (use_shader_pack8, use_fp16_storage, use_fp16_packed,
//      use_image_storage) and is forwarded to every Pipeline::create call.
//
// NOTE(review): `elempack`, `elemsize`, `shape_packed` and the first
// `local_size_xyz` are used below without a visible declaration, and no
// `else` keyword or return statement is visible — confirm against the
// complete function.
int InstanceNorm_vulkan::create_pipeline(const Option& opt)
53
// Use the first top-shape hint when one exists, otherwise an empty Mat.
const Mat& shape = top_shapes.empty() ? Mat() : top_shapes[0];
55
// A 3-D shape hint overrides the layer's `channels` value.
int _channels = channels;
56
if (shape.dims == 3) _channels = shape.c;
59
// Packing is only chosen when the channel count is known:
// 8 if pack8 shaders are enabled and channels divide by 8, else 4 if they
// divide by 4, else 1.
if (_channels != 0) elempack = opt.use_shader_pack8 && _channels % 8 == 0 ? 8 : _channels % 4 == 0 ? 4 : 1;
62
// Element size in bytes: 2 bytes per lane with fp16 storage.
if (opt.use_fp16_storage)
64
elemsize = elempack * 2u;
66
// fp16 packed: 4 bytes for an unpacked element, 2 bytes per lane otherwise.
else if (opt.use_fp16_packed)
68
elemsize = elempack == 1 ? 4u : elempack * 2u;
72
// 4 bytes per lane — presumably the remaining (else) branch; confirm.
elemsize = elempack * 4u;
76
// Packed view of the shape hint (no data pointer, shape metadata only).
if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
78
// TODO resolve workspace_shape.w
79
// Only the packed channel count of the reduction workspace is known here;
// w and h are placeholders of 1.
Mat workspace_shape_packed;
80
if (_channels != 0) workspace_shape_packed = Mat(1, 1, _channels / elempack, (void*)0, elemsize, elempack);
84
// --- reduce_sum4 pipelines ---
// Local size with image storage: 4 x 4 x min(4, packed channels).
if (opt.use_image_storage)
86
local_size_xyz = Mat(4, 4, _channels ? std::min(4, _channels / elempack) : 4, (void*)0);
87
if (workspace_shape_packed.dims != 0)
91
local_size_xyz.c = std::min(4, workspace_shape_packed.c);
96
// Local size 16 x 1 x min(4, packed channels) — presumably the non-image
// (buffer) branch; confirm.
local_size_xyz = Mat(16, 1, _channels ? std::min(4, _channels / elempack) : 4, (void*)0);
97
if (workspace_shape_packed.dims != 0)
99
local_size_xyz.w = 16;
100
local_size_xyz.h = 1;
101
local_size_xyz.c = std::min(4, workspace_shape_packed.c);
106
// reduce_sum4, elempack 1: one fp16_to_fp32 pipeline plus two fp32
// pipelines, all created without specialization constants.
if (elempack == 1 || _channels == 0)
108
pipeline_instancenorm_reduce_sum4_fp16_to_fp32 = new Pipeline(vkdev);
109
pipeline_instancenorm_reduce_sum4_fp16_to_fp32->set_optimal_local_size_xyz(local_size_xyz);
110
pipeline_instancenorm_reduce_sum4_fp16_to_fp32->create(LayerShaderType::instancenorm_reduce_sum4_fp16_to_fp32, opt, std::vector<vk_specialization_type>());
112
pipeline_instancenorm_reduce_sum4_fp32[0] = new Pipeline(vkdev);
113
pipeline_instancenorm_reduce_sum4_fp32[0]->set_optimal_local_size_xyz(local_size_xyz);
114
pipeline_instancenorm_reduce_sum4_fp32[0]->create(LayerShaderType::instancenorm_reduce_sum4_fp32, opt, std::vector<vk_specialization_type>());
115
pipeline_instancenorm_reduce_sum4_fp32[1] = new Pipeline(vkdev);
116
pipeline_instancenorm_reduce_sum4_fp32[1]->set_optimal_local_size_xyz(local_size_xyz);
117
pipeline_instancenorm_reduce_sum4_fp32[1]->create(LayerShaderType::instancenorm_reduce_sum4_fp32, opt, std::vector<vk_specialization_type>());
121
// reduce_sum4, elempack 4.
if (elempack == 4 || _channels == 0)
123
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = new Pipeline(vkdev);
124
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4->set_optimal_local_size_xyz(local_size_xyz);
125
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4->create(LayerShaderType::instancenorm_reduce_sum4_fp16_to_fp32_pack4, opt, std::vector<vk_specialization_type>());
127
pipeline_instancenorm_reduce_sum4_fp32_pack4[0] = new Pipeline(vkdev);
128
pipeline_instancenorm_reduce_sum4_fp32_pack4[0]->set_optimal_local_size_xyz(local_size_xyz);
129
pipeline_instancenorm_reduce_sum4_fp32_pack4[0]->create(LayerShaderType::instancenorm_reduce_sum4_fp32_pack4, opt, std::vector<vk_specialization_type>());
130
pipeline_instancenorm_reduce_sum4_fp32_pack4[1] = new Pipeline(vkdev);
131
pipeline_instancenorm_reduce_sum4_fp32_pack4[1]->set_optimal_local_size_xyz(local_size_xyz);
132
pipeline_instancenorm_reduce_sum4_fp32_pack4[1]->create(LayerShaderType::instancenorm_reduce_sum4_fp32_pack4, opt, std::vector<vk_specialization_type>());
136
// reduce_sum4, elempack 8.
if (elempack == 8 || _channels == 0)
138
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack8 = new Pipeline(vkdev);
139
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack8->set_optimal_local_size_xyz(local_size_xyz);
140
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack8->create(LayerShaderType::instancenorm_reduce_sum4_fp16_to_fp32_pack8, opt, std::vector<vk_specialization_type>());
142
pipeline_instancenorm_reduce_sum4_fp32_pack8[0] = new Pipeline(vkdev);
143
pipeline_instancenorm_reduce_sum4_fp32_pack8[0]->set_optimal_local_size_xyz(local_size_xyz);
144
pipeline_instancenorm_reduce_sum4_fp32_pack8[0]->create(LayerShaderType::instancenorm_reduce_sum4_fp32_pack8, opt, std::vector<vk_specialization_type>());
145
pipeline_instancenorm_reduce_sum4_fp32_pack8[1] = new Pipeline(vkdev);
146
pipeline_instancenorm_reduce_sum4_fp32_pack8[1]->set_optimal_local_size_xyz(local_size_xyz);
147
pipeline_instancenorm_reduce_sum4_fp32_pack8[1]->create(LayerShaderType::instancenorm_reduce_sum4_fp32_pack8, opt, std::vector<vk_specialization_type>());
152
// --- reduce_mean pipelines ---
// Four specialization constants describe the workspace shape; only the
// packed channel count is filled in, the other three stay 0 (see TODOs).
std::vector<vk_specialization_type> specializations(0 + 4);
153
specializations[0].i = 0; // TODO resolve workspace_shape_packed.w;
154
specializations[1].i = 0; // TODO resolve workspace_shape_packed.h;
155
specializations[2].i = workspace_shape_packed.c;
156
specializations[3].i = 0; // TODO resolve workspace_shape_packed.cstep;
158
// Local size min(64, packed channels) x 1 x 1, or 64 when channels are unknown.
Mat local_size_xyz(_channels ? std::min(64, _channels / elempack) : 64, 1, 1, (void*)0);
159
if (workspace_shape_packed.dims != 0)
161
local_size_xyz.w = std::min(64, workspace_shape_packed.c);
162
local_size_xyz.h = 1;
163
local_size_xyz.c = 1;
166
if (elempack == 1 || _channels == 0)
168
pipeline_instancenorm_reduce_mean = new Pipeline(vkdev);
169
pipeline_instancenorm_reduce_mean->set_optimal_local_size_xyz(local_size_xyz);
170
pipeline_instancenorm_reduce_mean->create(LayerShaderType::instancenorm_reduce_mean, opt, specializations);
173
if (elempack == 4 || _channels == 0)
175
pipeline_instancenorm_reduce_mean_pack4 = new Pipeline(vkdev);
176
pipeline_instancenorm_reduce_mean_pack4->set_optimal_local_size_xyz(local_size_xyz);
177
pipeline_instancenorm_reduce_mean_pack4->create(LayerShaderType::instancenorm_reduce_mean_pack4, opt, specializations);
180
if (elempack == 8 || _channels == 0)
182
pipeline_instancenorm_reduce_mean_pack8 = new Pipeline(vkdev);
183
pipeline_instancenorm_reduce_mean_pack8->set_optimal_local_size_xyz(local_size_xyz);
184
pipeline_instancenorm_reduce_mean_pack8->create(LayerShaderType::instancenorm_reduce_mean_pack8, opt, specializations);
188
// --- sub_mean_square pipelines ---
// The square workspace has the input's packed shape but always uses
// 4 bytes per lane, independent of `elemsize`.
Mat square_workspace_packed;
189
if (shape.dims == 3) square_workspace_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elempack * 4u, elempack);
192
// Ten specialization constants: dims/w/h/c/cstep of the input shape,
// followed by the same five fields of the square workspace.
std::vector<vk_specialization_type> specializations(0 + 10);
193
specializations[0 + 0].i = shape_packed.dims;
194
specializations[0 + 1].i = shape_packed.w;
195
specializations[0 + 2].i = shape_packed.h;
196
specializations[0 + 3].i = shape_packed.c;
197
specializations[0 + 4].i = shape_packed.cstep;
198
specializations[0 + 5].i = square_workspace_packed.dims;
199
specializations[0 + 6].i = square_workspace_packed.w;
200
specializations[0 + 7].i = square_workspace_packed.h;
201
specializations[0 + 8].i = square_workspace_packed.c;
202
specializations[0 + 9].i = square_workspace_packed.cstep;
204
// Local size 4 x 4 x 4 by default, clamped per axis to the workspace
// extent when the shape is known.
Mat local_size_xyz(4, 4, _channels ? std::min(4, _channels / elempack) : 4, (void*)0);
205
if (square_workspace_packed.dims != 0)
207
local_size_xyz.w = std::min(4, square_workspace_packed.w);
208
local_size_xyz.h = std::min(4, square_workspace_packed.h);
209
local_size_xyz.c = std::min(4, square_workspace_packed.c);
212
if (elempack == 1 || _channels == 0)
214
pipeline_instancenorm_sub_mean_square = new Pipeline(vkdev);
215
pipeline_instancenorm_sub_mean_square->set_optimal_local_size_xyz(local_size_xyz);
216
pipeline_instancenorm_sub_mean_square->create(LayerShaderType::instancenorm_sub_mean_square, opt, specializations);
219
if (elempack == 4 || _channels == 0)
221
pipeline_instancenorm_sub_mean_square_pack4 = new Pipeline(vkdev);
222
pipeline_instancenorm_sub_mean_square_pack4->set_optimal_local_size_xyz(local_size_xyz);
223
pipeline_instancenorm_sub_mean_square_pack4->create(LayerShaderType::instancenorm_sub_mean_square_pack4, opt, specializations);
226
if (elempack == 8 || _channels == 0)
228
pipeline_instancenorm_sub_mean_square_pack8 = new Pipeline(vkdev);
229
pipeline_instancenorm_sub_mean_square_pack8->set_optimal_local_size_xyz(local_size_xyz);
230
pipeline_instancenorm_sub_mean_square_pack8->create(LayerShaderType::instancenorm_sub_mean_square_pack8, opt, specializations);
235
// --- coeffs pipelines ---
// Specialization constants: eps, the affine flag and the packed channel count.
std::vector<vk_specialization_type> specializations(3);
236
specializations[0].f = eps;
237
specializations[1].i = affine;
238
specializations[2].i = _channels / elempack;
240
// Same local size rule as the reduce_mean pipelines above.
Mat local_size_xyz(_channels ? std::min(64, _channels / elempack) : 64, 1, 1, (void*)0);
241
if (workspace_shape_packed.dims != 0)
243
local_size_xyz.w = std::min(64, workspace_shape_packed.c);
244
local_size_xyz.h = 1;
245
local_size_xyz.c = 1;
248
if (elempack == 1 || _channels == 0)
250
pipeline_instancenorm_coeffs = new Pipeline(vkdev);
251
pipeline_instancenorm_coeffs->set_optimal_local_size_xyz(local_size_xyz);
252
pipeline_instancenorm_coeffs->create(LayerShaderType::instancenorm_coeffs, opt, specializations);
255
if (elempack == 4 || _channels == 0)
257
pipeline_instancenorm_coeffs_pack4 = new Pipeline(vkdev);
258
pipeline_instancenorm_coeffs_pack4->set_optimal_local_size_xyz(local_size_xyz);
259
pipeline_instancenorm_coeffs_pack4->create(LayerShaderType::instancenorm_coeffs_pack4, opt, specializations);
262
if (elempack == 8 || _channels == 0)
264
pipeline_instancenorm_coeffs_pack8 = new Pipeline(vkdev);
265
pipeline_instancenorm_coeffs_pack8->set_optimal_local_size_xyz(local_size_xyz);
266
pipeline_instancenorm_coeffs_pack8->create(LayerShaderType::instancenorm_coeffs_pack8, opt, specializations);
271
// --- norm pipelines ---
// Five specialization constants: dims/w/h/c/cstep of the packed input shape.
std::vector<vk_specialization_type> specializations(0 + 5);
272
specializations[0 + 0].i = shape_packed.dims;
273
specializations[0 + 1].i = shape_packed.w;
274
specializations[0 + 2].i = shape_packed.h;
275
specializations[0 + 3].i = shape_packed.c;
276
specializations[0 + 4].i = shape_packed.cstep;
278
// Local size 4 x 4 x 4 by default, clamped per axis to the packed input
// shape when it is known.
Mat local_size_xyz(4, 4, _channels ? std::min(4, _channels / elempack) : 4, (void*)0);
279
if (shape_packed.dims != 0)
281
local_size_xyz.w = std::min(4, shape_packed.w);
282
local_size_xyz.h = std::min(4, shape_packed.h);
283
local_size_xyz.c = std::min(4, shape_packed.c);
286
if (elempack == 1 || _channels == 0)
288
pipeline_instancenorm_norm = new Pipeline(vkdev);
289
pipeline_instancenorm_norm->set_optimal_local_size_xyz(local_size_xyz);
290
pipeline_instancenorm_norm->create(LayerShaderType::instancenorm_norm, opt, specializations);
293
if (elempack == 4 || _channels == 0)
295
pipeline_instancenorm_norm_pack4 = new Pipeline(vkdev);
296
pipeline_instancenorm_norm_pack4->set_optimal_local_size_xyz(local_size_xyz);
297
pipeline_instancenorm_norm_pack4->create(LayerShaderType::instancenorm_norm_pack4, opt, specializations);
300
if (elempack == 8 || _channels == 0)
302
pipeline_instancenorm_norm_pack8 = new Pipeline(vkdev);
303
pipeline_instancenorm_norm_pack8->set_optimal_local_size_xyz(local_size_xyz);
304
pipeline_instancenorm_norm_pack8->create(LayerShaderType::instancenorm_norm_pack8, opt, specializations);
311
// Releases every pipeline allocated by create_pipeline() and resets each
// pointer to 0. Variants that were never created are still 0 (set by the
// constructor), and deleting a null pointer is a no-op, so no guards are
// needed. The Option argument is unused.
int InstanceNorm_vulkan::destroy_pipeline(const Option& /*opt*/)
313
// elempack == 1 pipelines.
delete pipeline_instancenorm_reduce_sum4_fp16_to_fp32;
314
pipeline_instancenorm_reduce_sum4_fp16_to_fp32 = 0;
316
delete pipeline_instancenorm_reduce_sum4_fp32[0];
317
delete pipeline_instancenorm_reduce_sum4_fp32[1];
318
pipeline_instancenorm_reduce_sum4_fp32[0] = 0;
319
pipeline_instancenorm_reduce_sum4_fp32[1] = 0;
321
delete pipeline_instancenorm_reduce_mean;
322
pipeline_instancenorm_reduce_mean = 0;
324
delete pipeline_instancenorm_sub_mean_square;
325
pipeline_instancenorm_sub_mean_square = 0;
327
delete pipeline_instancenorm_coeffs;
328
pipeline_instancenorm_coeffs = 0;
330
delete pipeline_instancenorm_norm;
331
pipeline_instancenorm_norm = 0;
333
// elempack == 4 pipelines.
delete pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4;
334
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 0;
336
delete pipeline_instancenorm_reduce_sum4_fp32_pack4[0];
337
delete pipeline_instancenorm_reduce_sum4_fp32_pack4[1];
338
pipeline_instancenorm_reduce_sum4_fp32_pack4[0] = 0;
339
pipeline_instancenorm_reduce_sum4_fp32_pack4[1] = 0;
341
delete pipeline_instancenorm_reduce_mean_pack4;
342
pipeline_instancenorm_reduce_mean_pack4 = 0;
344
delete pipeline_instancenorm_sub_mean_square_pack4;
345
pipeline_instancenorm_sub_mean_square_pack4 = 0;
347
delete pipeline_instancenorm_coeffs_pack4;
348
pipeline_instancenorm_coeffs_pack4 = 0;
350
delete pipeline_instancenorm_norm_pack4;
351
pipeline_instancenorm_norm_pack4 = 0;
353
// elempack == 8 pipelines.
delete pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack8;
354
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack8 = 0;
356
delete pipeline_instancenorm_reduce_sum4_fp32_pack8[0];
357
delete pipeline_instancenorm_reduce_sum4_fp32_pack8[1];
358
pipeline_instancenorm_reduce_sum4_fp32_pack8[0] = 0;
359
pipeline_instancenorm_reduce_sum4_fp32_pack8[1] = 0;
361
delete pipeline_instancenorm_reduce_mean_pack8;
362
pipeline_instancenorm_reduce_mean_pack8 = 0;
364
delete pipeline_instancenorm_sub_mean_square_pack8;
365
pipeline_instancenorm_sub_mean_square_pack8 = 0;
367
delete pipeline_instancenorm_coeffs_pack8;
368
pipeline_instancenorm_coeffs_pack8 = 0;
370
delete pipeline_instancenorm_norm_pack8;
371
pipeline_instancenorm_norm_pack8 = 0;
376
// Repacks the gamma and beta weights to the layer's element packing and
// records their upload to the GPU on `cmd`.
//
// cmd: transfer command recorder that receives the upload requests.
// opt: supplies use_shader_pack8 and use_image_storage, and is forwarded to
//      convert_packing / record_upload.
//
// NOTE(review): no `else` keyword is visible between the image and buffer
// uploads below — confirm they are alternative branches rather than both
// executed.
int InstanceNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
381
// Same packing rule as create_pipeline(), but based on the layer's
// `channels` value only (no shape hint is consulted here).
int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
383
// gamma: repack on the host, then upload.
Mat gamma_data_packed;
384
convert_packing(gamma_data, gamma_data_packed, elempack, opt);
386
if (opt.use_image_storage)
388
cmd.record_upload(gamma_data_packed, gamma_data_gpu_image, opt);
392
// Buffer-backed copy of gamma — presumably the non-image branch; confirm.
cmd.record_upload(gamma_data_packed, gamma_data_gpu, opt);
395
// beta: same treatment as gamma.
Mat beta_data_packed;
396
convert_packing(beta_data, beta_data_packed, elempack, opt);
398
if (opt.use_image_storage)
400
cmd.record_upload(beta_data_packed, beta_data_gpu_image, opt);
404
// Buffer-backed copy of beta — presumably the non-image branch; confirm.
cmd.record_upload(beta_data_packed, beta_data_gpu, opt);
410
// Buffer-storage (VkMat) forward pass. Records a chain of compute dispatches
// on `cmd` that ends by running the norm pipeline on bottom_top_blob itself:
//   1. reduce_sum4 passes shrink the flattened w*h extent of the input into
//      sum_workspace until its width is at most 4,
//   2. reduce_mean writes into mean_workspace,
//   3. sub_mean_square writes into square_workspace from the input and
//      mean_workspace,
//   4. the same reduce_sum4 / reduce_mean passes turn square_workspace into
//      var_workspace,
//   5. the coeffs pipeline combines mean, var, gamma and beta into
//      coeffs_workspace,
//   6. the norm pipeline is run on bottom_top_blob with coeffs_workspace.
// Every stage picks the pack8 / pack4 / pack1 pipeline from the blob's elempack.
//
// NOTE(review): `sum_workspace`, `reduced_h`, `pb` and `size` are used below
// without a visible declaration, and constants[1] / constants[5] of the
// reduction stages are never assigned in the visible lines — confirm against
// the complete function.
int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
412
int w = bottom_top_blob.w;
413
int h = bottom_top_blob.h;
414
int c = bottom_top_blob.c;
416
size_t elemsize = bottom_top_blob.elemsize;
417
int elempack = bottom_top_blob.elempack;
420
// One value per channel, 4 bytes per packed lane, from the workspace allocator.
VkMat mean_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
425
// --- stage 1a: first sum reduction, straight from the input blob ---
// Output width is ceil(w * h / 4): the spatial plane is treated as flat.
int reduced_w = (bottom_top_blob.w * bottom_top_blob.h + 3) / 4;
427
int reduced_c = bottom_top_blob.c;
429
sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
431
std::vector<VkMat> bindings(2);
432
bindings[0] = bottom_top_blob;
433
bindings[1] = sum_workspace;
435
// Constants: source extent in [0..3], destination extent in [4..7].
std::vector<vk_constant_type> constants(8);
436
constants[0].i = bottom_top_blob.w * bottom_top_blob.h;
438
constants[2].i = bottom_top_blob.c;
439
constants[3].i = bottom_top_blob.cstep;
440
constants[4].i = sum_workspace.w;
442
constants[6].i = sum_workspace.c;
443
constants[7].i = sum_workspace.cstep;
445
// The first pass uses the fp16_to_fp32 variant because it reads the input blob.
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack8
446
: elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4
447
: pipeline_instancenorm_reduce_sum4_fp16_to_fp32;
449
cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
454
// --- stage 1b: keep reducing until at most 4 partial sums remain per channel ---
while (sum_workspace.w > 4)
456
int reduced_w = (sum_workspace.w + 3) / 4;
458
int reduced_c = sum_workspace.c;
460
VkMat sum_workspace_reduced;
461
sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
464
std::vector<VkMat> bindings(2);
465
bindings[0] = sum_workspace;
466
bindings[1] = sum_workspace_reduced;
468
std::vector<vk_constant_type> constants(8);
469
constants[0].i = sum_workspace.w;
471
constants[2].i = sum_workspace.c;
472
constants[3].i = sum_workspace.cstep;
473
constants[4].i = sum_workspace_reduced.w;
475
constants[6].i = sum_workspace_reduced.c;
476
constants[7].i = sum_workspace_reduced.cstep;
478
// pb % 2 selects one of the two fp32 pipeline instances; `pb` is
// presumably a pass counter advanced each iteration — TODO confirm.
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_sum4_fp32_pack8[pb % 2]
479
: elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2]
480
: pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
482
cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced);
487
// The reduced buffer becomes the source of the next pass.
sum_workspace = sum_workspace_reduced;
491
// --- stage 2: reduce_mean from the remaining partial sums into mean_workspace ---
std::vector<VkMat> bindings(2);
492
bindings[0] = sum_workspace;
493
bindings[1] = mean_workspace;
495
std::vector<vk_constant_type> constants(5);
496
constants[0].i = sum_workspace.w;
498
constants[2].i = sum_workspace.c;
499
constants[3].i = sum_workspace.cstep;
500
// Passed as a float; presumably the per-channel element count — TODO confirm.
constants[4].f = size;
502
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_mean_pack8
503
: elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4
504
: pipeline_instancenorm_reduce_mean;
506
cmd.record_pipeline(pipeline, bindings, constants, mean_workspace);
511
// Per-channel result of the second reduction; same layout as mean_workspace.
VkMat var_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
513
// sub mean and square
514
// --- stage 3: square_workspace has the input's w/h/c, 4 bytes per lane ---
VkMat square_workspace;
515
square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator);
517
std::vector<VkMat> bindings(3);
518
bindings[0] = bottom_top_blob;
519
bindings[1] = mean_workspace;
520
bindings[2] = square_workspace;
522
// Constants: dims/w/h/c/cstep of the input, then of the square workspace.
std::vector<vk_constant_type> constants(10);
523
constants[0].i = bottom_top_blob.dims;
524
constants[1].i = bottom_top_blob.w;
525
constants[2].i = bottom_top_blob.h;
526
constants[3].i = bottom_top_blob.c;
527
constants[4].i = bottom_top_blob.cstep;
528
constants[5].i = square_workspace.dims;
529
constants[6].i = square_workspace.w;
530
constants[7].i = square_workspace.h;
531
constants[8].i = square_workspace.c;
532
constants[9].i = square_workspace.cstep;
534
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_sub_mean_square_pack8
535
: elempack == 4 ? pipeline_instancenorm_sub_mean_square_pack4
536
: pipeline_instancenorm_sub_mean_square;
538
cmd.record_pipeline(pipeline, bindings, constants, square_workspace);
542
// --- stage 4a: reduce the squares ---
// Start from a copy of square_workspace whose extent is flattened to
// (w * h) x 1 so the 1-D reduction above can be reused.
VkMat sqsum_workspace = square_workspace;
543
sqsum_workspace.w = sqsum_workspace.w * sqsum_workspace.h;
544
sqsum_workspace.h = 1;
547
while (sqsum_workspace.w > 4)
549
int reduced_w = (sqsum_workspace.w + 3) / 4;
551
int reduced_c = sqsum_workspace.c;
553
VkMat sqsum_workspace_reduced;
554
sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
557
std::vector<VkMat> bindings(2);
558
bindings[0] = sqsum_workspace;
559
bindings[1] = sqsum_workspace_reduced;
561
std::vector<vk_constant_type> constants(8);
562
constants[0].i = sqsum_workspace.w;
564
constants[2].i = sqsum_workspace.c;
565
constants[3].i = sqsum_workspace.cstep;
566
constants[4].i = sqsum_workspace_reduced.w;
568
constants[6].i = sqsum_workspace_reduced.c;
569
constants[7].i = sqsum_workspace_reduced.cstep;
571
// Source is already 4 bytes per lane, so the fp32 variant is used throughout.
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_sum4_fp32_pack8[pb % 2]
572
: elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2]
573
: pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
575
cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced);
580
sqsum_workspace = sqsum_workspace_reduced;
584
// --- stage 4b: reduce_mean of the remaining squared sums into var_workspace ---
std::vector<VkMat> bindings(2);
585
bindings[0] = sqsum_workspace;
586
bindings[1] = var_workspace;
588
std::vector<vk_constant_type> constants(5);
589
constants[0].i = sqsum_workspace.w;
591
constants[2].i = sqsum_workspace.c;
592
constants[3].i = sqsum_workspace.cstep;
593
constants[4].f = size;
595
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_mean_pack8
596
: elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4
597
: pipeline_instancenorm_reduce_mean;
599
cmd.record_pipeline(pipeline, bindings, constants, var_workspace);
604
// --- stage 5: coeffs ---
// The workspace doubles both elemsize and elempack relative to the input,
// i.e. two values per lane per channel — presumably a scale/offset pair;
// TODO confirm against the coeffs shader.
VkMat coeffs_workspace;
605
coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator);
607
std::vector<VkMat> bindings(5);
608
bindings[0] = coeffs_workspace;
609
bindings[1] = mean_workspace;
610
bindings[2] = var_workspace;
611
bindings[3] = gamma_data_gpu;
612
bindings[4] = beta_data_gpu;
614
// NOTE(review): constants[0] is not assigned in the visible lines — confirm.
std::vector<vk_constant_type> constants(1);
617
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_coeffs_pack8
618
: elempack == 4 ? pipeline_instancenorm_coeffs_pack4
619
: pipeline_instancenorm_coeffs;
621
cmd.record_pipeline(pipeline, bindings, constants, coeffs_workspace);
626
// --- stage 6: norm pipeline on the blob itself, using the coefficients ---
std::vector<VkMat> bindings(2);
627
bindings[0] = bottom_top_blob;
628
bindings[1] = coeffs_workspace;
630
std::vector<vk_constant_type> constants(5);
631
constants[0].i = bottom_top_blob.dims;
632
constants[1].i = bottom_top_blob.w;
633
constants[2].i = bottom_top_blob.h;
634
constants[3].i = bottom_top_blob.c;
635
constants[4].i = bottom_top_blob.cstep;
637
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_norm_pack8
638
: elempack == 4 ? pipeline_instancenorm_norm_pack4
639
: pipeline_instancenorm_norm;
641
cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);
647
// Image-storage (VkImageMat) forward pass. Runs the same six stages as the
// VkMat overload (sum reduction, mean, sub_mean_square, reduction into var,
// coeffs, norm), with these visible differences:
//   - reductions halve w and h per pass instead of quartering a flattened
//     width, and loop until both are at most 2,
//   - every cstep constant is passed as 0,
//   - the coeffs workspace is laid out as 2 * c elements of the input's
//     elemsize / elempack,
//   - the final norm dispatch binds bottom_top_blob twice.
//
// NOTE(review): `pb` and `size` are used below without a visible
// declaration, and `dispatcher` is passed to record_pipeline without any
// visible assignment to its fields — confirm against the complete function.
int InstanceNorm_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
649
int w = bottom_top_blob.w;
650
int h = bottom_top_blob.h;
651
int c = bottom_top_blob.c;
653
size_t elemsize = bottom_top_blob.elemsize;
654
int elempack = bottom_top_blob.elempack;
657
// One value per channel, 4 bytes per packed lane, from the workspace allocator.
VkImageMat mean_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
660
// --- stage 1a: first sum reduction, straight from the input image ---
VkImageMat sum_workspace;
662
// Output extent is ceil(w / 2) x ceil(h / 2).
int reduced_w = (bottom_top_blob.w + 1) / 2;
663
int reduced_h = (bottom_top_blob.h + 1) / 2;
664
int reduced_c = bottom_top_blob.c;
666
sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
668
std::vector<VkImageMat> bindings(2);
669
bindings[0] = bottom_top_blob;
670
bindings[1] = sum_workspace;
672
// Constants: source extent in [0..3], destination extent in [4..7];
// the cstep slots are passed as 0 on this path.
std::vector<vk_constant_type> constants(8);
673
constants[0].i = bottom_top_blob.w;
674
constants[1].i = bottom_top_blob.h;
675
constants[2].i = bottom_top_blob.c;
676
constants[3].i = 0; //bottom_top_blob.cstep;
677
constants[4].i = sum_workspace.w;
678
constants[5].i = sum_workspace.h;
679
constants[6].i = sum_workspace.c;
680
constants[7].i = 0; //sum_workspace.cstep;
682
// The first pass uses the fp16_to_fp32 variant because it reads the input blob.
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack8
683
: elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4
684
: pipeline_instancenorm_reduce_sum4_fp16_to_fp32;
686
cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
691
// --- stage 1b: keep halving until the partial sums fit in a 2 x 2 plane ---
while (sum_workspace.w > 2 || sum_workspace.h > 2)
693
int reduced_w = (sum_workspace.w + 1) / 2;
694
int reduced_h = (sum_workspace.h + 1) / 2;
695
int reduced_c = sum_workspace.c;
697
VkImageMat sum_workspace_reduced;
698
sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
701
std::vector<VkImageMat> bindings(2);
702
bindings[0] = sum_workspace;
703
bindings[1] = sum_workspace_reduced;
705
std::vector<vk_constant_type> constants(8);
706
constants[0].i = sum_workspace.w;
707
constants[1].i = sum_workspace.h;
708
constants[2].i = sum_workspace.c;
709
constants[3].i = 0; //sum_workspace.cstep;
710
constants[4].i = sum_workspace_reduced.w;
711
constants[5].i = sum_workspace_reduced.h;
712
constants[6].i = sum_workspace_reduced.c;
713
constants[7].i = 0; //sum_workspace_reduced.cstep;
715
// pb % 2 selects one of the two fp32 pipeline instances; `pb` is
// presumably a pass counter advanced each iteration — TODO confirm.
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_sum4_fp32_pack8[pb % 2]
716
: elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2]
717
: pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
719
cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced);
724
// The reduced image becomes the source of the next pass.
sum_workspace = sum_workspace_reduced;
728
// --- stage 2: reduce_mean from the remaining partial sums into mean_workspace ---
std::vector<VkImageMat> bindings(2);
729
bindings[0] = sum_workspace;
730
bindings[1] = mean_workspace;
732
std::vector<vk_constant_type> constants(5);
733
constants[0].i = sum_workspace.w;
734
constants[1].i = sum_workspace.h;
735
constants[2].i = sum_workspace.c;
736
constants[3].i = 0; //sum_workspace.cstep;
737
// Passed as a float; presumably the per-channel element count — TODO confirm.
constants[4].f = size;
739
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_mean_pack8
740
: elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4
741
: pipeline_instancenorm_reduce_mean;
743
cmd.record_pipeline(pipeline, bindings, constants, mean_workspace);
748
// Per-channel result of the second reduction; same layout as mean_workspace.
VkImageMat var_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
750
// sub mean and square
751
// --- stage 3: square_workspace has the input's w/h/c, 4 bytes per lane ---
VkImageMat square_workspace;
752
square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator);
754
std::vector<VkImageMat> bindings(3);
755
bindings[0] = bottom_top_blob;
756
bindings[1] = mean_workspace;
757
bindings[2] = square_workspace;
759
// Constants: dims/w/h/c/cstep of the input, then of the square workspace
// (cstep slots passed as 0).
std::vector<vk_constant_type> constants(10);
760
constants[0].i = bottom_top_blob.dims;
761
constants[1].i = bottom_top_blob.w;
762
constants[2].i = bottom_top_blob.h;
763
constants[3].i = bottom_top_blob.c;
764
constants[4].i = 0; //bottom_top_blob.cstep;
765
constants[5].i = square_workspace.dims;
766
constants[6].i = square_workspace.w;
767
constants[7].i = square_workspace.h;
768
constants[8].i = square_workspace.c;
769
constants[9].i = 0; //square_workspace.cstep;
771
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_sub_mean_square_pack8
772
: elempack == 4 ? pipeline_instancenorm_sub_mean_square_pack4
773
: pipeline_instancenorm_sub_mean_square;
775
cmd.record_pipeline(pipeline, bindings, constants, square_workspace);
779
// --- stage 4a: reduce the squares; no flattening is done on this path ---
VkImageMat sqsum_workspace = square_workspace;
782
while (sqsum_workspace.w > 2 || sqsum_workspace.h > 2)
784
int reduced_w = (sqsum_workspace.w + 1) / 2;
785
int reduced_h = (sqsum_workspace.h + 1) / 2;
786
int reduced_c = sqsum_workspace.c;
788
VkImageMat sqsum_workspace_reduced;
789
sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
792
std::vector<VkImageMat> bindings(2);
793
bindings[0] = sqsum_workspace;
794
bindings[1] = sqsum_workspace_reduced;
796
std::vector<vk_constant_type> constants(8);
797
constants[0].i = sqsum_workspace.w;
798
constants[1].i = sqsum_workspace.h;
799
constants[2].i = sqsum_workspace.c;
800
constants[3].i = 0; //sqsum_workspace.cstep;
801
constants[4].i = sqsum_workspace_reduced.w;
802
constants[5].i = sqsum_workspace_reduced.h;
803
constants[6].i = sqsum_workspace_reduced.c;
804
constants[7].i = 0; //sqsum_workspace_reduced.cstep;
806
// Source is already 4 bytes per lane, so the fp32 variant is used throughout.
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_sum4_fp32_pack8[pb % 2]
807
: elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2]
808
: pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
810
cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced);
815
sqsum_workspace = sqsum_workspace_reduced;
819
// --- stage 4b: reduce_mean of the remaining squared sums into var_workspace ---
std::vector<VkImageMat> bindings(2);
820
bindings[0] = sqsum_workspace;
821
bindings[1] = var_workspace;
823
std::vector<vk_constant_type> constants(5);
824
constants[0].i = sqsum_workspace.w;
825
constants[1].i = sqsum_workspace.h;
826
constants[2].i = sqsum_workspace.c;
827
constants[3].i = 0; //sqsum_workspace.cstep;
828
constants[4].f = size;
830
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_reduce_mean_pack8
831
: elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4
832
: pipeline_instancenorm_reduce_mean;
834
cmd.record_pipeline(pipeline, bindings, constants, var_workspace);
839
// --- stage 5: coeffs ---
// Unlike the VkMat overload, the workspace keeps the input's elemsize and
// elempack and doubles the element count instead (c * 2).
VkImageMat coeffs_workspace;
840
coeffs_workspace.create(c * 2, elemsize, elempack, opt.workspace_vkallocator);
842
std::vector<VkImageMat> bindings(5);
843
bindings[0] = coeffs_workspace;
844
bindings[1] = mean_workspace;
845
bindings[2] = var_workspace;
846
bindings[3] = gamma_data_gpu_image;
847
bindings[4] = beta_data_gpu_image;
849
// NOTE(review): constants[0] is not assigned in the visible lines — confirm.
std::vector<vk_constant_type> constants(1);
852
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_coeffs_pack8
853
: elempack == 4 ? pipeline_instancenorm_coeffs_pack4
854
: pipeline_instancenorm_coeffs;
856
// A separate VkImageMat is passed as the last record_pipeline argument
// instead of coeffs_workspace (whose width is c * 2) — presumably to
// describe the dispatch extent; TODO confirm.
VkImageMat dispatcher;
860
cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
865
// --- stage 6: norm pipeline on the blob itself ---
// bottom_top_blob is bound twice — presumably once as the sampled input
// and once as the written output; TODO confirm against the norm shader.
std::vector<VkImageMat> bindings(3);
866
bindings[0] = bottom_top_blob;
867
bindings[1] = bottom_top_blob;
868
bindings[2] = coeffs_workspace;
870
std::vector<vk_constant_type> constants(5);
871
constants[0].i = bottom_top_blob.dims;
872
constants[1].i = bottom_top_blob.w;
873
constants[2].i = bottom_top_blob.h;
874
constants[3].i = bottom_top_blob.c;
875
constants[4].i = 0; //bottom_top_blob.cstep;
877
const Pipeline* pipeline = elempack == 8 ? pipeline_instancenorm_norm_pack8
878
: elempack == 4 ? pipeline_instancenorm_norm_pack4
879
: pipeline_instancenorm_norm;
881
cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);