// yala is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolutiondepthwise_loongarch.h"

#include "layer_type.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_activation.h"
#include "loongarch_usability.h"

namespace ncnn {

#include "convolutiondepthwise_3x3.h"

#if __loongarch_sx
#include "convolutiondepthwise_3x3_pack4.h"
#include "convolutiondepthwise_5x5_pack4.h"
#endif // __loongarch_sx

ConvolutionDepthWise_loongarch::ConvolutionDepthWise_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx

    activation = 0;
}

int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_loongarch(opt);
    }
#endif

    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            elempack = channels % 4 == 0 ? 4 : 1;
        }
#endif // __loongarch_sx

#if __loongarch_sx
        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_tm, 4, opt);
        }
#endif // __loongarch_sx

        if (elempack == 1)
        {
            weight_data_tm = weight_data;
        }

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    // group convolution
    create_group_ops(opt);

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt)
{
    // create Convolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    for (int i = 0; i < (int)group_ops.size(); i++)
        delete group_ops[i];

    group_ops.clear();

    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    group_ops.resize(group);

    for (int g = 0; g < group; g++)
    {
        Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = bias_data.range(num_output_g * g, num_output_g);

        ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output_g); // num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, dilation_w);
        pd.set(12, dilation_h);
        pd.set(3, stride_w);
        pd.set(13, stride_h);
        pd.set(4, 0);  // pad_w
        pd.set(14, 0); // pad_h
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
        pd.set(8, int8_scale_term);
        pd.set(9, activation_type);
        pd.set(10, activation_params);

        op->load_param(pd);
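
        // set weights: the ModelBin order expected by Convolution is kernel,
        // optional bias, then the int8 scale mats when int8_scale_term is set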
        if (bias_term)
        {
            ncnn::Mat weights[5];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[2] = weight_data_int8_scales_g;
                weights[3] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[4] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }
        else
        {
            ncnn::Mat weights[4];
            weights[0] = weight_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[1] = weight_data_int8_scales_g;
                weights[2] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[3] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }

        op->create_pipeline(opt);

        group_ops[g] = op;
    }

    return 0;
}

int ConvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt)
{
    if (activation)
    {
        activation->destroy_pipeline(opt);
        delete activation;
        activation = 0;
    }

    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

int ConvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && int8_scale_term)
    {
        return forward_int8_loongarch(bottom_blob, top_blob, opt);
    }
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

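    // depthwise case: channels * elempack == group == num_output, so each input
    // channel is filtered by exactly one kernel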
    if (channels * elempack == group && group == num_output)
    {
#if __loongarch_sx
        if (elempack == 4)
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw5x5s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw5x5s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }
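
                // space_ofs[k] is the flat element offset of kernel tap k from the
                // sliding-window origin, so the inner loop can gather taps directly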

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    float* outptr = top_blob.channel(g);
                    const float* kptr = (const float*)weight_data_tm + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                            if (bias_term)
                            {
                                _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0);
                            }

                            const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                            for (int k = 0; k < maxk; k++)
                            {
                                __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0);
                                __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0);
                                _sum = __lsx_vfmadd_s(_w, _val, _sum);
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            __lsx_vst(_sum, outptr + j * 4, 0);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }
#endif // __loongarch_sx
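
        // elempack == 1: scalar fallback, with dedicated 3x3 s1/s2 kernels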
        if (elempack == 1)
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    float* outptr = top_blob.channel(g);
                    const float* kptr = (const float*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                                sum = bias_data[g];

                            const float* sptr = m.row(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = (float)sptr[space_ofs[k]];
                                float w = (float)kptr[k];
                                sum += val * w;
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }

        return 0;
    }

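    // group convolution: dispatch to the per-group Convolution ops built in
    // create_group_ops()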
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx

    // unpacking
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}
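
// dynamic-weight variant: the kernel (and optional bias) arrive as extra bottom
// blobs, so a temporary ConvolutionDepthWise op is configured and run per call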
int ConvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);

    // set param
    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(7, group);
    pd.set(8, int8_scale_term);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);
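
    // set weights: ModelBinFromMatArray serves the mats below, in order, as the
    // temporary op's weight blobs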
    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    // forward
    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if NCNN_INT8
int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            elempack = channels % 8 == 0 ? 8 : 1;
        }
#endif // __loongarch_sx

        if (elempack == 8)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_tm, 8, opt);
        }

        if (elempack == 1)
        {
            weight_data_tm = weight_data;
        }

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    // group convolution
    create_group_ops(opt);

    if (opt.lightmode)
        weight_data.release();

    return 0;
}
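
// int8 depthwise uses pack8: eight int8 channels per vector, widened to 16-bit
// products and 32-bit accumulators in the LSX loop below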
int ConvolutionDepthWise_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;

    int elembits = bottom_blob.elembits();

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        const int channels_g = channels * elempack / group;

        // broadcast each group's input scale across its channels so the whole
        // blob can be quantized in one pass
        Mat scales(channels * elempack);
        {
            float* ps = scales;
            for (int g = 0; g < group; g++)
            {
                float scale = bottom_blob_int8_scales[g];
                for (int q = 0; q < channels_g; q++)
                {
                    *ps++ = scale;
                }
            }
        }

        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;
    channels = bottom_blob_bordered.c;
    elempack = bottom_blob_bordered.elempack;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        int out_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        }
#endif // __loongarch_sx
        bool use_int8_requantize = int8_scale_term > 100;
        // 1 byte per lane when requantizing back to int8, 4 bytes for fp32 output
        size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __loongarch_sx
        if (elempack == 8)
        {
            const int maxk = kernel_w * kernel_h;

            // kernel offsets
            std::vector<int> _space_ofs(maxk);
            int* space_ofs = &_space_ofs[0];
            {
                int p1 = 0;
                int p2 = 0;
                int gap = w * dilation_h - kernel_w * dilation_w;
                for (int i = 0; i < kernel_h; i++)
                {
                    for (int j = 0; j < kernel_w; j++)
                    {
                        space_ofs[p1] = p2;
                        p1++;
                        p2 += dilation_w;
                    }
                    p2 += gap;
                }
            }

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                signed char* outptr_s8 = top_blob.channel(g);
                float* outptr_f32 = top_blob.channel(g);
                const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8;
                const Mat m = bottom_blob_bordered.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                        __m128i _sum1 = __lsx_vreplgr2vr_w(0);

                        const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                        for (int k = 0; k < maxk; k++)
                        {
                            // sign-extend 8 int8 lanes to 16-bit
                            __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0);
                            __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                            __m128i _w = __lsx_vld(kptr + k * 8, 0);
                            __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                            // 16-bit products, sign-extended and accumulated in 32-bit
                            __m128i _s0 = __lsx_vmul_h(_val16, _w16);
                            __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                            __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                            __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);

                            _sum0 = __lsx_vadd_w(_sum0, _s0l);
                            _sum1 = __lsx_vadd_w(_sum1, _s0h);
                        }
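
                        // scale_in = 1 / (input_scale * weight_scale) per lane; lanes
                        // whose weight scale is 0 are masked to 0 so the vfrecip
                        // result cannot leak inf into the output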
                        __m128 _scale_in0;
                        __m128 _scale_in1;
                        {
                            __m128 _bottom_blob_int8_scales0 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8, 0);
                            __m128 _bottom_blob_int8_scales1 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8 + 4, 0);
                            __m128 _weight_data_int8_scales0 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8, 0);
                            __m128 _weight_data_int8_scales1 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8 + 4, 0);
                            _scale_in0 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales0, _weight_data_int8_scales0));
                            _scale_in1 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales1, _weight_data_int8_scales1));

                            __m128i _m0 = __lsx_vfcmp_cne_s(_weight_data_int8_scales0, __lsx_vreplfr2vr_s(0.f));
                            __m128i _m1 = __lsx_vfcmp_cne_s(_weight_data_int8_scales1, __lsx_vreplfr2vr_s(0.f));
                            _scale_in0 = (__m128)__lsx_vand_v((__m128i)_scale_in0, (__m128i)_m0);
                            _scale_in1 = (__m128)__lsx_vand_v((__m128i)_scale_in1, (__m128i)_m1);
                        }

                        // dequantize
                        __m128 _sumfp32_0 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum0), _scale_in0);
                        __m128 _sumfp32_1 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum1), _scale_in1);

                        if (bias_term)
                        {
                            __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + g * 8, 0);
                            __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + g * 8 + 4, 0);
                            _sumfp32_0 = __lsx_vfadd_s(_sumfp32_0, _bias0);
                            _sumfp32_1 = __lsx_vfadd_s(_sumfp32_1, _bias1);
                        }

                        _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params);
                        _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params);

                        if (use_int8_requantize)
                        {
                            // requantize
                            __m128 _scale_out0 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8, 0);
                            __m128 _scale_out1 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8 + 4, 0);
                            _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_out0);
                            _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_out1);
                            int64_t _sum8 = float2int8(_sumfp32_0, _sumfp32_1);

                            *(int64_t*)outptr_s8 = _sum8;
                            outptr_s8 += 8;
                        }
                        else
                        {
                            // store fp32
                            __lsx_vst(_sumfp32_0, outptr_f32, 0);
                            __lsx_vst(_sumfp32_1, outptr_f32 + 4, 0);
                            outptr_f32 += 8;
                        }
                    }
                }
            }
        }
#endif // __loongarch_sx
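
        // elempack == 1: scalar int8 fallback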
        if (elempack == 1)
        {
            const int maxk = kernel_w * kernel_h;

            // kernel offsets
            std::vector<int> _space_ofs(maxk);
            int* space_ofs = &_space_ofs[0];
            {
                int p1 = 0;
                int p2 = 0;
                int gap = w * dilation_h - kernel_w * dilation_w;
                for (int i = 0; i < kernel_h; i++)
                {
                    for (int j = 0; j < kernel_w; j++)
                    {
                        space_ofs[p1] = p2;
                        p1++;
                        p2 += dilation_w;
                    }
                    p2 += gap;
                }
            }

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < group; g++)
            {
                signed char* outptr_s8 = top_blob.channel(g);
                float* outptr_f32 = top_blob.channel(g);
                const signed char* kptr = (const signed char*)weight_data_tm + maxk * g;
                const Mat m = bottom_blob_bordered.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        int sum = 0;

                        const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                        for (int k = 0; k < maxk; k++)
                        {
                            signed char val = sptr[space_ofs[k]];
                            signed char w = kptr[k];
                            sum += val * w;
                        }

                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float sumfp32 = sum * scale_in;

                        if (bias_term)
                            sumfp32 += bias_data[g];

                        sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                        if (use_int8_requantize)
                        {
                            // requantize
                            float scale_out = top_blob_int8_scales[g];
                            signed char sums8 = float2int8(sumfp32 * scale_out);
                            outptr_s8[0] = sums8;
                            outptr_s8 += 1;
                        }
                        else
                        {
                            // dequantize
                            outptr_f32[0] = sumfp32;
                            outptr_f32 += 1;
                        }
                    }
                }
            }
        }

        return 0;
    }

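    // group convolution: output stays pack8 when requantizing to int8 (1-byte
    // lanes) or pack4 when dequantizing to fp32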
    bool use_int8_requantize = int8_scale_term > 100;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        if (use_int8_requantize)
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        else
            out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 8 == 0 ? 8 : 1;
        if (use_int8_requantize)
            out_g_elempack = num_output_g % 8 == 0 ? 8 : 1;
        else
            out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx

    // unpacking
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn