// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "convolution_riscv.h"
19
#include "layer_type.h"
22
#include <riscv_vector.h>
23
#endif // __riscv_vector
25
#include "riscv_activation.h"
26
#include "riscv_usability.h"

namespace ncnn {

#include "convolution_sgemm.h"
#include "convolution_winograd_transform.h"
#include "convolution_winograd_dot.h"
#include "convolution_1x1.h"
#include "convolution_3x3.h"

#if __riscv_vector
#include "convolution_packn.h"
#include "convolution_pack1ton.h"
#include "convolution_packnto1.h"

#include "convolution_sgemm_packn.h"
#include "convolution_sgemm_pack1ton.h"
#include "convolution_sgemm_packnto1.h"
#include "convolution_winograd_transform_packn.h"
#include "convolution_winograd_dot_packn.h"
#include "convolution_1x1_packn.h"
#include "convolution_1x1_pack1ton.h"
#include "convolution_1x1_packnto1.h"
#include "convolution_3x3_packn.h"
#include "convolution_3x3_pack1ton.h"
#include "convolution_7x7_pack1ton.h"

#if __riscv_zfh
#include "convolution_fp16s.h"
#include "convolution_packn_fp16s.h"
#include "convolution_pack1ton_fp16s.h"
#include "convolution_packnto1_fp16s.h"

#include "convolution_sgemm_fp16s.h"
#include "convolution_sgemm_packn_fp16s.h"
#include "convolution_sgemm_pack1ton_fp16s.h"
#include "convolution_sgemm_packnto1_fp16s.h"
#include "convolution_winograd_transform_packn_fp16s.h"
#include "convolution_winograd_dot_packn_fp16s.h"
#include "convolution_1x1_fp16s.h"
#include "convolution_1x1_packn_fp16s.h"
#include "convolution_1x1_pack1ton_fp16s.h"
#include "convolution_1x1_packnto1_fp16s.h"
#include "convolution_3x3_packn_fp16s.h"
#include "convolution_3x3_pack1ton_fp16s.h"
#include "convolution_7x7_pack1ton_fp16s.h"
#endif // __riscv_zfh
#endif // __riscv_vector

Convolution_riscv::Convolution_riscv()
{
#if __riscv_vector
    support_packing = true;
#if __riscv_zfh
    support_fp16_storage = true;
#endif
#endif // __riscv_vector

    activation = 0;
}

#if __riscv_vector
static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
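    // here pa = elempack (input packing) and pb = out_elempack (output packing):
    // for pa = pb = 4, each run of 16 weights is a 4x4 in-channel x out-channel
    // tile for one kernel tap, so the compute kernels can stream the packed
    // weights with unit-stride vector loads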

    Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

    weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

    for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
    {
        float* g00 = weight_data_tm.channel(q / out_elempack);

        for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int i = 0; i < elempack; i++)
                {
                    for (int j = 0; j < out_elempack; j++)
                    {
                        const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                        g00[0] = k00[k];

                        g00++;
                    }
                }
            }
        }
    }
}
#endif // __riscv_vector

int Convolution_riscv::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);

    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        // TODO implement int8
        return 0;
    }

#if __riscv_vector && __riscv_zfh
    if (opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if __riscv_vector
    const int packn = csrr_vlenb() / 4;
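    // packn = fp32 lanes per vector register; csrr_vlenb() returns VLEN in
    // bytes, so e.g. VLEN=128 gives vlenb=16 and packn=4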
#endif

    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __riscv_vector
    if (opt.use_packing_layout)
    {
        elempack = num_input % packn == 0 ? packn : 1;
        out_elempack = num_output % packn == 0 ? packn : 1;
    }
#endif

#if __riscv_vector
    if (elempack == packn && out_elempack == packn)
    {
        if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
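            // pick the largest winograd tile that is likely to pay off:
            // F(6x6,3x3) does the most work per transform but needs enough
            // channels to amortize it, F(4x4,3x3) is the middle ground, and
            // F(2x2,3x3) is the safe fallback (the bounds below are heuristics)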
            if ((opt.use_winograd63_convolution && num_input >= packn * 2 && num_output >= packn * 2 && num_input <= packn * 16 && num_output <= packn * 16) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd63_transform_kernel_packn_rvv(weight_data, weight_winograd63_data, num_input, num_output, opt);
            else if ((opt.use_winograd43_convolution && num_input >= packn * 2 && num_output >= packn * 2) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd43_transform_kernel_packn_rvv(weight_data, weight_winograd43_data, num_input, num_output, opt);
            else // if (opt.use_winograd23_convolution)
                conv3x3s1_winograd23_transform_kernel_packn_rvv(weight_data, weight_winograd23_data, num_input, num_output, opt);
        }
        else
        {
            convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 1 && out_elempack == packn)
    {
        convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }

    if (elempack == packn && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }
#endif // __riscv_vector

    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd43_transform_kernel_rvv(weight_data, weight_winograd43_data, num_input, num_output, opt);
            }
            else if (opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd23_transform_kernel_rvv(weight_data, weight_winograd23_data, num_input, num_output, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            weight_data_tm = weight_data;
        }
    }

    if (opt.lightmode)
    {
        weight_data.release();
    }

    return 0;
}

int Convolution_riscv::destroy_pipeline(const Option& opt)
{
    if (activation)
    {
        activation->destroy_pipeline(opt);
        delete activation;
        activation = 0;
    }

    return 0;
}

int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (opt.use_int8_inference && int8_scale_term)
    {
        Mat bottom_blob_unpacked = bottom_blob;
        if (bottom_blob.elempack != 1)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_allocator = opt.workspace_allocator;

            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
        }

        Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked;
        if (bottom_blob_unpacked.elembits() == 16)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_allocator = opt.workspace_allocator;

            cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1);
        }

        Option opt_unpacked = opt;
        opt_unpacked.use_packing_layout = false;
        return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked);
    }

    // flattened blob, implement as InnerProduct
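    // a 1D blob convolved with a 1x1 kernel is a fully connected layer,
    // so view the w-element vector as a w-channel 1x1 image and reuse the
    // regular convolution path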
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
    {
        Mat bottom_blob_3d;
        if (bottom_blob.elemsize % 16 == 0)
        {
            bottom_blob_3d = bottom_blob;
            bottom_blob_3d.dims = 3;
            bottom_blob_3d.w = 1;
            bottom_blob_3d.h = 1;
            bottom_blob_3d.c = bottom_blob.w;
            bottom_blob_3d.cstep = 1;
        }
        else
        {
            bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator);
        }

        Mat top_blob_3d;
        int ret = forward(bottom_blob_3d, top_blob_3d, opt);
        if (ret != 0)
            return ret;

        if (top_blob_3d.elemsize % 16 == 0)
        {
            top_blob = top_blob_3d;
            top_blob.dims = 1;
            top_blob.w = top_blob_3d.c;
            top_blob.h = 1;
            top_blob.c = 1;
            top_blob.cstep = top_blob_3d.c;
        }
        else
        {
            top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator);
        }

        return 0;
    }

    int elembits = bottom_blob.elembits();

#if __riscv_vector && __riscv_zfh
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);

        return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if __riscv_vector
    const int packn = csrr_vlenb() / 4;
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
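    // a k-tap kernel with dilation d covers d * (k - 1) + 1 input pixels,
    // e.g. 3x3 with dilation 2 reads from a 5x5 window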

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __riscv_vector
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % packn == 0 ? packn : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;
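    // elemsize / elempack is the per-scalar size (4 bytes for fp32),
    // so fp32 with out_elempack = 4 gives 16-byte packed output elements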

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int num_input = channels * elempack;

#if __riscv_vector
    if (elempack == packn && out_elempack == packn)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd63_convolution && num_input >= packn * 2 && num_output >= packn * 2 && num_input <= packn * 16 && num_output <= packn * 16) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd63_packn_rvv(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt);
            else if ((opt.use_winograd43_convolution && num_input >= packn * 2 && num_output >= packn * 2) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd43_packn_rvv(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
            else // if (opt.use_winograd23_convolution)
                conv3x3s1_winograd23_packn_rvv(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == packn)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == packn && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#endif // __riscv_vector

    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_winograd_convolution && (opt.use_winograd43_convolution || opt.use_winograd23_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd43_rvv(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
            }
            else if (opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd23_rvv(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);
            }

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            const int maxk = kernel_w * kernel_h;

            // kernel offsets
            std::vector<int> _space_ofs(maxk);
            int* space_ofs = &_space_ofs[0];
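            // space_ofs[k] caches the offset of kernel tap k relative to the
            // sliding window origin, so the inner loop is a flat gather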
            {
                int p1 = 0;
                int p2 = 0;
                int gap = w * dilation_h - kernel_w * dilation_w;
                for (int i = 0; i < kernel_h; i++)
                {
                    for (int j = 0; j < kernel_w; j++)
                    {
                        space_ofs[p1] = p2;
                        p1++;
                        p2 += dilation_w;
                    }
                    p2 += gap;
                }
            }

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                float* outptr = top_blob.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                            sum = bias_data[p];

                        const float* kptr = (const float*)weight_data_tm + maxk * channels * p;

                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob_bordered.channel(q);
                            const float* sptr = m.row(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = sptr[space_ofs[k]];
                                float wt = kptr[k];
                                sum += val * wt;
                            }

                            kptr += maxk;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    return 0;
}

int Convolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

#if __riscv_vector && __riscv_zfh
    if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // __riscv_vector && __riscv_zfh

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

#if __riscv_vector && __riscv_zfh
        if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // __riscv_vector && __riscv_zfh

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }
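    // dynamic weight path: build a one-shot generic Convolution layer from
    // the flattened weight/bias blobs instead of the pre-transformed pipeline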

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(8, int8_scale_term);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if __riscv_vector && __riscv_zfh
static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb

    Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

    weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

    for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
    {
        __fp16* g00 = weight_data_tm.channel(q / out_elempack);

        for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int i = 0; i < elempack; i++)
                {
                    for (int j = 0; j < out_elempack; j++)
                    {
                        const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                        g00[0] = (__fp16)k00[k];

                        g00++;
                    }
                }
            }
        }
    }
}

int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
{
    const int packn = csrr_vlenb() / 2;
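    // fp16 elements are 2 bytes, so one vector register holds twice as many
    // lanes as fp32: VLEN=128 gives packn=8 here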

    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;

    if (opt.use_packing_layout)
    {
        elempack = num_input % packn == 0 ? packn : 1;
        out_elempack = num_output % packn == 0 ? packn : 1;
    }

    if (elempack == packn && out_elempack == packn)
    {
        if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && opt.use_fp16_arithmetic && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd63_convolution && num_input >= packn * 2 && num_output >= packn * 2 && num_input <= packn * 16 && num_output <= packn * 16) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd63_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd63_data, num_input, num_output, opt);
            else if ((opt.use_winograd43_convolution && num_input >= packn * 2 && num_output >= packn * 2) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd43_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd43_data, num_input, num_output, opt);
            else // if (opt.use_winograd23_convolution)
                conv3x3s1_winograd23_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd23_data, num_input, num_output, opt);
        }
        else
        {
            convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 1 && out_elempack == packn)
    {
        convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }

    if (elempack == packn && out_elempack == 1)
    {
        if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 1 && out_elempack == 1)
    {
        if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (opt.use_fp16_arithmetic)
    {
        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
    }
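    // the fp16sa kernels consume the bias as __fp16 directly, while the plain
    // fp16s path keeps the fp32 bias_data for its fp32 arithmetic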

    if (opt.lightmode)
    {
        weight_data.release();
    }

    return 0;
}

int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int packn = csrr_vlenb() / 2;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // NCNN_LOGE("Convolution forward_fp16s input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_top, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (elempack == packn && out_elempack == packn)
    {
        convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == packn)
    {
        convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == packn && out_elempack == 1)
    {
        convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == 1)
    {
        convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    return 0;
}

int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int packn = csrr_vlenb() / 2;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int num_input = channels * elempack;

    if (elempack == packn && out_elempack == packn)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd63_convolution && num_input >= packn * 2 && num_output >= packn * 2 && num_input <= packn * 16 && num_output <= packn * 16) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd63_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, opt);
            else if ((opt.use_winograd43_convolution && num_input >= packn * 2 && num_output >= packn * 2) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd43_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, opt);
            else // if (opt.use_winograd23_convolution)
                conv3x3s1_winograd23_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == packn)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == packn && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    return 0;
}
#endif // __riscv_vector && __riscv_zfh

} // namespace ncnn