1
// Tencent is pleased to support the open source community by making ncnn available.
3
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
8
// https://opensource.org/licenses/BSD-3-Clause
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
15
#include "innerproduct_riscv.h"
17
#include "layer_type.h"
20
#include <riscv_vector.h>
21
#endif // __riscv_vector
23
#include "riscv_activation.h"
24
#include "riscv_usability.h"
28
// Constructor: advertises which optional ncnn features this layer build
// supports.
//
// NOTE(review): this fragment is corrupted — bare line-number residue
// (e.g. "31") is interleaved as text and several original lines (braces,
// the opening #if guard) are missing. Restore from upstream before building.
InnerProduct_riscv::InnerProduct_riscv()
31
// Allow packn-packed input/output blobs (packed layout).
support_packing = true;
33
// Allow fp16 weight/blob storage — presumably guarded by __riscv_zfh
// upstream, matching the #endif below; TODO confirm against the original.
support_fp16_storage = true;
35
#endif // __riscv_vector
40
// Prepares the layer for inference: builds the helper Flatten layer,
// optionally delegates to the fp16 path, and repacks the fp32 weights into
// weight_data_tm (packn-interleaved when the output channel count is a
// multiple of packn).
//
// NOTE(review): this fragment is corrupted — bare line-number residue is
// interleaved as text and many original lines (braces, #if guards, the
// ParamDict declaration "pd", the int8 return path) are missing. Comments
// below describe only the visible statements.
int InnerProduct_riscv::create_pipeline(const Option& opt)
43
// Helper layer used by forward() to flatten >1D inputs.
flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
47
// NOTE(review): "pd" is declared on a missing line (a ParamDict) — TODO confirm.
flatten->load_param(pd);
49
flatten->create_pipeline(opt);
53
// int8 weights (1 byte per element) take a separate, not-yet-implemented path.
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
55
// TODO implement int8
60
#if __riscv_vector && __riscv_zfh
61
if (opt.use_fp16_storage)
63
// fp16 storage requested: build fp16 weights instead of fp32.
return create_pipeline_fp16s(opt);
70
// packn = number of fp32 lanes in one vector register (vlenb is in bytes).
const int packn = csrr_vlenb() / 4;
72
const int num_input = weight_data_size / num_output;
74
if (opt.use_packing_layout)
76
// Pack outputs by packn only when num_output divides evenly.
out_elempack = num_output % packn == 0 ? packn : 1;
79
if (out_elempack == packn)
82
// dst = packn-inch-outch/packn
84
// View the flat weights as a num_input x num_output matrix.
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
86
weight_data_tm.create(num_input, num_output / packn, (size_t)4u * packn, packn);
88
// Interleave each group of packn output channels so forward() can load
// packn weights contiguously per input element.
for (int q = 0; q + (packn - 1) < num_output; q += packn)
90
float* g0 = weight_data_tm.row(q / packn);
92
for (int p = 0; p < num_input; p++)
94
for (int j = 0; j < packn; j++)
96
*g0++ = weight_data_r2.row(q + j)[p];
102
#endif // __riscv_vector
104
if (out_elempack == 1)
106
// Unpacked case: use the original weights as-is (shallow copy).
weight_data_tm = weight_data;
110
// Drop the original weight blob; weight_data_tm now holds the usable copy.
weight_data.release();
115
// Releases resources created in create_pipeline().
//
// NOTE(review): corrupted fragment — the null-check/delete of "flatten"
// and the return statement are missing from this copy.
int InnerProduct_riscv::destroy_pipeline(const Option& opt)
119
// Tear down the helper Flatten layer built in create_pipeline().
flatten->destroy_pipeline(opt);
127
// fp32 forward pass: top_blob = activation(weight * bottom_blob + bias).
// Dispatches to int8/fp16 paths first, then handles a 2D "gemm-like" input
// (dims==2, w==num_input: one inner product per row) and finally the common
// flattened-vector case, with RVV-vectorized kernels per packing combination.
//
// NOTE(review): this fragment is corrupted — bare line-number residue is
// interleaved as text and many original lines are missing (braces, sum
// accumulator declarations, pointer increments, several loop headers and
// stores). Comments describe only the visible statements; restore the full
// body from upstream before compiling.
int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
130
// --- int8 path: unpack and upcast the input, then reuse the generic
// reference implementation with packing disabled. ---
if (opt.use_int8_inference && int8_scale_term)
132
Mat bottom_blob_unpacked = bottom_blob;
133
if (bottom_blob.elempack != 1)
135
Option opt_pack1 = opt;
136
// Scratch conversion result goes to the workspace allocator.
opt_pack1.blob_allocator = opt.workspace_allocator;
138
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
141
Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked;
142
if (bottom_blob_unpacked.elembits() == 16)
144
Option opt_pack1 = opt;
145
opt_pack1.blob_allocator = opt.workspace_allocator;
147
cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1);
150
Option opt_unpacked = opt;
151
opt_unpacked.use_packing_layout = false;
152
// Delegate to the base-class int8 implementation.
return InnerProduct::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked);
156
int elembits = bottom_blob.elembits();
158
#if __riscv_vector && __riscv_zfh
159
// --- fp16 dispatch: 16-bit storage input goes to the fp16 kernels. ---
if (opt.use_fp16_storage && elembits == 16)
161
if (opt.use_fp16_arithmetic)
162
return forward_fp16sa(bottom_blob, top_blob, opt);
164
return forward_fp16s(bottom_blob, top_blob, opt);
169
// packn = fp32 lanes per vector register (vlenb is in bytes).
const int packn = csrr_vlenb() / 4;
172
const int num_input = weight_data_size / num_output;
174
// --- 2D case: each of the h rows is an independent inner product. ---
if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
177
int h = bottom_blob.h;
178
size_t elemsize = bottom_blob.elemsize;
179
int elempack = bottom_blob.elempack;
181
top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
182
if (top_blob.empty())
185
int num_output_elempack = 1;
187
if (opt.use_packing_layout)
189
num_output_elempack = num_output % packn == 0 ? packn : 1;
193
#pragma omp parallel for num_threads(opt.num_threads)
194
for (int j = 0; j < h; j++)
197
// packed input x packed output: one scalar lane l of the weight row
// broadcast-multiplied against the packed input vector.
if (elempack == packn && num_output_elempack == packn)
199
const size_t vl = vsetvl_e32m1(packn);
201
float* outptr = top_blob.row(j);
203
for (int p = 0; p < num_output / num_output_elempack; p++)
205
for (int l = 0; l < packn; l++)
207
const float* kptr = weight_data_tm.row(p) + l;
208
const float* m = bottom_blob.row(j);
210
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
214
// Seed the accumulator with the bias (bias presence check is on a
// missing line — TODO confirm).
_sum = vfmv_v_f_f32m1(bias_data[p * packn + l], vl);
220
vfloat32m1_t _val = vle32_v_f32m1(m, vl);
221
// fused multiply-accumulate: _sum += (*kptr) * _val
_sum = vfmacc_vf_f32m1(_sum, *kptr, _val, vl);
228
_sum = activation_ps(_sum, activation_type, activation_params, vl);
230
vse32_v_f32m1(outptr, _sum, vl);
236
// scalar input x packed output: broadcast each input element against a
// packn-wide interleaved weight vector.
if (elempack == 1 && num_output_elempack == packn)
238
const size_t vl = vsetvl_e32m1(packn);
240
float* outptr = top_blob.row(j);
242
for (int p = 0; p < num_output / num_output_elempack; p++)
244
const float* kptr = weight_data_tm.row(p);
245
const float* m = bottom_blob.row(j);
247
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
251
_sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl);
257
vfloat32m1_t _w = vle32_v_f32m1(kptr, vl);
258
_sum = vfmacc_vf_f32m1(_sum, *m, _w, vl);
265
_sum = activation_ps(_sum, activation_type, activation_params, vl);
267
vse32_v_f32m1(outptr, _sum, vl);
272
// packed input x scalar output: packed input vector times scalar weights.
if (elempack == packn && num_output_elempack == 1)
274
const size_t vl = vsetvl_e32m1(packn);
276
float* outptr = top_blob.row(j);
278
for (int p = 0; p < num_output; p++)
280
const float* kptr = (const float*)weight_data_tm + num_input * p;
281
const float* m = bottom_blob.row(j);
283
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
287
_sum = vfmv_v_f_f32m1(bias_data[p], vl);
293
vfloat32m1_t _val = vle32_v_f32m1(m, vl);
294
_sum = vfmacc_vf_f32m1(_sum, *kptr, _val, vl);
301
_sum = activation_ps(_sum, activation_type, activation_params, vl);
303
vse32_v_f32m1(outptr, _sum, vl);
307
#endif // __riscv_vector
309
// fully scalar fallback for the 2D case.
if (elempack == 1 && num_output_elempack == 1)
311
float* outptr = top_blob.row(j);
313
for (int p = 0; p < num_output; p++)
315
const float* kptr = (const float*)weight_data_tm + num_input * p;
316
const float* m = bottom_blob.row(j);
325
// NOTE(review): "sum" is declared/bias-initialized on missing lines.
for (int i = 0; i < num_input; i++)
327
sum += m[i] * kptr[i];
330
sum = activation_ss(sum, activation_type, activation_params);
342
// --- common case: flatten the input to a 1D vector first. ---
Mat bottom_blob_flattened = bottom_blob;
343
if (bottom_blob.dims != 1)
345
Option opt_flatten = opt;
346
opt_flatten.blob_allocator = opt.workspace_allocator;
348
flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
351
size_t elemsize = bottom_blob_flattened.elemsize;
352
int elempack = bottom_blob_flattened.elempack;
354
int out_elempack = 1;
356
if (opt.use_packing_layout)
358
out_elempack = num_output % packn == 0 ? packn : 1;
361
size_t out_elemsize = elemsize / elempack * out_elempack;
363
top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
364
if (top_blob.empty())
368
// packed-output kernel: packn output channels per iteration.
if (out_elempack == packn)
370
#pragma omp parallel for num_threads(opt.num_threads)
371
for (int p = 0; p < num_output / out_elempack; p++)
373
const size_t vl = vsetvl_e32m1(packn);
374
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
378
_sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl);
381
const float* kptr = weight_data_tm.row(p);
383
const float* sptr = bottom_blob_flattened;
388
vfloat32m1_t _w = vle32_v_f32m1(kptr, vl);
389
// _sum += input scalar * packn interleaved weights
_sum = vfmacc_vf_f32m1(_sum, *sptr, _w, vl);
396
_sum = activation_ps(_sum, activation_type, activation_params, vl);
398
float* outptr = top_blob;
399
vse32_v_f32m1(outptr + p * packn, _sum, vl);
402
#endif // __riscv_vector
404
if (out_elempack == 1)
407
// Process packn unpacked output channels at once; the remainder is
// handled by the scalar tail loop below.
int nn_num_output = num_output / packn;
408
int remain_num_output_start = nn_num_output * packn;
410
#pragma omp parallel for num_threads(opt.num_threads)
411
for (int pp = 0; pp < nn_num_output; pp++)
415
const size_t vl = vsetvl_e32m1(packn);
416
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
420
// NOTE(review): "p" is derived from pp on a missing line (p = pp * packn
// upstream — TODO confirm).
_sum = vle32_v_f32m1((const float*)bias_data + p, vl);
423
const float* w = (const float*)weight_data_tm + num_input * p;
425
const float* m = bottom_blob_flattened;
430
// Strided load gathers the same input-column weight from packn
// consecutive output rows (stride = one row of num_input floats).
vfloat32m1_t _w = vlse32_v_f32m1(w, num_input * sizeof(float), vl);
432
_sum = vfmacc_vf_f32m1(_sum, *m, _w, vl);
439
_sum = activation_ps(_sum, activation_type, activation_params, vl);
441
vse32_v_f32m1((float*)top_blob + p, _sum, vl);
443
#else // __riscv_vector
444
// No RVV: unroll 4 output channels per iteration instead.
int nn_num_output = num_output / 4;
445
int remain_num_output_start = nn_num_output * 4;
447
#pragma omp parallel for num_threads(opt.num_threads)
448
for (int pp = 0; pp < nn_num_output; pp++)
460
// NOTE(review): sum0..sum3 declarations and the sum0 bias line are missing.
sum1 = bias_data[p + 1];
461
sum2 = bias_data[p + 2];
462
sum3 = bias_data[p + 3];
465
const float* w0 = (const float*)weight_data_tm + num_input * p;
466
const float* w1 = (const float*)weight_data_tm + num_input * (p + 1);
467
const float* w2 = (const float*)weight_data_tm + num_input * (p + 2);
468
const float* w3 = (const float*)weight_data_tm + num_input * (p + 3);
470
const float* m = bottom_blob_flattened;
472
for (int i = 0; i < num_input; i++)
486
sum0 = activation_ss(sum0, activation_type, activation_params);
487
sum1 = activation_ss(sum1, activation_type, activation_params);
488
sum2 = activation_ss(sum2, activation_type, activation_params);
489
sum3 = activation_ss(sum3, activation_type, activation_params);
492
// NOTE(review): the top_blob[p] = sum0 line is missing from this copy.
top_blob[p + 1] = sum1;
493
top_blob[p + 2] = sum2;
494
top_blob[p + 3] = sum3;
496
#endif // __riscv_vector
498
// scalar tail: remaining output channels one at a time.
#pragma omp parallel for num_threads(opt.num_threads)
499
for (int p = remain_num_output_start; p < num_output; p++)
506
const float* w = (const float*)weight_data_tm + num_input * p;
508
const float* m = bottom_blob_flattened;
510
for (int i = 0; i < num_input; i++)
518
sum = activation_ss(sum, activation_type, activation_params);
527
#if __riscv_vector && __riscv_zfh
528
// fp16-storage pipeline setup: converts the fp32 weights to __fp16 in
// weight_data_tm (interleaved by out_elempack) and casts the bias to fp16.
//
// NOTE(review): corrupted fragment — bare line-number residue is interleaved
// and several lines (braces, bias-presence check, return) are missing.
int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt)
530
// packn = fp16 lanes per vector register (vlenb is in bytes).
const int packn = csrr_vlenb() / 2;
532
const int num_input = weight_data_size / num_output;
534
int out_elempack = 1;
536
if (opt.use_packing_layout)
538
out_elempack = num_output % packn == 0 ? packn : 1;
542
// dst = pb-inch-outch/pb
544
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
546
// 2 bytes per fp16 element, out_elempack elements interleaved per column.
weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack);
548
for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
550
__fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack);
552
for (int p = 0; p < num_input; p++)
554
for (int j = 0; j < out_elempack; j++)
556
// Narrow each fp32 weight to fp16 while interleaving output channels.
*g0++ = (__fp16)(weight_data_r2.row(q + j)[p]);
562
// fp16 copy of the bias for the fp16-arithmetic kernels.
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
565
weight_data.release();
570
// fp16-storage forward pass: loads __fp16 data, widens to fp32 for the
// accumulation (vfwcvt -> f32m2), applies bias/activation in fp32, then
// narrows the result back to fp16 for the store.
//
// NOTE(review): corrupted fragment — bare line-number residue is interleaved
// and many lines (braces, loop headers, sum declarations, returns) are
// missing. Comments describe only the visible statements.
int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
572
// packn = fp16 lanes per vector register.
const int packn = csrr_vlenb() / 2;
574
const int num_input = weight_data_size / num_output;
576
// --- 2D case: one inner product per input row. ---
if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
579
int h = bottom_blob.h;
580
size_t elemsize = bottom_blob.elemsize;
581
int elempack = bottom_blob.elempack;
583
top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
584
if (top_blob.empty())
587
int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1;
589
#pragma omp parallel for num_threads(opt.num_threads)
590
for (int j = 0; j < h; j++)
592
// packed input x packed output (lane-by-lane over the weight row).
if (elempack == packn && num_output_elempack == packn)
594
const size_t vl = vsetvl_e16m1(packn);
596
__fp16* outptr = top_blob.row<__fp16>(j);
598
for (int p = 0; p < num_output / num_output_elempack; p++)
600
for (int l = 0; l < packn; l++)
602
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l;
603
const __fp16* m = bottom_blob.row<const __fp16>(j);
605
// Accumulate in fp32 (m2 register group holds the widened lanes).
vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
609
_sum = vfmv_v_f_f32m2(bias_data[p * packn + l], vl);
615
// widen fp16 input lanes to fp32 before the multiply-accumulate
vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(m, vl), vl);
617
_sum = vfmacc_vf_f32m2(_sum, *kptr, _val, vl);
624
_sum = activation_ps(_sum, activation_type, activation_params, vl);
626
// narrow the fp32 accumulator back to fp16 for the store
vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_sum, vl), vl);
632
// scalar input x packed output.
if (elempack == 1 && num_output_elempack == packn)
634
const size_t vl = vsetvl_e16m1(packn);
636
__fp16* outptr = top_blob.row<__fp16>(j);
638
for (int p = 0; p < num_output / num_output_elempack; p++)
640
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn;
641
const __fp16* m = bottom_blob.row<const __fp16>(j);
643
vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
647
// bias_data is fp32 here, loaded directly into the fp32 accumulator.
_sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl);
653
vfloat32m2_t _w = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(kptr, vl), vl);
655
_sum = vfmacc_vf_f32m2(_sum, *m, _w, vl);
662
_sum = activation_ps(_sum, activation_type, activation_params, vl);
664
vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_sum, vl), vl);
669
// packed input x scalar output.
if (elempack == packn && num_output_elempack == 1)
671
const size_t vl = vsetvl_e16m1(packn);
673
__fp16* outptr = top_blob.row<__fp16>(j);
675
for (int p = 0; p < num_output; p++)
677
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
678
const __fp16* m = bottom_blob.row<const __fp16>(j);
680
vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
684
_sum = vfmv_v_f_f32m2(bias_data[p], vl);
690
vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(m, vl), vl);
692
_sum = vfmacc_vf_f32m2(_sum, *kptr, _val, vl);
699
_sum = activation_ps(_sum, activation_type, activation_params, vl);
701
vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_sum, vl), vl);
706
// fully scalar fallback; products are computed in fp32.
if (elempack == 1 && num_output_elempack == 1)
708
__fp16* outptr = top_blob.row<__fp16>(j);
710
for (int p = 0; p < num_output; p++)
712
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
713
const __fp16* m = bottom_blob.row<const __fp16>(j);
722
// NOTE(review): "sum" is declared/bias-initialized on missing lines.
for (int i = 0; i < num_input; i++)
724
sum += (float)m[i] * (float)kptr[i];
727
sum = activation_ss(sum, activation_type, activation_params);
729
outptr[0] = (__fp16)sum;
739
// --- common case: flatten the input to a 1D vector first. ---
Mat bottom_blob_flattened = bottom_blob;
740
if (bottom_blob.dims != 1)
742
Option opt_flatten = opt;
743
opt_flatten.blob_allocator = opt.workspace_allocator;
745
flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
748
size_t elemsize = bottom_blob_flattened.elemsize;
749
int elempack = bottom_blob_flattened.elempack;
751
int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1;
752
size_t out_elemsize = elemsize / elempack * out_elempack;
754
top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
755
if (top_blob.empty())
758
// packed-output kernel over the flattened vector.
if (out_elempack == packn)
761
#pragma omp parallel for num_threads(opt.num_threads)
762
for (int p = 0; p < num_output / out_elempack; p++)
764
const size_t vl = vsetvl_e16m1(packn);
765
vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
769
_sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl);
772
const __fp16* kptr = weight_data_tm.row<const __fp16>(p);
774
const __fp16* sptr = bottom_blob_flattened;
779
vfloat32m2_t _w = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(kptr, vl), vl);
781
// broadcast the (widened) scalar input against packn weights
_sum = vfmacc_vf_f32m2(_sum, (float)(*sptr), _w, vl);
788
_sum = activation_ps(_sum, activation_type, activation_params, vl);
790
__fp16* outptr = (__fp16*)top_blob;
791
vse16_v_f16m1(outptr + p * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl);
795
// scalar-output kernel: one channel per iteration, fp32 accumulation.
if (out_elempack == 1)
798
#pragma omp parallel for num_threads(opt.num_threads)
799
for (int p = 0; p < num_output; p++)
806
const __fp16* kptr = weight_data_tm.row<__fp16>(p);
808
const __fp16* sptr = bottom_blob_flattened;
811
// NOTE(review): "i" and "sum" are declared on missing lines (vectorized
// main loop presumably precedes this scalar tail — TODO confirm).
for (; i < num_input; i++)
813
float v = (float)(*sptr);
814
float k = (float)(*kptr);
822
sum = activation_ss(sum, activation_type, activation_params);
824
__fp16* outptr = (__fp16*)top_blob;
825
outptr[p] = (__fp16)sum;
832
// fp16-storage + fp16-arithmetic forward pass: unlike forward_fp16s, the
// accumulation itself runs in fp16 (vfloat16m1_t, vfmacc_vf_f16m1) using the
// pre-converted fp16 bias (bias_data_fp16) — faster but lower precision.
//
// NOTE(review): corrupted fragment — bare line-number residue is interleaved
// and many lines (braces, loop headers, sum declarations, returns) are
// missing. Comments describe only the visible statements.
int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
834
// packn = fp16 lanes per vector register.
const int packn = csrr_vlenb() / 2;
836
const int num_input = weight_data_size / num_output;
838
// --- 2D case: one inner product per input row. ---
if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
841
int h = bottom_blob.h;
842
size_t elemsize = bottom_blob.elemsize;
843
int elempack = bottom_blob.elempack;
845
top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
846
if (top_blob.empty())
849
int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1;
851
#pragma omp parallel for num_threads(opt.num_threads)
852
for (int j = 0; j < h; j++)
854
// packed input x packed output (lane-by-lane over the weight row).
if (elempack == packn && num_output_elempack == packn)
856
const size_t vl = vsetvl_e16m1(packn);
858
__fp16* outptr = top_blob.row<__fp16>(j);
860
for (int p = 0; p < num_output / num_output_elempack; p++)
862
for (int l = 0; l < packn; l++)
864
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l;
865
const __fp16* m = bottom_blob.row<const __fp16>(j);
867
// fp16 accumulator — no widening in this variant.
vfloat16m1_t _sum = vfmv_v_f_f16m1((__fp16)0.f, vl);
871
_sum = vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p * packn + l], vl);
877
vfloat16m1_t _val = vle16_v_f16m1(m, vl);
879
_sum = vfmacc_vf_f16m1(_sum, *kptr, _val, vl);
886
_sum = activation_ps(_sum, activation_type, activation_params, vl);
888
vse16_v_f16m1(outptr, _sum, vl);
894
// scalar input x packed output.
if (elempack == 1 && num_output_elempack == packn)
896
const size_t vl = vsetvl_e16m1(packn);
898
__fp16* outptr = top_blob.row<__fp16>(j);
900
for (int p = 0; p < num_output / num_output_elempack; p++)
902
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn;
903
const __fp16* m = bottom_blob.row<const __fp16>(j);
905
vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
909
// fp16 bias loaded directly from the pre-cast bias_data_fp16.
_sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl);
915
vfloat16m1_t _w = vle16_v_f16m1(kptr, vl);
917
_sum = vfmacc_vf_f16m1(_sum, *m, _w, vl);
924
_sum = activation_ps(_sum, activation_type, activation_params, vl);
926
vse16_v_f16m1(outptr, _sum, vl);
931
// packed input x scalar output.
if (elempack == packn && num_output_elempack == 1)
933
const size_t vl = vsetvl_e16m1(packn);
935
__fp16* outptr = top_blob.row<__fp16>(j);
937
for (int p = 0; p < num_output; p++)
939
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
940
const __fp16* m = bottom_blob.row<const __fp16>(j);
942
vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
946
_sum = vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p], vl);
952
vfloat16m1_t _val = vle16_v_f16m1(m, vl);
954
_sum = vfmacc_vf_f16m1(_sum, *kptr, _val, vl);
961
_sum = activation_ps(_sum, activation_type, activation_params, vl);
963
vse16_v_f16m1(outptr, _sum, vl);
968
// fully scalar fallback; note the product is computed in fp16 and only
// the final cast is to fp32.
if (elempack == 1 && num_output_elempack == 1)
970
__fp16* outptr = top_blob.row<__fp16>(j);
972
for (int p = 0; p < num_output; p++)
974
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
975
const __fp16* m = bottom_blob.row<const __fp16>(j);
984
// NOTE(review): "sum" is declared/bias-initialized on missing lines.
for (int i = 0; i < num_input; i++)
986
sum += (float)(m[i] * kptr[i]);
989
sum = activation_ss(sum, activation_type, activation_params);
991
outptr[0] = (__fp16)sum;
1001
// --- common case: flatten the input to a 1D vector first. ---
Mat bottom_blob_flattened = bottom_blob;
1002
if (bottom_blob.dims != 1)
1004
Option opt_flatten = opt;
1005
opt_flatten.blob_allocator = opt.workspace_allocator;
1007
flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
1010
size_t elemsize = bottom_blob_flattened.elemsize;
1011
int elempack = bottom_blob_flattened.elempack;
1013
int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1;
1014
size_t out_elemsize = elemsize / elempack * out_elempack;
1016
top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
1017
if (top_blob.empty())
1020
// packed-output kernel over the flattened vector (fp16 accumulation).
if (out_elempack == packn)
1023
#pragma omp parallel for num_threads(opt.num_threads)
1024
for (int p = 0; p < num_output / out_elempack; p++)
1026
const size_t vl = vsetvl_e16m1(packn);
1027
vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
1031
_sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl);
1034
const __fp16* kptr = weight_data_tm.row<const __fp16>(p);
1036
const __fp16* sptr = bottom_blob_flattened;
1041
vfloat16m1_t _w = vle16_v_f16m1(kptr, vl);
1043
// broadcast the scalar fp16 input against packn fp16 weights
_sum = vfmacc_vf_f16m1(_sum, *sptr, _w, vl);
1050
_sum = activation_ps(_sum, activation_type, activation_params, vl);
1052
__fp16* outptr = (__fp16*)top_blob;
1053
vse16_v_f16m1(outptr + p * packn, _sum, vl);
1057
// scalar-output kernel: one channel per iteration.
if (out_elempack == 1)
1060
#pragma omp parallel for num_threads(opt.num_threads)
1061
for (int p = 0; p < num_output; p++)
1068
const __fp16* kptr = weight_data_tm.row<__fp16>(p);
1070
const __fp16* sptr = bottom_blob_flattened;
1073
// NOTE(review): "i", "sum", "v" and "k" come from missing lines (a
// vectorized main loop presumably precedes this tail — TODO confirm).
for (; i < num_input; i++)
1078
sum += (float)(v * k);
1084
sum = activation_ss(sum, activation_type, activation_params);
1086
__fp16* outptr = (__fp16*)top_blob;
1087
outptr[p] = (__fp16)sum;
1093
#endif // __riscv_vector && __riscv_zfh