#include "pooling_riscv.h"

#include <float.h>

#include <algorithm>
#include <vector>

#if __riscv_vector
#include <riscv_vector.h>
#endif // __riscv_vector

#include "riscv_usability.h"
// Constructor: advertise the storage layouts this layer can consume.
// Packed layout and fp16 storage are only meaningful when the matching
// RISC-V extensions are compiled in.
Pooling_riscv::Pooling_riscv()
{
#if __riscv_vector
    support_packing = true;
#if __riscv_zfh
    support_fp16_storage = true;
#endif
#endif // __riscv_vector
}
int Pooling_riscv::create_pipeline(const Option& )
41
support_packing = false;
43
support_bf16_storage = false;
44
support_fp16_storage = false;
45
support_int8_storage = false;
46
support_tensor_storage = false;
51
int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
55
return Pooling::forward(bottom_blob, top_blob, opt);
58
int elembits = bottom_blob.elembits();
60
#if __riscv_vector && __riscv_zfh
61
if (opt.use_fp16_storage && elembits == 16)
63
if (opt.use_fp16_arithmetic)
64
return forward_fp16sa(bottom_blob, top_blob, opt);
66
return forward_fp16s(bottom_blob, top_blob, opt);
74
const int packn = csrr_vlenb() / 4;
75
const size_t vl = vsetvl_e32m1(packn);
78
int w = bottom_blob.w;
79
int h = bottom_blob.h;
80
int channels = bottom_blob.c;
81
size_t elemsize = bottom_blob.elemsize;
82
int elempack = bottom_blob.elempack;
87
if (elempack == packn)
91
top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
97
if (pooling_type == PoolMethod_MAX)
99
#pragma omp parallel for num_threads(opt.num_threads)
100
for (int q = 0; q < channels; q++)
102
const float* ptr = bottom_blob.channel(q);
104
vfloat32m1_t _max = vle32_v_f32m1(ptr, vl);
105
for (int i = 0; i < size; i++)
107
vfloat32m1_t _val = vle32_v_f32m1(ptr, vl);
108
_max = vfmax_vv_f32m1(_max, _val, vl);
112
float* outptr = top_blob;
113
vse32_v_f32m1(outptr + q * packn, _max, vl);
116
else if (pooling_type == PoolMethod_AVE)
118
#pragma omp parallel for num_threads(opt.num_threads)
119
for (int q = 0; q < channels; q++)
121
const float* ptr = bottom_blob.channel(q);
123
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
124
for (int i = 0; i < size; i++)
126
vfloat32m1_t _val = vle32_v_f32m1(ptr, vl);
127
_sum = vfadd_vv_f32m1(_sum, _val, vl);
131
vfloat32m1_t _avg = vfmul_vf_f32m1(_sum, 1.f / size, vl);
133
float* outptr = top_blob;
134
vse32_v_f32m1(outptr + q * packn, _avg, vl);
141
Mat bottom_blob_bordered;
142
make_padding(bottom_blob, bottom_blob_bordered, opt);
143
if (bottom_blob_bordered.empty())
146
w = bottom_blob_bordered.w;
147
h = bottom_blob_bordered.h;
149
int outw = (w - kernel_w) / stride_w + 1;
150
int outh = (h - kernel_h) / stride_h + 1;
152
top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
153
if (top_blob.empty())
156
const int maxk = kernel_w * kernel_h;
159
std::vector<int> _space_ofs(maxk);
160
int* space_ofs = &_space_ofs[0];
164
int gap = w - kernel_w;
165
for (int i = 0; i < kernel_h; i++)
167
for (int j = 0; j < kernel_w; j++)
177
if (pooling_type == PoolMethod_MAX)
179
#pragma omp parallel for num_threads(opt.num_threads)
180
for (int q = 0; q < channels; q++)
182
const Mat m = bottom_blob_bordered.channel(q);
183
float* outptr = top_blob.channel(q);
185
for (int i = 0; i < outh; i++)
187
for (int j = 0; j < outw; j++)
189
const float* sptr = m.row(i * stride_h) + j * stride_w * packn;
191
vfloat32m1_t _max = vle32_v_f32m1(sptr, vl);
193
for (int k = 0; k < maxk; k++)
195
vfloat32m1_t _val = vle32_v_f32m1(sptr + space_ofs[k] * packn, vl);
196
_max = vfmax_vv_f32m1(_max, _val, vl);
199
vse32_v_f32m1(outptr + j * packn, _max, vl);
202
outptr += outw * packn;
206
else if (pooling_type == PoolMethod_AVE)
208
if (avgpool_count_include_pad == 0)
215
wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
216
htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
219
#pragma omp parallel for num_threads(opt.num_threads)
220
for (int q = 0; q < channels; q++)
222
const Mat m = bottom_blob_bordered.channel(q);
223
float* outptr = top_blob.channel(q);
225
for (int i = 0; i < outh; i++)
227
int sy0 = i * stride_h;
229
for (int j = 0; j < outw; j++)
231
int sx0 = j * stride_w;
233
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
236
for (int ki = 0; ki < kernel_h; ki++)
243
if (sy >= h - pad_bottom - htailpad)
246
for (int kj = 0; kj < kernel_w; kj++)
253
if (sx >= w - pad_right - wtailpad)
256
vfloat32m1_t _val = vle32_v_f32m1(m.row(sy) + sx * packn, vl);
257
_sum = vfadd_vv_f32m1(_sum, _val, vl);
262
vfloat32m1_t _avg = vfmul_vf_f32m1(_sum, 1.f / area, vl);
263
vse32_v_f32m1(outptr + j * packn, _avg, vl);
266
outptr += outw * packn;
272
#pragma omp parallel for num_threads(opt.num_threads)
273
for (int q = 0; q < channels; q++)
275
const Mat m = bottom_blob_bordered.channel(q);
276
float* outptr = top_blob.channel(q);
278
const float inv_maxk = 1.f / maxk;
280
for (int i = 0; i < outh; i++)
282
for (int j = 0; j < outw; j++)
284
const float* sptr = m.row(i * stride_h) + j * stride_w * packn;
286
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
288
for (int k = 0; k < maxk; k++)
290
vfloat32m1_t _val = vle32_v_f32m1(sptr + space_ofs[k] * packn, vl);
291
_sum = vfadd_vv_f32m1(_sum, _val, vl);
294
vfloat32m1_t _avg = vfmul_vf_f32m1(_sum, inv_maxk, vl);
295
vse32_v_f32m1(outptr + j * packn, _avg, vl);
298
outptr += outw * packn;
308
return Pooling::forward(bottom_blob, top_blob, opt);
311
#if __riscv_vector && __riscv_zfh
// fp16-storage pooling forward pass.
// Values are stored as __fp16; average pooling accumulates in fp32
// (widening vfwcvt / narrowing vfncvt) to limit rounding error, while max
// pooling stays entirely in fp16. Handles both the packed (elempack == packn)
// and the scalar (elempack == 1) layouts.
// Returns 0 on success, -100 on allocation failure.
int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // packn = number of fp16 lanes in one vector register
    const int packn = csrr_vlenb() / 2;
    const size_t vl = vsetvl_e16m1(packn);

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    if (global_pooling)
    {
        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int size = w * h;

        if (pooling_type == PoolMethod_MAX)
        {
            if (elempack == packn)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    vfloat16m1_t _max = vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
                    for (int i = 0; i < size; i++)
                    {
                        vfloat16m1_t _val = vle16_v_f16m1(ptr, vl);
                        _max = vfmax_vv_f16m1(_max, _val, vl);
                        ptr += packn;
                    }

                    __fp16* outptr = top_blob;
                    vse16_v_f16m1(outptr + q * packn, _max, vl);
                }
            }
            else // elempack == 1
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    __fp16 max = (__fp16)-FLT_MAX;
                    for (int i = 0; i < size; i++)
                    {
                        max = std::max(max, ptr[i]);
                    }

                    __fp16* outptr = top_blob;
                    outptr[q] = max;
                }
            }
        }

        if (pooling_type == PoolMethod_AVE)
        {
            if (elempack == packn)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    // accumulate in fp32 (m2 register group holds the widened lanes)
                    vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
                    for (int i = 0; i < size; i++)
                    {
                        vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr, vl), vl);
                        _sum = vfadd_vv_f32m2(_sum, _val, vl);
                        ptr += packn;
                    }

                    vfloat32m2_t _avg = vfmul_vf_f32m2(_sum, 1.f / size, vl);

                    __fp16* outptr = top_blob;
                    vse16_v_f16m1(outptr + q * packn, vfncvt_f_f_w_f16m1(_avg, vl), vl);
                }
            }
            else // elempack == 1
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    float sum = 0.f; // fp32 accumulator for precision
                    for (int i = 0; i < size; i++)
                    {
                        sum += (float)ptr[i];
                    }

                    __fp16* outptr = top_blob;
                    outptr[q] = (__fp16)(sum / size);
                }
            }
        }

        return 0;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets relative to the window's top-left corner
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w - kernel_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2++;
            }
            p2 += gap;
        }
    }

    if (pooling_type == PoolMethod_MAX)
    {
        if (elempack == packn)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                __fp16* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;

                        vfloat16m1_t _max = vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);

                        for (int k = 0; k < maxk; k++)
                        {
                            vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
                            _max = vfmax_vv_f16m1(_max, _val, vl);
                        }

                        vse16_v_f16m1(outptr + j * packn, _max, vl);
                    }

                    outptr += outw * packn;
                }
            }
        }
        else // elempack == 1
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                __fp16* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                        __fp16 max = (__fp16)-FLT_MAX;

                        for (int k = 0; k < maxk; k++)
                        {
                            __fp16 val = sptr[space_ofs[k]];
                            max = std::max(max, val);
                        }

                        outptr[j] = max;
                    }

                    outptr += outw;
                }
            }
        }
    }

    if (pooling_type == PoolMethod_AVE)
    {
        if (avgpool_count_include_pad == 0)
        {
            // exclude padded cells from the divisor
            int wtailpad = 0;
            int htailpad = 0;

            if (pad_mode == 0) // full padding
            {
                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
            }

            if (elempack == packn)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
                            int area = 0; // non-pad cells summed

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl), vl);
                                    _sum = vfadd_vv_f32m2(_sum, _val, vl);
                                    area += 1;
                                }
                            }

                            vfloat32m2_t _avg = vfmul_vf_f32m2(_sum, 1.f / area, vl);
                            vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_avg, vl), vl);
                        }

                        outptr += outw * packn;
                    }
                }
            }
            else // elempack == 1
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float sum = 0.f;
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float val = (float)(m.row<const __fp16>(sy)[sx]);
                                    sum += val;
                                    area += 1;
                                }
                            }

                            outptr[j] = (__fp16)(sum / area);
                        }

                        outptr += outw;
                    }
                }
            }
        }

        if (avgpool_count_include_pad == 1)
        {
            if (elempack == packn)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    const float inv_maxk = 1.f / maxk;

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;

                            vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);

                            for (int k = 0; k < maxk; k++)
                            {
                                vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(sptr + space_ofs[k] * packn, vl), vl);
                                _sum = vfadd_vv_f32m2(_sum, _val, vl);
                            }

                            vfloat32m2_t _avg = vfmul_vf_f32m2(_sum, inv_maxk, vl);
                            vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_avg, vl), vl);
                        }

                        outptr += outw * packn;
                    }
                }
            }
            else // elempack == 1
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                            float sum = 0.f;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = (float)(sptr[space_ofs[k]]);
                                sum += val;
                            }

                            outptr[j] = (__fp16)(sum / maxk);
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    return 0;
}
#endif // __riscv_vector && __riscv_zfh
#if __riscv_vector && __riscv_zfh
// fp16-arithmetic pooling forward pass.
// Only windowed average pooling gets a dedicated fp16-accumulation kernel here;
// max pooling and global pooling carry no extra precision benefit from fp16
// arithmetic, so they delegate to forward_fp16s.
// Returns 0 on success, -100 on allocation failure.
int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (pooling_type == PoolMethod_MAX || global_pooling)
    {
        return forward_fp16s(bottom_blob, top_blob, opt);
    }

    // packn = number of fp16 lanes in one vector register
    const int packn = csrr_vlenb() / 2;
    const size_t vl = vsetvl_e16m1(packn);

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets relative to the window's top-left corner
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w - kernel_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2++;
            }
            p2 += gap;
        }
    }

    if (pooling_type == PoolMethod_AVE)
    {
        if (avgpool_count_include_pad == 0)
        {
            // exclude padded cells from the divisor
            int wtailpad = 0;
            int htailpad = 0;

            if (pad_mode == 0) // full padding
            {
                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
            }

            if (elempack == packn)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            // fp16 accumulation (fp16sa path)
                            vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
                            int area = 0; // non-pad cells summed

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    vfloat16m1_t _val = vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl);
                                    _sum = vfadd_vv_f16m1(_sum, _val, vl);
                                    area += 1;
                                }
                            }

                            vfloat16m1_t _avg = vfmul_vf_f16m1(_sum, (__fp16)(1.f / area), vl);
                            vse16_v_f16m1(outptr + j * packn, _avg, vl);
                        }

                        outptr += outw * packn;
                    }
                }
            }
            else // elempack == 1
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            __fp16 sum = (__fp16)0.f;
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    __fp16 val = m.row<const __fp16>(sy)[sx];
                                    sum += val;
                                    area += 1;
                                }
                            }

                            outptr[j] = sum / area;
                        }

                        outptr += outw;
                    }
                }
            }
        }

        if (avgpool_count_include_pad == 1)
        {
            if (elempack == packn)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    const __fp16 inv_maxk = (__fp16)(1.f / maxk);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;

                            vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);

                            for (int k = 0; k < maxk; k++)
                            {
                                vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
                                _sum = vfadd_vv_f16m1(_sum, _val, vl);
                            }

                            vfloat16m1_t _avg = vfmul_vf_f16m1(_sum, inv_maxk, vl);
                            vse16_v_f16m1(outptr + j * packn, _avg, vl);
                        }

                        outptr += outw * packn;
                    }
                }
            }
            else // elempack == 1
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                            __fp16 sum = (__fp16)0.f;

                            for (int k = 0; k < maxk; k++)
                            {
                                __fp16 val = sptr[space_ofs[k]];
                                sum += val;
                            }

                            outptr[j] = sum / maxk;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    return 0;
}
#endif // __riscv_vector && __riscv_zfh