// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "prelu_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#endif // __riscv_vector

namespace ncnn {
// Declare the optional features this implementation supports.
// Packed layout requires the RVV extension; fp16 storage additionally
// requires the zfh extension.
PReLU_riscv::PReLU_riscv()
{
#if __riscv_vector
    support_packing = true;
#if __riscv_zfh
    support_fp16_storage = true;
#endif
#endif // __riscv_vector
}
// Apply PReLU in place: x = x >= 0 ? x : slope * x.
// The slope is either shared (num_slope == 1) or per-channel/per-row.
// Dispatches to the fp16 variants when the blob is stored as fp16.
int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if __riscv_vector && __riscv_zfh
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_inplace_fp16sa(bottom_top_blob, opt);
        else
            return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;
    int dims = bottom_top_blob.dims;

#if __riscv_vector
    if (dims == 1)
    {
        int w = bottom_top_blob.w;
        float* ptr = bottom_top_blob;

        if (num_slope > 1)
        {
            // one slope per element
            const float* ptr_slope = slope_data;

            int n = w * elempack;
            // #pragma omp parallel for num_threads(opt.num_threads)
            while (n > 0)
            {
                size_t vl = vsetvl_e32m8(n);

                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
                vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl);
                // multiply by slope only where x < 0; positive lanes pass through
                vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl);
                vse32_v_f32m8(ptr, _p, vl);

                ptr += vl;
                ptr_slope += vl;
                n -= vl;
            }
        }
        else
        {
            float slope = slope_data[0];

            int n = w * elempack;
            // #pragma omp parallel for num_threads(opt.num_threads)
            while (n > 0)
            {
                size_t vl = vsetvl_e32m8(n);

                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
                vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl);
                vse32_v_f32m8(ptr, _p, vl);

                ptr += vl;
                n -= vl;
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            float* ptr = bottom_top_blob.row(i);

            if (num_slope > 1)
            {
                // per-row slope, replicated across the packed lanes
                for (int j = 0; j < w; j++)
                {
                    const float* ptr_slope = (const float*)slope_data + i * elempack;

                    int n = elempack;
                    while (n > 0)
                    {
                        size_t vl = vsetvl_e32m8(n);

                        vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
                        vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl);
                        vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                        _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl);
                        vse32_v_f32m8(ptr, _p, vl);

                        ptr += vl;
                        ptr_slope += vl;
                        n -= vl;
                    }
                }
            }
            else
            {
                float slope = slope_data[0];
                int n = w * elempack;

                while (n > 0)
                {
                    size_t vl = vsetvl_e32m8(n);

                    vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
                    vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                    _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl);
                    vse32_v_f32m8(ptr, _p, vl);

                    ptr += vl;
                    n -= vl;
                }
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int channels = bottom_top_blob.c;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            int n = size * elempack;

            if (num_slope > 1 && elempack != 1)
            {
                // per-channel slope replicated across the packed lanes
                for (int i = 0; i < size; i++)
                {
                    const float* slope_ptr = (const float*)slope_data + q * elempack;

                    int n1 = elempack;
                    while (n1 > 0)
                    {
                        size_t vl = vsetvl_e32m8(n1);

                        vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
                        vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl);
                        vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                        _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl);
                        vse32_v_f32m8(ptr, _p, vl);

                        ptr += vl;
                        slope_ptr += vl;
                        n1 -= vl;
                    }
                }
            }
            else
            {
                // num_slope == 1 or elempack == 1
                float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

                while (n > 0)
                {
                    size_t vl = vsetvl_e32m8(n);

                    vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
                    vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                    _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl);
                    vse32_v_f32m8(ptr, _p, vl);

                    ptr += vl;
                    n -= vl;
                }
            }
        }
    }
#else  // __riscv_vector
    if (dims == 1)
    {
        int w = bottom_top_blob.w;

        float* ptr = bottom_top_blob;

        if (num_slope > 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                if (ptr[i] < 0)
                    ptr[i] *= slope_data[i];
            }
        }
        else
        {
            float slope = slope_data[0];

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                if (ptr[i] < 0)
                    ptr[i] *= slope;
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            float* ptr = bottom_top_blob.row(i);
            float slope = num_slope > 1 ? slope_data[i] : slope_data[0];

            for (int j = 0; j < w; j++)
            {
                if (ptr[j] < 0)
                    ptr[j] *= slope;
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int channels = bottom_top_blob.c;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

            for (int i = 0; i < size; i++)
            {
                if (ptr[i] < 0)
                    ptr[i] *= slope;
            }
        }
    }
#endif // __riscv_vector

    return 0;
}
#if __riscv_vector && __riscv_zfh
// fp16 storage / fp32 arithmetic variant.
// hint: slope is always stored as fp32; data is widened to fp32,
// multiplied, then narrowed back to fp16 on store.
int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;
    int dims = bottom_top_blob.dims;

    if (dims == 1)
    {
        int w = bottom_top_blob.w;
        __fp16* ptr = bottom_top_blob;

        if (num_slope > 1)
        {
            const float* ptr_slope = slope_data;

            int n = w * elempack;
            // #pragma omp parallel for num_threads(opt.num_threads)
            while (n > 0)
            {
                // e16/m4 so the widened fp32 values fit in m8
                size_t vl = vsetvl_e16m4(n);

                vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl);
                vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl);
                vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl);
                vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl);

                ptr += vl;
                ptr_slope += vl;
                n -= vl;
            }
        }
        else
        {
            float slope = slope_data[0];

            int n = w * elempack;
            // #pragma omp parallel for num_threads(opt.num_threads)
            while (n > 0)
            {
                size_t vl = vsetvl_e16m4(n);

                vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl);
                vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl);
                vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl);

                ptr += vl;
                n -= vl;
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            __fp16* ptr = bottom_top_blob.row<__fp16>(i);

            if (num_slope > 1)
            {
                for (int j = 0; j < w; j++)
                {
                    const float* ptr_slope = (const float*)slope_data + i * elempack;

                    int n = elempack;
                    while (n > 0)
                    {
                        size_t vl = vsetvl_e16m4(n);

                        vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl);
                        vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl);
                        vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                        _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl);
                        vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl);

                        ptr += vl;
                        ptr_slope += vl;
                        n -= vl;
                    }
                }
            }
            else
            {
                float slope = slope_data[0];
                int n = w * elempack;

                while (n > 0)
                {
                    size_t vl = vsetvl_e16m4(n);

                    vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl);
                    vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                    _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl);
                    vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl);

                    ptr += vl;
                    n -= vl;
                }
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int channels = bottom_top_blob.c;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);
            int n = size * elempack;

            if (num_slope > 1 && elempack != 1)
            {
                for (int i = 0; i < size; i++)
                {
                    const float* slope_ptr = (const float*)slope_data + q * elempack;

                    int n1 = elempack;
                    while (n1 > 0)
                    {
                        size_t vl = vsetvl_e16m4(n1);

                        vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl);
                        vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl);
                        vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                        _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl);
                        vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl);

                        ptr += vl;
                        slope_ptr += vl;
                        n1 -= vl;
                    }
                }
            }
            else
            {
                // num_slope == 1 or elempack == 1
                float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

                while (n > 0)
                {
                    size_t vl = vsetvl_e16m4(n);

                    vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl);
                    vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl);
                    _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl);
                    vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl);

                    ptr += vl;
                    n -= vl;
                }
            }
        }
    }

    return 0;
}
// fp16 storage / fp16 arithmetic variant.
// The per-element slope path narrows the fp32 slope to fp16 (e16/m4 so the
// fp32 slope load fits in m8); the shared-slope path runs fully in e16/m8.
int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;
    int dims = bottom_top_blob.dims;

    if (dims == 1)
    {
        int w = bottom_top_blob.w;
        __fp16* ptr = bottom_top_blob;

        if (num_slope > 1)
        {
            const float* ptr_slope = slope_data;

            int n = w * elempack;
            // #pragma omp parallel for num_threads(opt.num_threads)
            while (n > 0)
            {
                size_t vl = vsetvl_e16m4(n);

                vfloat16m4_t _p = vle16_v_f16m4(ptr, vl);
                vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl);
                vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl);
                _p = vfmul_vv_f16m4_m(_lower, _p, /*op1*/ _p, _slope, vl);
                vse16_v_f16m4(ptr, _p, vl);

                ptr += vl;
                ptr_slope += vl;
                n -= vl;
            }
        }
        else
        {
            __fp16 slope = slope_data[0];

            int n = w * elempack;
            // #pragma omp parallel for num_threads(opt.num_threads)
            while (n > 0)
            {
                size_t vl = vsetvl_e16m8(n);

                vfloat16m8_t _p = vle16_v_f16m8(ptr, vl);
                vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl);
                _p = vfmul_vf_f16m8_m(_lower, _p, /*op1*/ _p, slope, vl);
                vse16_v_f16m8(ptr, _p, vl);

                ptr += vl;
                n -= vl;
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            __fp16* ptr = bottom_top_blob.row<__fp16>(i);

            if (num_slope > 1)
            {
                for (int j = 0; j < w; j++)
                {
                    const float* ptr_slope = (const float*)slope_data + i * elempack;

                    int n = elempack;
                    while (n > 0)
                    {
                        size_t vl = vsetvl_e16m4(n);

                        vfloat16m4_t _p = vle16_v_f16m4(ptr, vl);
                        vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl);
                        vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl);
                        _p = vfmul_vv_f16m4_m(_lower, _p, /*op1*/ _p, _slope, vl);
                        vse16_v_f16m4(ptr, _p, vl);

                        ptr += vl;
                        ptr_slope += vl;
                        n -= vl;
                    }
                }
            }
            else
            {
                __fp16 slope = slope_data[0];
                int n = w * elempack;

                while (n > 0)
                {
                    size_t vl = vsetvl_e16m8(n);

                    vfloat16m8_t _p = vle16_v_f16m8(ptr, vl);
                    vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl);
                    _p = vfmul_vf_f16m8_m(_lower, _p, /*op1*/ _p, slope, vl);
                    vse16_v_f16m8(ptr, _p, vl);

                    ptr += vl;
                    n -= vl;
                }
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int channels = bottom_top_blob.c;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);
            int n = size * elempack;

            if (num_slope > 1 && elempack != 1)
            {
                for (int i = 0; i < size; i++)
                {
                    const float* slope_ptr = (const float*)slope_data + q * elempack;

                    int n1 = elempack;
                    while (n1 > 0)
                    {
                        size_t vl = vsetvl_e16m4(n1);

                        vfloat16m4_t _p = vle16_v_f16m4(ptr, vl);
                        vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(slope_ptr, vl), vl);
                        vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl);
                        _p = vfmul_vv_f16m4_m(_lower, _p, /*op1*/ _p, _slope, vl);
                        vse16_v_f16m4(ptr, _p, vl);

                        ptr += vl;
                        slope_ptr += vl;
                        n1 -= vl;
                    }
                }
            }
            else
            {
                // num_slope == 1 or elempack == 1
                float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

                while (n > 0)
                {
                    size_t vl = vsetvl_e16m8(n);

                    vfloat16m8_t _p = vle16_v_f16m8(ptr, vl);
                    vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl);
                    _p = vfmul_vf_f16m8_m(_lower, _p, /*op1*/ _p, (__fp16)slope, vl);
                    vse16_v_f16m8(ptr, _p, vl);

                    ptr += vl;
                    n -= vl;
                }
            }
        }
    }

    return 0;
}