1
// yala is pleased to support the open source community by making ncnn available.
4
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
8
// https://opensource.org/licenses/BSD-3-Clause
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
15
#include "eltwise_loongarch.h"
19
#endif // __loongarch_sx
21
#include "loongarch_usability.h"
25
// Constructor: enables packed-element input by setting support_packing.
// NOTE(review): the trailing #endif shows the assignment sits inside a
// conditional-compilation region, presumably `#if __loongarch_sx` as named in
// the #endif comment; the opening directive and the braces are not visible
// in this view, so confirm against the complete file.
Eltwise_loongarch::Eltwise_loongarch()
28
support_packing = true;
29
#endif // __loongarch_sx
32
// Combines the input blobs element by element into top_blobs[0].
//
// bottom_blobs  inputs; blob 0 fixes the output geometry, blob 1 is read
//               without a visible size check, and every blob from index 2
//               onward is folded into the output in place.
// top_blobs     top_blobs[0] is (re)created with the layout of bottom_blobs[0].
// opt           supplies blob_allocator for the output and num_threads for
//               the OpenMP channel loops.
//
// Visible operations: Operation_PROD (multiply), Operation_SUM (plain add and
// a coeffs-weighted add) and Operation_MAX (maximum).
//
// NOTE(review): this view of the function is incomplete. Braces, the opening
// #if guards, the declaration of loop index `i`, the pointer increments and
// the end of the function (including its return value) are not visible, so
// the comments below describe only what the visible lines show.
int Eltwise_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
34
const Mat& bottom_blob = bottom_blobs[0];
35
// Geometry is taken from the first input only.
int w = bottom_blob.w;
36
int h = bottom_blob.h;
37
int d = bottom_blob.d;
38
int channels = bottom_blob.c;
39
int elempack = bottom_blob.elempack;
40
// Upper bound of the per-channel element loops: w * h * d scaled by elempack.
int size = w * h * d * elempack;
42
// Output is allocated like the first input, using the caller's allocator.
Mat& top_blob = top_blobs[0];
43
top_blob.create_like(bottom_blob, opt.blob_allocator);
47
// ---- Operation_PROD: output = input0 * input1, then *= each further input.
if (op_type == Operation_PROD)
50
const Mat& bottom_blob1 = bottom_blobs[1];
51
#pragma omp parallel for num_threads(opt.num_threads)
52
for (int q = 0; q < channels; q++)
54
const float* ptr = bottom_blob.channel(q);
55
const float* ptr1 = bottom_blob1.channel(q);
56
float* outptr = top_blob.channel(q);
60
// Vector path: the loop advances i by 4, one __m128 load per operand.
// NOTE(review): the declaration of i and the pointer advances are not
// visible in this view.
for (; i + 3 < size; i += 4)
62
__m128 _p = (__m128)__lsx_vld(ptr, 0);
63
__m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
64
_p = __lsx_vfmul_s(_p, _p1);
65
__lsx_vst(_p, outptr, 0);
71
#endif // __loongarch_sx
74
// Scalar form of the same product.
// NOTE(review): its enclosing loop header is not visible here.
*outptr = *ptr * *ptr1;
82
// Fold inputs 2..N-1 into the output: the output is re-read and rewritten.
for (size_t b = 2; b < bottom_blobs.size(); b++)
84
const Mat& bottom_blob1 = bottom_blobs[b];
85
#pragma omp parallel for num_threads(opt.num_threads)
86
for (int q = 0; q < channels; q++)
88
const float* ptr = bottom_blob1.channel(q);
89
float* outptr = top_blob.channel(q);
93
for (; i + 3 < size; i += 4)
95
__m128 _p = (__m128)__lsx_vld(outptr, 0);
96
__m128 _p1 = (__m128)__lsx_vld(ptr, 0);
97
_p = __lsx_vfmul_s(_p, _p1);
98
__lsx_vst(_p, outptr, 0);
103
#endif // __loongarch_sx
104
// Scalar tail for the elements the 4-wide loop did not reach.
// NOTE(review): the loop body is not visible in this view.
for (; i < size; i++)
114
// ---- Operation_SUM: output = input0 + input1, then += each further input.
if (op_type == Operation_SUM)
119
const Mat& bottom_blob1 = bottom_blobs[1];
120
#pragma omp parallel for num_threads(opt.num_threads)
121
for (int q = 0; q < channels; q++)
123
const float* ptr = bottom_blob.channel(q);
124
const float* ptr1 = bottom_blob1.channel(q);
125
float* outptr = top_blob.channel(q);
129
for (; i + 3 < size; i += 4)
131
__m128 _p = (__m128)__lsx_vld(ptr, 0);
132
__m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
133
_p = __lsx_vfadd_s(_p, _p1);
134
__lsx_vst(_p, outptr, 0);
140
#endif // __loongarch_sx
141
for (; i < size; i++)
143
*outptr = *ptr + *ptr1;
151
// Accumulate the remaining inputs onto the output in place.
for (size_t b = 2; b < bottom_blobs.size(); b++)
153
const Mat& bottom_blob1 = bottom_blobs[b];
154
#pragma omp parallel for num_threads(opt.num_threads)
155
for (int q = 0; q < channels; q++)
157
const float* ptr = bottom_blob1.channel(q);
158
float* outptr = top_blob.channel(q);
162
for (; i + 3 < size; i += 4)
164
__m128 _p = (__m128)__lsx_vld(outptr, 0);
165
__m128 _p1 = (__m128)__lsx_vld(ptr, 0);
166
_p = __lsx_vfadd_s(_p, _p1);
167
__lsx_vst(_p, outptr, 0);
172
#endif // __loongarch_sx
173
// NOTE(review): the loop body is not visible in this view.
for (; i < size; i++)
186
// ---- Weighted sum: output = input0 * coeffs[0] + input1 * coeffs[1],
// then += input_b * coeffs[b] for each further input.
// NOTE(review): the condition that selects this path over the plain sum
// above is not visible in this view; confirm against the complete file.
const Mat& bottom_blob1 = bottom_blobs[1];
187
float coeff0 = coeffs[0];
188
float coeff1 = coeffs[1];
190
// Vector copies of the two coefficients for the 4-wide loop; presumably each
// scalar is replicated into every lane (confirm the helper's semantics).
__m128 _coeff0 = (__m128)__lsx_vreplfr2vr_s(coeff0);
191
__m128 _coeff1 = (__m128)__lsx_vreplfr2vr_s(coeff1);
192
#endif // __loongarch_sx
193
#pragma omp parallel for num_threads(opt.num_threads)
194
for (int q = 0; q < channels; q++)
196
const float* ptr = bottom_blob.channel(q);
197
const float* ptr1 = bottom_blob1.channel(q);
198
float* outptr = top_blob.channel(q);
202
for (; i + 3 < size; i += 4)
204
__m128 _p = (__m128)__lsx_vld(ptr, 0);
205
__m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
206
// Scale input0, then add input1 * coeff1 in one multiply-add; this is meant
// to mirror the scalar expression in the tail loop below.
_p = __lsx_vfmul_s(_p, _coeff0);
207
_p = __lsx_vfmadd_s(_coeff1, _p1, _p);
208
__lsx_vst(_p, outptr, 0);
214
#endif // __loongarch_sx
215
for (; i < size; i++)
217
*outptr = *ptr * coeff0 + *ptr1 * coeff1;
225
// Remaining inputs: each one is scaled by its own coefficient coeffs[b] and
// added to the running output.
for (size_t b = 2; b < bottom_blobs.size(); b++)
227
const Mat& bottom_blob1 = bottom_blobs[b];
228
float coeff = coeffs[b];
230
__m128 _coeff = (__m128)__lsx_vreplfr2vr_s(coeff);
231
#endif // __loongarch_sx
232
#pragma omp parallel for num_threads(opt.num_threads)
233
for (int q = 0; q < channels; q++)
235
const float* ptr = bottom_blob1.channel(q);
236
float* outptr = top_blob.channel(q);
240
for (; i + 3 < size; i += 4)
242
__m128 _p = (__m128)__lsx_vld(outptr, 0);
243
__m128 _p1 = (__m128)__lsx_vld(ptr, 0);
244
_p = __lsx_vfmadd_s(_coeff, _p1, _p);
245
__lsx_vst(_p, outptr, 0);
250
#endif // __loongarch_sx
251
for (; i < size; i++)
253
*outptr += *ptr * coeff;
262
// ---- Operation_MAX: output = max(input0, input1), then max with each
// further input.
if (op_type == Operation_MAX)
265
const Mat& bottom_blob1 = bottom_blobs[1];
266
#pragma omp parallel for num_threads(opt.num_threads)
267
for (int q = 0; q < channels; q++)
269
const float* ptr = bottom_blob.channel(q);
270
const float* ptr1 = bottom_blob1.channel(q);
271
float* outptr = top_blob.channel(q);
275
for (; i + 3 < size; i += 4)
277
__m128 _p = (__m128)__lsx_vld(ptr, 0);
278
__m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
279
_p = __lsx_vfmax_s(_p, _p1);
280
__lsx_vst(_p, outptr, 0);
286
#endif // __loongarch_sx
287
for (; i < size; i++)
289
*outptr = std::max(*ptr, *ptr1);
297
// Fold the remaining inputs into the running maximum held in the output.
for (size_t b = 2; b < bottom_blobs.size(); b++)
299
const Mat& bottom_blob1 = bottom_blobs[b];
300
#pragma omp parallel for num_threads(opt.num_threads)
301
for (int q = 0; q < channels; q++)
303
const float* ptr = bottom_blob1.channel(q);
304
float* outptr = top_blob.channel(q);
308
for (; i + 3 < size; i += 4)
310
__m128 _p = (__m128)__lsx_vld(outptr, 0);
311
__m128 _p1 = (__m128)__lsx_vld(ptr, 0);
312
_p = __lsx_vfmax_s(_p, _p1);
313
__lsx_vst(_p, outptr, 0);
318
#endif // __loongarch_sx
319
for (; i < size; i++)
321
*outptr = std::max(*ptr, *outptr);