ncnn

eltwise_loongarch.cpp
333 строки · 10.2 Кб
Перенос по словам
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

15
#include "eltwise_loongarch.h"
16

17
#if __loongarch_sx
18
#include <lsxintrin.h>
19
#endif // __loongarch_sx
20

21
#include "loongarch_usability.h"
22

23
namespace ncnn {
24

25
// Constructs the LoongArch eltwise layer.
// When the build has LSX enabled (__loongarch_sx), the layer sets
// support_packing so that forward() may be handed packed blobs.
Eltwise_loongarch::Eltwise_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}
31

32
int Eltwise_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
33
{
34
    const Mat& bottom_blob = bottom_blobs[0];
35
    int w = bottom_blob.w;
36
    int h = bottom_blob.h;
37
    int d = bottom_blob.d;
38
    int channels = bottom_blob.c;
39
    int elempack = bottom_blob.elempack;
40
    int size = w * h * d * elempack;
41

42
    Mat& top_blob = top_blobs[0];
43
    top_blob.create_like(bottom_blob, opt.blob_allocator);
44
    if (top_blob.empty())
45
        return -100;
46

47
    if (op_type == Operation_PROD)
48
    {
49
        // first blob
50
        const Mat& bottom_blob1 = bottom_blobs[1];
51
        #pragma omp parallel for num_threads(opt.num_threads)
52
        for (int q = 0; q < channels; q++)
53
        {
54
            const float* ptr = bottom_blob.channel(q);
55
            const float* ptr1 = bottom_blob1.channel(q);
56
            float* outptr = top_blob.channel(q);
57

58
            int i = 0;
59
#if __loongarch_sx
60
            for (; i + 3 < size; i += 4)
61
            {
62
                __m128 _p = (__m128)__lsx_vld(ptr, 0);
63
                __m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
64
                _p = __lsx_vfmul_s(_p, _p1);
65
                __lsx_vst(_p, outptr, 0);
66

67
                ptr += 4;
68
                ptr1 += 4;
69
                outptr += 4;
70
            }
71
#endif // __loongarch_sx
72
            for (; i < size; i++)
73
            {
74
                *outptr = *ptr * *ptr1;
75

76
                ptr++;
77
                ptr1++;
78
                outptr++;
79
            }
80
        }
81

82
        for (size_t b = 2; b < bottom_blobs.size(); b++)
83
        {
84
            const Mat& bottom_blob1 = bottom_blobs[b];
85
            #pragma omp parallel for num_threads(opt.num_threads)
86
            for (int q = 0; q < channels; q++)
87
            {
88
                const float* ptr = bottom_blob1.channel(q);
89
                float* outptr = top_blob.channel(q);
90

91
                int i = 0;
92
#if __loongarch_sx
93
                for (; i + 3 < size; i += 4)
94
                {
95
                    __m128 _p = (__m128)__lsx_vld(outptr, 0);
96
                    __m128 _p1 = (__m128)__lsx_vld(ptr, 0);
97
                    _p = __lsx_vfmul_s(_p, _p1);
98
                    __lsx_vst(_p, outptr, 0);
99

100
                    ptr += 4;
101
                    outptr += 4;
102
                }
103
#endif // __loongarch_sx
104
                for (; i < size; i++)
105
                {
106
                    *outptr *= *ptr;
107

108
                    ptr++;
109
                    outptr++;
110
                }
111
            }
112
        }
113
    }
114
    if (op_type == Operation_SUM)
115
    {
116
        if (coeffs.w == 0)
117
        {
118
            // first blob
119
            const Mat& bottom_blob1 = bottom_blobs[1];
120
            #pragma omp parallel for num_threads(opt.num_threads)
121
            for (int q = 0; q < channels; q++)
122
            {
123
                const float* ptr = bottom_blob.channel(q);
124
                const float* ptr1 = bottom_blob1.channel(q);
125
                float* outptr = top_blob.channel(q);
126

127
                int i = 0;
128
#if __loongarch_sx
129
                for (; i + 3 < size; i += 4)
130
                {
131
                    __m128 _p = (__m128)__lsx_vld(ptr, 0);
132
                    __m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
133
                    _p = __lsx_vfadd_s(_p, _p1);
134
                    __lsx_vst(_p, outptr, 0);
135

136
                    ptr += 4;
137
                    ptr1 += 4;
138
                    outptr += 4;
139
                }
140
#endif // __loongarch_sx
141
                for (; i < size; i++)
142
                {
143
                    *outptr = *ptr + *ptr1;
144

145
                    ptr++;
146
                    ptr1++;
147
                    outptr++;
148
                }
149
            }
150

151
            for (size_t b = 2; b < bottom_blobs.size(); b++)
152
            {
153
                const Mat& bottom_blob1 = bottom_blobs[b];
154
                #pragma omp parallel for num_threads(opt.num_threads)
155
                for (int q = 0; q < channels; q++)
156
                {
157
                    const float* ptr = bottom_blob1.channel(q);
158
                    float* outptr = top_blob.channel(q);
159

160
                    int i = 0;
161
#if __loongarch_sx
162
                    for (; i + 3 < size; i += 4)
163
                    {
164
                        __m128 _p = (__m128)__lsx_vld(outptr, 0);
165
                        __m128 _p1 = (__m128)__lsx_vld(ptr, 0);
166
                        _p = __lsx_vfadd_s(_p, _p1);
167
                        __lsx_vst(_p, outptr, 0);
168

169
                        ptr += 4;
170
                        outptr += 4;
171
                    }
172
#endif // __loongarch_sx
173
                    for (; i < size; i++)
174
                    {
175
                        *outptr += *ptr;
176

177
                        ptr++;
178
                        outptr++;
179
                    }
180
                }
181
            }
182
        }
183
        else
184
        {
185
            // first blob
186
            const Mat& bottom_blob1 = bottom_blobs[1];
187
            float coeff0 = coeffs[0];
188
            float coeff1 = coeffs[1];
189
#if __loongarch_sx
190
            __m128 _coeff0 = (__m128)__lsx_vreplfr2vr_s(coeff0);
191
            __m128 _coeff1 = (__m128)__lsx_vreplfr2vr_s(coeff1);
192
#endif // __loongarch_sx
193
            #pragma omp parallel for num_threads(opt.num_threads)
194
            for (int q = 0; q < channels; q++)
195
            {
196
                const float* ptr = bottom_blob.channel(q);
197
                const float* ptr1 = bottom_blob1.channel(q);
198
                float* outptr = top_blob.channel(q);
199

200
                int i = 0;
201
#if __loongarch_sx
202
                for (; i + 3 < size; i += 4)
203
                {
204
                    __m128 _p = (__m128)__lsx_vld(ptr, 0);
205
                    __m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
206
                    _p = __lsx_vfmul_s(_p, _coeff0);
207
                    _p = __lsx_vfmadd_s(_coeff1, _p1, _p);
208
                    __lsx_vst(_p, outptr, 0);
209

210
                    ptr += 4;
211
                    ptr1 += 4;
212
                    outptr += 4;
213
                }
214
#endif // __loongarch_sx
215
                for (; i < size; i++)
216
                {
217
                    *outptr = *ptr * coeff0 + *ptr1 * coeff1;
218

219
                    ptr++;
220
                    ptr1++;
221
                    outptr++;
222
                }
223
            }
224

225
            for (size_t b = 2; b < bottom_blobs.size(); b++)
226
            {
227
                const Mat& bottom_blob1 = bottom_blobs[b];
228
                float coeff = coeffs[b];
229
#if __loongarch_sx
230
                __m128 _coeff = (__m128)__lsx_vreplfr2vr_s(coeff);
231
#endif // __loongarch_sx
232
                #pragma omp parallel for num_threads(opt.num_threads)
233
                for (int q = 0; q < channels; q++)
234
                {
235
                    const float* ptr = bottom_blob1.channel(q);
236
                    float* outptr = top_blob.channel(q);
237

238
                    int i = 0;
239
#if __loongarch_sx
240
                    for (; i + 3 < size; i += 4)
241
                    {
242
                        __m128 _p = (__m128)__lsx_vld(outptr, 0);
243
                        __m128 _p1 = (__m128)__lsx_vld(ptr, 0);
244
                        _p = __lsx_vfmadd_s(_coeff, _p1, _p);
245
                        __lsx_vst(_p, outptr, 0);
246

247
                        ptr += 4;
248
                        outptr += 4;
249
                    }
250
#endif // __loongarch_sx
251
                    for (; i < size; i++)
252
                    {
253
                        *outptr += *ptr * coeff;
254

255
                        ptr++;
256
                        outptr++;
257
                    }
258
                }
259
            }
260
        }
261
    }
262
    if (op_type == Operation_MAX)
263
    {
264
        // first blob
265
        const Mat& bottom_blob1 = bottom_blobs[1];
266
        #pragma omp parallel for num_threads(opt.num_threads)
267
        for (int q = 0; q < channels; q++)
268
        {
269
            const float* ptr = bottom_blob.channel(q);
270
            const float* ptr1 = bottom_blob1.channel(q);
271
            float* outptr = top_blob.channel(q);
272

273
            int i = 0;
274
#if __loongarch_sx
275
            for (; i + 3 < size; i += 4)
276
            {
277
                __m128 _p = (__m128)__lsx_vld(ptr, 0);
278
                __m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
279
                _p = __lsx_vfmax_s(_p, _p1);
280
                __lsx_vst(_p, outptr, 0);
281

282
                ptr += 4;
283
                ptr1 += 4;
284
                outptr += 4;
285
            }
286
#endif // __loongarch_sx
287
            for (; i < size; i++)
288
            {
289
                *outptr = std::max(*ptr, *ptr1);
290

291
                ptr++;
292
                ptr1++;
293
                outptr++;
294
            }
295
        }
296

297
        for (size_t b = 2; b < bottom_blobs.size(); b++)
298
        {
299
            const Mat& bottom_blob1 = bottom_blobs[b];
300
            #pragma omp parallel for num_threads(opt.num_threads)
301
            for (int q = 0; q < channels; q++)
302
            {
303
                const float* ptr = bottom_blob1.channel(q);
304
                float* outptr = top_blob.channel(q);
305

306
                int i = 0;
307
#if __loongarch_sx
308
                for (; i + 3 < size; i += 4)
309
                {
310
                    __m128 _p = (__m128)__lsx_vld(outptr, 0);
311
                    __m128 _p1 = (__m128)__lsx_vld(ptr, 0);
312
                    _p = __lsx_vfmax_s(_p, _p1);
313
                    __lsx_vst(_p, outptr, 0);
314

315
                    ptr += 4;
316
                    outptr += 4;
317
                }
318
#endif // __loongarch_sx
319
                for (; i < size; i++)
320
                {
321
                    *outptr = std::max(*ptr, *outptr);
322

323
                    ptr++;
324
                    outptr++;
325
                }
326
            }
327
        }
328
    }
329

330
    return 0;
331
}
332

333
} // namespace ncnn
334
ncnn

Использование cookies