ncnn

unaryop_mips.cpp
507 строк · 11.4 Кб
Перенос по словам
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
14

15
#include "unaryop_mips.h"
16

17
// #include <fenv.h>
18
#include <float.h>
19

20
#if __mips_msa
21
#include <msa.h>
22
#include "msa_mathfun.h"
23
#endif // __mips_msa
24

25
namespace ncnn {
26

27
// Constructs the MIPS unary-op layer.
// support_packing is switched on only in MSA builds; without MSA the member
// keeps whatever value it had before this constructor body ran.
// NOTE(review): presumably this tells the framework that packed (elempack > 1)
// blobs are accepted -- confirm against the base Layer class.
UnaryOp_mips::UnaryOp_mips()
{
#if __mips_msa
    support_packing = true;
#endif // __mips_msa
}
33

34
template<typename Op>
35
static int unary_op_inplace(Mat& a, const Option& opt)
36
{
37
    Op op;
38

39
    int w = a.w;
40
    int h = a.h;
41
    int d = a.d;
42
    int channels = a.c;
43
    int elempack = a.elempack;
44
    int size = w * h * d * elempack;
45

46
    #pragma omp parallel for num_threads(opt.num_threads)
47
    for (int q = 0; q < channels; q++)
48
    {
49
        float* ptr = a.channel(q);
50

51
        int i = 0;
52
#if __mips_msa
53
        for (; i + 3 < size; i += 4)
54
        {
55
            __builtin_prefetch(ptr + 16);
56
            v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
57
            _p = op.func_pack4(_p);
58
            __msa_st_w((v4i32)_p, ptr, 0);
59
            ptr += 4;
60
        }
61
#endif // __mips_msa
62
        for (; i < size; i++)
63
        {
64
            *ptr = op.func(*ptr);
65
            ptr++;
66
        }
67
    }
68

69
    return 0;
70
}
71

72
namespace UnaryOp_mips_functor {
73

74
// Absolute value functor.
struct unary_op_abs
{
    // Scalar path: |x| via fabs().
    float func(const float& x) const
    {
        const float magnitude = static_cast<float>(fabs(x));
        return magnitude;
    }
#if __mips_msa
    // 4-lane path: clears bit 31 (the IEEE-754 sign bit) of every 32-bit lane.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4u32 _bits = (v4u32)x;
        return (v4f32)__msa_bclri_w(_bits, 31);
    }
#endif // __mips_msa
};
87

88
// Negation functor.
struct unary_op_neg
{
    // Scalar path: unary minus.
    float func(const float& x) const
    {
        const float negated = -x;
        return negated;
    }
#if __mips_msa
    // 4-lane path: flips bit 31 (the IEEE-754 sign bit) of every 32-bit lane.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4u32 _bits = (v4u32)x;
        return (v4f32)__msa_bnegi_w(_bits, 31);
    }
#endif // __mips_msa
};
101

102
// floor functor: largest integral value not greater than x.
struct unary_op_floor
{
    // Scalar path: defers to floor().
    float func(const float& x) const
    {
        return (float)floor(x);
    }
#if __mips_msa
    // 4-lane path.
    v4f32 func_pack4(const v4f32& x) const
    {
        // Truncate toward zero, then step down by one in lanes where the truncated
        // value ended up above x (negative non-integers). The compare yields an
        // all-ones lane, i.e. integer -1, so the correction is a plain add.
        v4i32 _xi = __msa_ftrunc_s_w(x);
        v4i32 _mask = __msa_fclt_w(x, __msa_ffint_s_w(_xi));
        v4f32 _y = __msa_ffint_s_w(__msa_addv_w(_xi, _mask));

        // The round trip through 32-bit integers cannot represent large magnitudes,
        // Inf or NaN. Every float with |x| >= 2^23 is already integral, so only
        // lanes below that bound take the computed value; all other lanes (NaN
        // included, because the ordered compare is false) pass x through unchanged,
        // which keeps this path consistent with func().
        v4f32 _absx = (v4f32)__msa_bclri_w((v4u32)x, 31);
        v4i32 _small = __msa_fclt_w(_absx, __msa_fill_w_f32(8388608.f));
        return (v4f32)__msa_bsel_v((v16u8)_small, (v16u8)x, (v16u8)_y);
        // alternative kept for reference: switch the MSA rounding mode instead
        // int old_msacsr = __msa_cfcmsa_msacsr();
        // __msa_ctcmsa_msacsr(old_msacsr | 3); // round towards -inf
        // v4f32 y = __msa_frint_w(x);
        // __msa_ctcmsa_msacsr(old_msacsr);
        // return y;
    }
#endif // __mips_msa
};
122

123
// ceil functor: smallest integral value not less than x.
struct unary_op_ceil
{
    // Scalar path: defers to ceil().
    float func(const float& x) const
    {
        return (float)ceil(x);
    }
#if __mips_msa
    // 4-lane path.
    v4f32 func_pack4(const v4f32& x) const
    {
        // Truncate toward zero, then step up by one in lanes where the truncated
        // value ended up below x (positive non-integers). The compare yields an
        // all-ones lane, i.e. integer -1, so subtracting it adds one.
        v4i32 _xi = __msa_ftrunc_s_w(x);
        v4i32 _mask = __msa_fclt_w(__msa_ffint_s_w(_xi), x);
        v4f32 _y = __msa_ffint_s_w(__msa_subv_w(_xi, _mask));

        // The round trip through 32-bit integers cannot represent large magnitudes,
        // Inf or NaN. Every float with |x| >= 2^23 is already integral, so only
        // lanes below that bound take the computed value; all other lanes (NaN
        // included, because the ordered compare is false) pass x through unchanged,
        // which keeps this path consistent with func().
        v4f32 _absx = (v4f32)__msa_bclri_w((v4u32)x, 31);
        v4i32 _small = __msa_fclt_w(_absx, __msa_fill_w_f32(8388608.f));
        return (v4f32)__msa_bsel_v((v16u8)_small, (v16u8)x, (v16u8)_y);
        // alternative kept for reference: switch the MSA rounding mode instead
        // int old_msacsr = __msa_cfcmsa_msacsr();
        // __msa_ctcmsa_msacsr((old_msacsr | 3) ^ 1); // round towards +inf
        // v4f32 y = __msa_frint_w(x);
        // __msa_ctcmsa_msacsr(old_msacsr);
        // return y;
    }
#endif // __mips_msa
};
143

144
// Squaring functor: x * x.
struct unary_op_square
{
    // Scalar path.
    float func(const float& x) const
    {
        const float squared = x * x;
        return squared;
    }
#if __mips_msa
    // 4-lane path: lane-wise multiply of x with itself.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4f32 _squared = __msa_fmul_w(x, x);
        return _squared;
    }
#endif // __mips_msa
};
157

158
// Square-root functor.
struct unary_op_sqrt
{
    // Scalar path: defers to sqrt().
    float func(const float& x) const
    {
        return static_cast<float>(sqrt(x));
    }
#if __mips_msa
    // 4-lane path: MSA vector square root.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4f32 _root = __msa_fsqrt_w(x);
        return _root;
    }
#endif // __mips_msa
};
171

172
// Reciprocal square-root functor: 1 / sqrt(x).
struct unary_op_rsqrt
{
    // Scalar path: explicit division by sqrt().
    float func(const float& x) const
    {
        return static_cast<float>(1.f / sqrt(x));
    }
#if __mips_msa
    // 4-lane path: MSA reciprocal-square-root instruction.
    // NOTE(review): its precision relative to the scalar path is not established
    // by this file -- confirm against the MSA specification if exactness matters.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4f32 _inv_root = __msa_frsqrt_w(x);
        return _inv_root;
    }
#endif // __mips_msa
};
185

186
// Natural exponential functor: e^x.
struct unary_op_exp
{
    // Scalar path: defers to exp().
    float func(const float& x) const
    {
        return static_cast<float>(exp(x));
    }
#if __mips_msa
    // 4-lane path: delegates to the exp_ps vector helper.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4f32 _e = exp_ps(x);
        return _e;
    }
#endif // __mips_msa
};
199

200
// Natural logarithm functor.
struct unary_op_log
{
    // Scalar path: defers to log().
    float func(const float& x) const
    {
        return static_cast<float>(log(x));
    }
#if __mips_msa
    // 4-lane path: delegates to the log_ps vector helper.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4f32 _ln = log_ps(x);
        return _ln;
    }
#endif // __mips_msa
};
213

214
// Sine functor.
struct unary_op_sin
{
    // Scalar path: defers to sin().
    float func(const float& x) const
    {
        return static_cast<float>(sin(x));
    }
#if __mips_msa
    // 4-lane path: no vector kernel yet, so the lanes are spilled to a small
    // array, evaluated one by one with sin(), and reloaded.
    v4f32 func_pack4(const v4f32& x) const
    {
        // TODO msa optimize
        float lanes[4];
        __msa_st_w((v4i32)x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = sin(lanes[k]);
        }
        return (v4f32)__msa_ld_w(lanes, 0);
    }
#endif // __mips_msa
};
234

235
// Cosine functor.
struct unary_op_cos
{
    // Scalar path: defers to cos().
    float func(const float& x) const
    {
        return static_cast<float>(cos(x));
    }
#if __mips_msa
    // 4-lane path: no vector kernel yet, so the lanes are spilled to a small
    // array, evaluated one by one with cos(), and reloaded.
    v4f32 func_pack4(const v4f32& x) const
    {
        // TODO msa optimize
        float lanes[4];
        __msa_st_w((v4i32)x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = cos(lanes[k]);
        }
        return (v4f32)__msa_ld_w(lanes, 0);
    }
#endif // __mips_msa
};
255

256
// Tangent functor.
struct unary_op_tan
{
    // Scalar path: defers to tan().
    float func(const float& x) const
    {
        return static_cast<float>(tan(x));
    }
#if __mips_msa
    // 4-lane path: no vector kernel yet, so the lanes are spilled to a small
    // array, evaluated one by one with tan(), and reloaded.
    v4f32 func_pack4(const v4f32& x) const
    {
        // TODO msa optimize
        float lanes[4];
        __msa_st_w((v4i32)x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = tan(lanes[k]);
        }
        return (v4f32)__msa_ld_w(lanes, 0);
    }
#endif // __mips_msa
};
276

277
// Arc-sine functor.
struct unary_op_asin
{
    // Scalar path: defers to asin().
    float func(const float& x) const
    {
        return static_cast<float>(asin(x));
    }
#if __mips_msa
    // 4-lane path: no vector kernel yet, so the lanes are spilled to a small
    // array, evaluated one by one with asin(), and reloaded.
    v4f32 func_pack4(const v4f32& x) const
    {
        // TODO msa optimize
        float lanes[4];
        __msa_st_w((v4i32)x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = asin(lanes[k]);
        }
        return (v4f32)__msa_ld_w(lanes, 0);
    }
#endif // __mips_msa
};
297

298
// Arc-cosine functor.
struct unary_op_acos
{
    // Scalar path: defers to acos().
    float func(const float& x) const
    {
        return static_cast<float>(acos(x));
    }
#if __mips_msa
    // 4-lane path: no vector kernel yet, so the lanes are spilled to a small
    // array, evaluated one by one with acos(), and reloaded.
    v4f32 func_pack4(const v4f32& x) const
    {
        // TODO msa optimize
        float lanes[4];
        __msa_st_w((v4i32)x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = acos(lanes[k]);
        }
        return (v4f32)__msa_ld_w(lanes, 0);
    }
#endif // __mips_msa
};
318

319
// Arc-tangent functor.
struct unary_op_atan
{
    // Scalar path: defers to atan().
    float func(const float& x) const
    {
        return static_cast<float>(atan(x));
    }
#if __mips_msa
    // 4-lane path: no vector kernel yet, so the lanes are spilled to a small
    // array, evaluated one by one with atan(), and reloaded.
    v4f32 func_pack4(const v4f32& x) const
    {
        // TODO msa optimize
        float lanes[4];
        __msa_st_w((v4i32)x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = atan(lanes[k]);
        }
        return (v4f32)__msa_ld_w(lanes, 0);
    }
#endif // __mips_msa
};
339

340
// Reciprocal functor: 1 / x.
struct unary_op_reciprocal
{
    // Scalar path: plain floating-point division.
    float func(const float& x) const
    {
        const float one = 1.f;
        return one / x;
    }
#if __mips_msa
    // 4-lane path: MSA reciprocal instruction.
    // NOTE(review): its precision relative to the scalar division is not
    // established by this file -- confirm against the MSA specification.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4f32 _inv = __msa_frcp_w(x);
        return _inv;
    }
#endif // __mips_msa
};
353

354
// Hyperbolic tangent functor.
struct unary_op_tanh
{
    // Scalar path: defers to tanh().
    float func(const float& x) const
    {
        return static_cast<float>(tanh(x));
    }
#if __mips_msa
    // 4-lane path: delegates to the tanh_ps vector helper.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4f32 _t = tanh_ps(x);
        return _t;
    }
#endif // __mips_msa
};
367

368
// Base-10 logarithm functor.
struct unary_op_log10
{
    // Scalar path: defers to log10().
    float func(const float& x) const
    {
        return static_cast<float>(log10(x));
    }
#if __mips_msa
    // 4-lane path: log10(x) = ln(x) * log10(e), with ln taken from log_ps.
    v4f32 func_pack4(const v4f32& x) const
    {
        const v4f32 _ln = log_ps(x);
        const v4f32 _log10_e = __msa_fill_w_f32(0.434294481903);
        return __msa_fmul_w(_ln, _log10_e);
    }
#endif // __mips_msa
};
381

382
// Rounding functor: round to nearest integer, ties to even.
struct unary_op_round
{
    // Scalar path.
    float func(const float& x) const
    {
        // round to nearest even
#if NCNN_GNU_INLINE_ASM
        // Computes (x + 12582912.f) - 12582912.f with two FPU instructions.
        // 12582912 = 1.5 * 2^23: adding it leaves no room for fraction bits in a
        // float, so the FPU rounds the sum, and subtracting it restores the scale.
        // NOTE(review): presumably written as asm so the compiler cannot fold the
        // add/sub pair away; the trick only holds for small magnitudes (roughly
        // |x| < 2^22) -- confirm.
        float y;
        const float magic = 12582912.f;
        // The output is written by add.s while %2 (magic) is still needed by
        // sub.s, so it must be early-clobber ("=&f"). Without the '&' the compiler
        // may place y and magic in the same register, and the result becomes 0.
        asm volatile(
            "add.s   %0, %1, %2  \n"
            "sub.s   %0, %0, %2  \n"
            : "=&f"(y)
            : "f"(x), "f"(magic)
            :);
        return y;
#else
        // Portable fallback: nearbyintf() honours the current rounding mode, so
        // force round-to-nearest around the call when <fenv.h> macros are visible.
#ifdef FE_TONEAREST
        int old_rm = fegetround();
        fesetround(FE_TONEAREST);
#endif
        float y = nearbyintf(x);
#ifdef FE_TONEAREST
        fesetround(old_rm);
#endif
        return y;
#endif
    }
#if __mips_msa
    // 4-lane path: rounds to integral using the current MSA rounding mode.
    v4f32 func_pack4(const v4f32& x) const
    {
        // round towards nearest even by default
        return __msa_frint_w(x);
    }
#endif // __mips_msa
};
418

419
// Truncation functor: rounds toward zero.
struct unary_op_trunc
{
    // Scalar path: defers to truncf().
    float func(const float& x) const
    {
        return (float)truncf(x);
    }
#if __mips_msa
    // 4-lane path.
    v4f32 func_pack4(const v4f32& x) const
    {
        // convert to 32-bit integers with truncation, then back to float
        v4f32 _y = __msa_ffint_s_w(__msa_ftrunc_s_w(x));

        // The round trip through 32-bit integers cannot represent large magnitudes,
        // Inf or NaN. Every float with |x| >= 2^23 is already integral, so only
        // lanes below that bound take the computed value; all other lanes (NaN
        // included, because the ordered compare is false) pass x through unchanged,
        // which keeps this path consistent with func().
        v4f32 _absx = (v4f32)__msa_bclri_w((v4u32)x, 31);
        v4i32 _small = __msa_fclt_w(_absx, __msa_fill_w_f32(8388608.f));
        return (v4f32)__msa_bsel_v((v16u8)_small, (v16u8)x, (v16u8)_y);
        // alternative kept for reference: switch the MSA rounding mode instead
        // int old_msacsr = __msa_cfcmsa_msacsr();
        // __msa_ctcmsa_msacsr((old_msacsr | 3) ^ 2); // round towards zero
        // v4f32 y = __msa_frint_w(x);
        // __msa_ctcmsa_msacsr(old_msacsr);
        // return y;
    }
#endif // __mips_msa
};
437

438
} // namespace UnaryOp_mips_functor
439

440
// Runs the unary operation selected by op_type over bottom_top_blob in place.
//
// Each recognised op_type maps to one functor from UnaryOp_mips_functor, which is
// handed to unary_op_inplace together with opt. The return value is that call's
// result. An unrecognised op_type leaves the blob untouched and returns 0.
int UnaryOp_mips::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    using namespace UnaryOp_mips_functor;

    switch (op_type)
    {
    case Operation_ABS:
        return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt);
    case Operation_NEG:
        return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt);
    case Operation_FLOOR:
        return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt);
    case Operation_CEIL:
        return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt);
    case Operation_SQUARE:
        return unary_op_inplace<unary_op_square>(bottom_top_blob, opt);
    case Operation_SQRT:
        return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt);
    case Operation_RSQRT:
        return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt);
    case Operation_EXP:
        return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt);
    case Operation_LOG:
        return unary_op_inplace<unary_op_log>(bottom_top_blob, opt);
    case Operation_SIN:
        return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt);
    case Operation_COS:
        return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt);
    case Operation_TAN:
        return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt);
    case Operation_ASIN:
        return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt);
    case Operation_ACOS:
        return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt);
    case Operation_ATAN:
        return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt);
    case Operation_RECIPROCAL:
        return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt);
    case Operation_TANH:
        return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt);
    case Operation_LOG10:
        return unary_op_inplace<unary_op_log10>(bottom_top_blob, opt);
    case Operation_ROUND:
        return unary_op_inplace<unary_op_round>(bottom_top_blob, opt);
    case Operation_TRUNC:
        return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
    default:
        break;
    }

    // no matching operation: nothing to do
    return 0;
}
506

507
} // namespace ncnn
508
ncnn

Использование cookies