15
#include "requantize_mips.h"
21
#include "mips_activation.h"
22
#include "mips_usability.h"
27
#include "requantize_leakyrelu_pack4.h"
28
#include "requantize_leakyrelu_pack8.h"
29
#include "requantize_relu_pack4.h"
30
#include "requantize_relu_pack8.h"
33
// Constructor: declare the capabilities of this MIPS-optimized Requantize layer.
// The forward() implementation processes elempack=4 (v4f32 MSA vectors) and
// elempack=8 (paired v4f32) blobs, so packed layouts are supported.
Requantize_mips::Requantize_mips()
{
    support_packing = true;
}
40
int Requantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
42
int dims = bottom_blob.dims;
43
int elempack = bottom_blob.elempack;
50
int w = bottom_blob.w;
52
top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
56
if (scale_in_data_size == 1 && scale_out_data_size == 1)
58
v4f32 _scale_in = (v4f32)__msa_fill_w_f32(scale_in_data[0]);
59
v4f32 _scale_out = (v4f32)__msa_fill_w_f32(scale_out_data[0]);
61
if (bias_data_size == 0)
63
#pragma omp parallel for num_threads(opt.num_threads)
64
for (int i = 0; i < w; i++)
66
const int* intptr = (const int*)bottom_blob + i * 8;
67
signed char* ptr = (signed char*)top_blob + i * 8;
69
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
70
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
71
_v0 = __msa_fmul_w(_v0, _scale_in);
72
_v1 = __msa_fmul_w(_v1, _scale_in);
73
_v0 = activation_ps(_v0, activation_type, activation_params);
74
_v1 = activation_ps(_v1, activation_type, activation_params);
75
_v0 = __msa_fmul_w(_v0, _scale_out);
76
_v1 = __msa_fmul_w(_v1, _scale_out);
77
*((int64_t*)ptr) = float2int8(_v0, _v1);
80
else if (bias_data_size == 1)
82
v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]);
84
#pragma omp parallel for num_threads(opt.num_threads)
85
for (int i = 0; i < w; i++)
87
const int* intptr = (const int*)bottom_blob + i * 8;
88
signed char* ptr = (signed char*)top_blob + i * 8;
90
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
91
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
92
_v0 = __msa_fmadd_w(_bias, _v0, _scale_in);
93
_v1 = __msa_fmadd_w(_bias, _v1, _scale_in);
94
_v0 = activation_ps(_v0, activation_type, activation_params);
95
_v1 = activation_ps(_v1, activation_type, activation_params);
96
_v0 = __msa_fmul_w(_v0, _scale_out);
97
_v1 = __msa_fmul_w(_v1, _scale_out);
98
*((int64_t*)ptr) = float2int8(_v0, _v1);
103
#pragma omp parallel for num_threads(opt.num_threads)
104
for (int i = 0; i < w; i++)
106
const int* intptr = (const int*)bottom_blob + i * 8;
107
signed char* ptr = (signed char*)top_blob + i * 8;
109
v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0);
110
v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0);
111
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
112
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
113
_v0 = __msa_fmadd_w(_bias0, _v0, _scale_in);
114
_v1 = __msa_fmadd_w(_bias1, _v1, _scale_in);
115
_v0 = activation_ps(_v0, activation_type, activation_params);
116
_v1 = activation_ps(_v1, activation_type, activation_params);
117
_v0 = __msa_fmul_w(_v0, _scale_out);
118
_v1 = __msa_fmul_w(_v1, _scale_out);
119
*((int64_t*)ptr) = float2int8(_v0, _v1);
123
else if (scale_in_data_size == 1 && scale_out_data_size > 1)
125
v4f32 _scale_in = (v4f32)__msa_fill_w_f32(scale_in_data[0]);
127
if (bias_data_size == 0)
129
#pragma omp parallel for num_threads(opt.num_threads)
130
for (int i = 0; i < w; i++)
132
const int* intptr = (const int*)bottom_blob + i * 8;
133
signed char* ptr = (signed char*)top_blob + i * 8;
135
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
136
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
137
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
138
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
139
_v0 = __msa_fmul_w(_v0, _scale_in);
140
_v1 = __msa_fmul_w(_v1, _scale_in);
141
_v0 = activation_ps(_v0, activation_type, activation_params);
142
_v1 = activation_ps(_v1, activation_type, activation_params);
143
_v0 = __msa_fmul_w(_v0, _scale_out0);
144
_v1 = __msa_fmul_w(_v1, _scale_out1);
145
*((int64_t*)ptr) = float2int8(_v0, _v1);
148
else if (bias_data_size == 1)
150
v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]);
152
#pragma omp parallel for num_threads(opt.num_threads)
153
for (int i = 0; i < w; i++)
155
const int* intptr = (const int*)bottom_blob + i * 8;
156
signed char* ptr = (signed char*)top_blob + i * 8;
158
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
159
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
160
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
161
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
162
_v0 = __msa_fmadd_w(_bias, _v0, _scale_in);
163
_v1 = __msa_fmadd_w(_bias, _v1, _scale_in);
164
_v0 = activation_ps(_v0, activation_type, activation_params);
165
_v1 = activation_ps(_v1, activation_type, activation_params);
166
_v0 = __msa_fmul_w(_v0, _scale_out0);
167
_v1 = __msa_fmul_w(_v1, _scale_out1);
168
*((int64_t*)ptr) = float2int8(_v0, _v1);
173
#pragma omp parallel for num_threads(opt.num_threads)
174
for (int i = 0; i < w; i++)
176
const int* intptr = (const int*)bottom_blob + i * 8;
177
signed char* ptr = (signed char*)top_blob + i * 8;
179
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
180
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
181
v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0);
182
v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0);
183
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
184
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
185
_v0 = __msa_fmadd_w(_bias0, _v0, _scale_in);
186
_v1 = __msa_fmadd_w(_bias1, _v1, _scale_in);
187
_v0 = activation_ps(_v0, activation_type, activation_params);
188
_v1 = activation_ps(_v1, activation_type, activation_params);
189
_v0 = __msa_fmul_w(_v0, _scale_out0);
190
_v1 = __msa_fmul_w(_v1, _scale_out1);
191
*((int64_t*)ptr) = float2int8(_v0, _v1);
195
else if (scale_in_data_size > 1 && scale_out_data_size == 1)
197
v4f32 _scale_out = (v4f32)__msa_fill_w_f32(scale_out_data[0]);
199
if (bias_data_size == 0)
201
#pragma omp parallel for num_threads(opt.num_threads)
202
for (int i = 0; i < w; i++)
204
const int* intptr = (const int*)bottom_blob + i * 8;
205
signed char* ptr = (signed char*)top_blob + i * 8;
207
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
208
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
209
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
210
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
211
_v0 = __msa_fmul_w(_v0, _scale_in0);
212
_v1 = __msa_fmul_w(_v1, _scale_in1);
213
_v0 = activation_ps(_v0, activation_type, activation_params);
214
_v1 = activation_ps(_v1, activation_type, activation_params);
215
_v0 = __msa_fmul_w(_v0, _scale_out);
216
_v1 = __msa_fmul_w(_v1, _scale_out);
217
*((int64_t*)ptr) = float2int8(_v0, _v1);
220
else if (bias_data_size == 1)
222
v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]);
224
#pragma omp parallel for num_threads(opt.num_threads)
225
for (int i = 0; i < w; i++)
227
const int* intptr = (const int*)bottom_blob + i * 8;
228
signed char* ptr = (signed char*)top_blob + i * 8;
230
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
231
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
232
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
233
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
234
_v0 = __msa_fmadd_w(_bias, _v0, _scale_in0);
235
_v1 = __msa_fmadd_w(_bias, _v1, _scale_in1);
236
_v0 = activation_ps(_v0, activation_type, activation_params);
237
_v1 = activation_ps(_v1, activation_type, activation_params);
238
_v0 = __msa_fmul_w(_v0, _scale_out);
239
_v1 = __msa_fmul_w(_v1, _scale_out);
240
*((int64_t*)ptr) = float2int8(_v0, _v1);
245
#pragma omp parallel for num_threads(opt.num_threads)
246
for (int i = 0; i < w; i++)
248
const int* intptr = (const int*)bottom_blob + i * 8;
249
signed char* ptr = (signed char*)top_blob + i * 8;
251
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
252
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
253
v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0);
254
v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0);
255
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
256
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
257
_v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0);
258
_v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1);
259
_v0 = activation_ps(_v0, activation_type, activation_params);
260
_v1 = activation_ps(_v1, activation_type, activation_params);
261
_v0 = __msa_fmul_w(_v0, _scale_out);
262
_v1 = __msa_fmul_w(_v1, _scale_out);
263
*((int64_t*)ptr) = float2int8(_v0, _v1);
269
if (bias_data_size == 0)
271
#pragma omp parallel for num_threads(opt.num_threads)
272
for (int i = 0; i < w; i++)
274
const int* intptr = (const int*)bottom_blob + i * 8;
275
signed char* ptr = (signed char*)top_blob + i * 8;
277
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
278
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
279
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
280
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
281
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
282
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
283
_v0 = __msa_fmul_w(_v0, _scale_in0);
284
_v1 = __msa_fmul_w(_v1, _scale_in1);
285
_v0 = activation_ps(_v0, activation_type, activation_params);
286
_v1 = activation_ps(_v1, activation_type, activation_params);
287
_v0 = __msa_fmul_w(_v0, _scale_out0);
288
_v1 = __msa_fmul_w(_v1, _scale_out1);
289
*((int64_t*)ptr) = float2int8(_v0, _v1);
292
else if (bias_data_size == 1)
294
v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]);
296
#pragma omp parallel for num_threads(opt.num_threads)
297
for (int i = 0; i < w; i++)
299
const int* intptr = (const int*)bottom_blob + i * 8;
300
signed char* ptr = (signed char*)top_blob + i * 8;
302
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
303
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
304
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
305
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
306
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
307
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
308
_v0 = __msa_fmadd_w(_bias, _v0, _scale_in0);
309
_v1 = __msa_fmadd_w(_bias, _v1, _scale_in1);
310
_v0 = activation_ps(_v0, activation_type, activation_params);
311
_v1 = activation_ps(_v1, activation_type, activation_params);
312
_v0 = __msa_fmul_w(_v0, _scale_out0);
313
_v1 = __msa_fmul_w(_v1, _scale_out1);
314
*((int64_t*)ptr) = float2int8(_v0, _v1);
319
#pragma omp parallel for num_threads(opt.num_threads)
320
for (int i = 0; i < w; i++)
322
const int* intptr = (const int*)bottom_blob + i * 8;
323
signed char* ptr = (signed char*)top_blob + i * 8;
325
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
326
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
327
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
328
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
329
v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0);
330
v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0);
331
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
332
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
333
_v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0);
334
_v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1);
335
_v0 = activation_ps(_v0, activation_type, activation_params);
336
_v1 = activation_ps(_v1, activation_type, activation_params);
337
_v0 = __msa_fmul_w(_v0, _scale_out0);
338
_v1 = __msa_fmul_w(_v1, _scale_out1);
339
*((int64_t*)ptr) = float2int8(_v0, _v1);
347
int w = bottom_blob.w;
348
int h = bottom_blob.h;
350
top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
351
if (top_blob.empty())
354
if (bias_data_size == 0)
356
#pragma omp parallel for num_threads(opt.num_threads)
357
for (int i = 0; i < h; i++)
359
const int* intptr = bottom_blob.row<const int>(i);
360
signed char* ptr = top_blob.row<signed char>(i);
362
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
363
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
364
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
365
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
367
for (int j = 0; j < w; j++)
369
__builtin_prefetch(intptr + 32);
370
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
371
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
372
_v0 = __msa_fmul_w(_v0, _scale_in0);
373
_v1 = __msa_fmul_w(_v1, _scale_in1);
374
_v0 = activation_ps(_v0, activation_type, activation_params);
375
_v1 = activation_ps(_v1, activation_type, activation_params);
376
_v0 = __msa_fmul_w(_v0, _scale_out0);
377
_v1 = __msa_fmul_w(_v1, _scale_out1);
378
*((int64_t*)ptr) = float2int8(_v0, _v1);
387
#pragma omp parallel for num_threads(opt.num_threads)
388
for (int i = 0; i < h; i++)
390
const int* intptr = bottom_blob.row<const int>(i);
391
signed char* ptr = top_blob.row<signed char>(i);
393
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
394
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
395
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
396
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
397
v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0);
398
v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0);
400
for (int j = 0; j < w; j++)
402
__builtin_prefetch(intptr + 32);
403
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
404
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
405
_v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0);
406
_v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1);
407
_v0 = activation_ps(_v0, activation_type, activation_params);
408
_v1 = activation_ps(_v1, activation_type, activation_params);
409
_v0 = __msa_fmul_w(_v0, _scale_out0);
410
_v1 = __msa_fmul_w(_v1, _scale_out1);
411
*((int64_t*)ptr) = float2int8(_v0, _v1);
422
int w = bottom_blob.w;
423
int h = bottom_blob.h;
424
int channels = bottom_blob.c;
427
top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
428
if (top_blob.empty())
431
if (activation_type == 1)
433
requantize_relu_pack8_msa(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt);
437
if (activation_type == 2 && activation_params[0] > 0.f)
439
requantize_leakyrelu_pack8_msa(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt);
443
if (bias_data_size == 0)
445
#pragma omp parallel for num_threads(opt.num_threads)
446
for (int q = 0; q < channels; q++)
448
const int* intptr = bottom_blob.channel(q);
449
signed char* ptr = top_blob.channel(q);
451
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0);
452
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0);
453
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0);
454
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0);
456
for (int i = 0; i < size; i++)
458
__builtin_prefetch(intptr + 32);
459
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
460
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
461
_v0 = __msa_fmul_w(_v0, _scale_in0);
462
_v1 = __msa_fmul_w(_v1, _scale_in1);
463
_v0 = activation_ps(_v0, activation_type, activation_params);
464
_v1 = activation_ps(_v1, activation_type, activation_params);
465
_v0 = __msa_fmul_w(_v0, _scale_out0);
466
_v1 = __msa_fmul_w(_v1, _scale_out1);
467
*((int64_t*)ptr) = float2int8(_v0, _v1);
476
#pragma omp parallel for num_threads(opt.num_threads)
477
for (int q = 0; q < channels; q++)
479
const int* intptr = bottom_blob.channel(q);
480
signed char* ptr = top_blob.channel(q);
482
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0);
483
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0);
484
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0);
485
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0);
486
v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0);
487
v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0);
489
for (int i = 0; i < size; i++)
491
__builtin_prefetch(intptr + 32);
492
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
493
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0));
494
_v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0);
495
_v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1);
496
_v0 = activation_ps(_v0, activation_type, activation_params);
497
_v1 = activation_ps(_v1, activation_type, activation_params);
498
_v0 = __msa_fmul_w(_v0, _scale_out0);
499
_v1 = __msa_fmul_w(_v1, _scale_out1);
500
*((int64_t*)ptr) = float2int8(_v0, _v1);
516
int w = bottom_blob.w;
517
int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
518
int outw = w * elempack / out_elempack;
520
top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
521
if (top_blob.empty())
524
if (scale_in_data_size == 1 && scale_out_data_size == 1)
526
v4f32 _scale_in = (v4f32)__msa_fill_w_f32(scale_in_data[0]);
527
v4f32 _scale_out = (v4f32)__msa_fill_w_f32(scale_out_data[0]);
529
if (bias_data_size == 0)
531
#pragma omp parallel for num_threads(opt.num_threads)
532
for (int i = 0; i < w; i++)
534
const int* intptr = (const int*)bottom_blob + i * 4;
535
signed char* ptr = (signed char*)top_blob + i * 4;
537
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
538
_v = __msa_fmul_w(_v, _scale_in);
539
_v = activation_ps(_v, activation_type, activation_params);
540
_v = __msa_fmul_w(_v, _scale_out);
541
v16i8 v = float2int8(_v);
548
else if (bias_data_size == 1)
550
v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]);
552
#pragma omp parallel for num_threads(opt.num_threads)
553
for (int i = 0; i < w; i++)
555
const int* intptr = (const int*)bottom_blob + i * 4;
556
signed char* ptr = (signed char*)top_blob + i * 4;
558
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
559
_v = __msa_fmadd_w(_bias, _v, _scale_in);
560
_v = activation_ps(_v, activation_type, activation_params);
561
_v = __msa_fmul_w(_v, _scale_out);
562
v16i8 v = float2int8(_v);
571
#pragma omp parallel for num_threads(opt.num_threads)
572
for (int i = 0; i < w; i++)
574
const int* intptr = (const int*)bottom_blob + i * 4;
575
signed char* ptr = (signed char*)top_blob + i * 4;
577
v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0);
578
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
579
_v = __msa_fmadd_w(_bias, _v, _scale_in);
580
_v = activation_ps(_v, activation_type, activation_params);
581
_v = __msa_fmul_w(_v, _scale_out);
582
v16i8 v = float2int8(_v);
590
else if (scale_in_data_size == 1 && scale_out_data_size > 1)
592
v4f32 _scale_in = (v4f32)__msa_fill_w_f32(scale_in_data[0]);
594
if (bias_data_size == 0)
596
#pragma omp parallel for num_threads(opt.num_threads)
597
for (int i = 0; i < w; i++)
599
const int* intptr = (const int*)bottom_blob + i * 4;
600
signed char* ptr = (signed char*)top_blob + i * 4;
602
v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0);
603
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
604
_v = __msa_fmul_w(_v, _scale_in);
605
_v = activation_ps(_v, activation_type, activation_params);
606
_v = __msa_fmul_w(_v, _scale_out);
607
v16i8 v = float2int8(_v);
614
else if (bias_data_size == 1)
616
v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]);
618
#pragma omp parallel for num_threads(opt.num_threads)
619
for (int i = 0; i < w; i++)
621
const int* intptr = (const int*)bottom_blob + i * 4;
622
signed char* ptr = (signed char*)top_blob + i * 4;
624
v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0);
625
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
626
_v = __msa_fmadd_w(_bias, _v, _scale_in);
627
_v = activation_ps(_v, activation_type, activation_params);
628
_v = __msa_fmul_w(_v, _scale_out);
629
v16i8 v = float2int8(_v);
638
#pragma omp parallel for num_threads(opt.num_threads)
639
for (int i = 0; i < w; i++)
641
const int* intptr = (const int*)bottom_blob + i * 4;
642
signed char* ptr = (signed char*)top_blob + i * 4;
644
v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0);
645
v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0);
646
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
647
_v = __msa_fmadd_w(_bias, _v, _scale_in);
648
_v = activation_ps(_v, activation_type, activation_params);
649
_v = __msa_fmul_w(_v, _scale_out);
650
v16i8 v = float2int8(_v);
658
else if (scale_in_data_size > 1 && scale_out_data_size == 1)
660
v4f32 _scale_out = (v4f32)__msa_fill_w_f32(scale_out_data[0]);
662
if (bias_data_size == 0)
664
#pragma omp parallel for num_threads(opt.num_threads)
665
for (int i = 0; i < w; i++)
667
const int* intptr = (const int*)bottom_blob + i * 4;
668
signed char* ptr = (signed char*)top_blob + i * 4;
670
v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0);
671
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
672
_v = __msa_fmul_w(_v, _scale_in);
673
_v = activation_ps(_v, activation_type, activation_params);
674
_v = __msa_fmul_w(_v, _scale_out);
675
v16i8 v = float2int8(_v);
682
else if (bias_data_size == 1)
684
v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]);
686
#pragma omp parallel for num_threads(opt.num_threads)
687
for (int i = 0; i < w; i++)
689
const int* intptr = (const int*)bottom_blob + i * 4;
690
signed char* ptr = (signed char*)top_blob + i * 4;
692
v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0);
693
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
694
_v = __msa_fmadd_w(_bias, _v, _scale_in);
695
_v = activation_ps(_v, activation_type, activation_params);
696
_v = __msa_fmul_w(_v, _scale_out);
697
v16i8 v = float2int8(_v);
706
#pragma omp parallel for num_threads(opt.num_threads)
707
for (int i = 0; i < w; i++)
709
const int* intptr = (const int*)bottom_blob + i * 4;
710
signed char* ptr = (signed char*)top_blob + i * 4;
712
v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0);
713
v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0);
714
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
715
_v = __msa_fmadd_w(_bias, _v, _scale_in);
716
_v = activation_ps(_v, activation_type, activation_params);
717
_v = __msa_fmul_w(_v, _scale_out);
718
v16i8 v = float2int8(_v);
728
if (bias_data_size == 0)
730
#pragma omp parallel for num_threads(opt.num_threads)
731
for (int i = 0; i < w; i++)
733
const int* intptr = (const int*)bottom_blob + i * 4;
734
signed char* ptr = (signed char*)top_blob + i * 4;
736
v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0);
737
v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0);
738
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
739
_v = __msa_fmul_w(_v, _scale_in);
740
_v = activation_ps(_v, activation_type, activation_params);
741
_v = __msa_fmul_w(_v, _scale_out);
742
v16i8 v = float2int8(_v);
749
else if (bias_data_size == 1)
751
v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]);
753
#pragma omp parallel for num_threads(opt.num_threads)
754
for (int i = 0; i < w; i++)
756
const int* intptr = (const int*)bottom_blob + i * 4;
757
signed char* ptr = (signed char*)top_blob + i * 4;
759
v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0);
760
v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0);
761
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
762
_v = __msa_fmadd_w(_bias, _v, _scale_in);
763
_v = activation_ps(_v, activation_type, activation_params);
764
_v = __msa_fmul_w(_v, _scale_out);
765
v16i8 v = float2int8(_v);
774
#pragma omp parallel for num_threads(opt.num_threads)
775
for (int i = 0; i < w; i++)
777
const int* intptr = (const int*)bottom_blob + i * 4;
778
signed char* ptr = (signed char*)top_blob + i * 4;
780
v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0);
781
v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0);
782
v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0);
783
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
784
_v = __msa_fmadd_w(_bias, _v, _scale_in);
785
_v = activation_ps(_v, activation_type, activation_params);
786
_v = __msa_fmul_w(_v, _scale_out);
787
v16i8 v = float2int8(_v);
799
int w = bottom_blob.w;
800
int h = bottom_blob.h;
801
int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
802
int outh = h * elempack / out_elempack;
804
top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
805
if (top_blob.empty())
808
if (out_elempack == 8)
810
if (bias_data_size == 0)
812
#pragma omp parallel for num_threads(opt.num_threads)
813
for (int i = 0; i < outh; i++)
815
const int* intptr0 = bottom_blob.row<const int>(i * 2);
816
const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
817
signed char* ptr = top_blob.row<signed char>(i);
819
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
820
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
821
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
822
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
824
for (int j = 0; j < w; j++)
826
__builtin_prefetch(intptr0 + 16);
827
__builtin_prefetch(intptr1 + 16);
828
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0));
829
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0));
830
_v0 = __msa_fmul_w(_v0, _scale_in0);
831
_v1 = __msa_fmul_w(_v1, _scale_in1);
832
_v0 = activation_ps(_v0, activation_type, activation_params);
833
_v1 = activation_ps(_v1, activation_type, activation_params);
834
_v0 = __msa_fmul_w(_v0, _scale_out0);
835
_v1 = __msa_fmul_w(_v1, _scale_out1);
836
*((int64_t*)ptr) = float2int8(_v0, _v1);
846
#pragma omp parallel for num_threads(opt.num_threads)
847
for (int i = 0; i < outh; i++)
849
const int* intptr0 = bottom_blob.row<const int>(i * 2);
850
const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
851
signed char* ptr = top_blob.row<signed char>(i);
853
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0);
854
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0);
855
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0);
856
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0);
857
v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0);
858
v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0);
860
for (int j = 0; j < w; j++)
862
__builtin_prefetch(intptr0 + 16);
863
__builtin_prefetch(intptr1 + 16);
864
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0));
865
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0));
866
_v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0);
867
_v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1);
868
_v0 = activation_ps(_v0, activation_type, activation_params);
869
_v1 = activation_ps(_v1, activation_type, activation_params);
870
_v0 = __msa_fmul_w(_v0, _scale_out0);
871
_v1 = __msa_fmul_w(_v1, _scale_out1);
872
*((int64_t*)ptr) = float2int8(_v0, _v1);
881
if (out_elempack == 1)
883
if (bias_data_size == 0)
885
#pragma omp parallel for num_threads(opt.num_threads)
886
for (int i = 0; i < h; i++)
888
const int* intptr = bottom_blob.row<const int>(i);
889
signed char* ptr0 = top_blob.row<signed char>(i * 4);
890
signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
891
signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
892
signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);
894
v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0);
895
v4f32 _scale_out = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0);
897
for (int j = 0; j < w; j++)
899
__builtin_prefetch(intptr + 16);
900
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
901
_v = __msa_fmul_w(_v, _scale_in);
902
_v = activation_ps(_v, activation_type, activation_params);
903
_v = __msa_fmul_w(_v, _scale_out);
904
v16i8 v = float2int8(_v);
920
#pragma omp parallel for num_threads(opt.num_threads)
921
for (int i = 0; i < h; i++)
923
const int* intptr = bottom_blob.row<const int>(i);
924
signed char* ptr0 = top_blob.row<signed char>(i * 4);
925
signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
926
signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
927
signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);
929
v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0);
930
v4f32 _scale_out = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0);
931
v4f32 _bias = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0);
933
for (int j = 0; j < w; j++)
935
__builtin_prefetch(intptr + 16);
936
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
937
_v = __msa_fmadd_w(_bias, _v, _scale_in);
938
_v = activation_ps(_v, activation_type, activation_params);
939
_v = __msa_fmul_w(_v, _scale_out);
940
v16i8 v = float2int8(_v);
959
int w = bottom_blob.w;
960
int h = bottom_blob.h;
961
int channels = bottom_blob.c;
963
int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
964
int outc = channels * elempack / out_elempack;
966
top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
967
if (top_blob.empty())
970
if (activation_type == 1)
972
requantize_relu_pack4_msa(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt);
976
if (activation_type == 2 && activation_params[0] > 0.f)
978
requantize_leakyrelu_pack4_msa(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt);
982
if (out_elempack == 8)
984
if (bias_data_size == 0)
986
#pragma omp parallel for num_threads(opt.num_threads)
987
for (int q = 0; q < outc; q++)
989
const int* intptr0 = bottom_blob.channel(q * 2);
990
const int* intptr1 = bottom_blob.channel(q * 2 + 1);
991
signed char* ptr = top_blob.channel(q);
993
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0);
994
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0);
995
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0);
996
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0);
998
for (int i = 0; i < size; i++)
1000
__builtin_prefetch(intptr0 + 16);
1001
__builtin_prefetch(intptr1 + 16);
1002
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0));
1003
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0));
1004
_v0 = __msa_fmul_w(_v0, _scale_in0);
1005
_v1 = __msa_fmul_w(_v1, _scale_in1);
1006
_v0 = activation_ps(_v0, activation_type, activation_params);
1007
_v1 = activation_ps(_v1, activation_type, activation_params);
1008
_v0 = __msa_fmul_w(_v0, _scale_out0);
1009
_v1 = __msa_fmul_w(_v1, _scale_out1);
1010
*((int64_t*)ptr) = float2int8(_v0, _v1);
1020
#pragma omp parallel for num_threads(opt.num_threads)
1021
for (int q = 0; q < outc; q++)
1023
const int* intptr0 = bottom_blob.channel(q * 2);
1024
const int* intptr1 = bottom_blob.channel(q * 2 + 1);
1025
signed char* ptr = top_blob.channel(q);
1027
v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0);
1028
v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0);
1029
v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0);
1030
v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0);
1031
v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0);
1032
v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0);
1034
for (int i = 0; i < size; i++)
1036
__builtin_prefetch(intptr0 + 16);
1037
__builtin_prefetch(intptr1 + 16);
1038
v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0));
1039
v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0));
1040
_v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0);
1041
_v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1);
1042
_v0 = activation_ps(_v0, activation_type, activation_params);
1043
_v1 = activation_ps(_v1, activation_type, activation_params);
1044
_v0 = __msa_fmul_w(_v0, _scale_out0);
1045
_v1 = __msa_fmul_w(_v1, _scale_out1);
1046
*((int64_t*)ptr) = float2int8(_v0, _v1);
1055
if (out_elempack == 1)
1057
if (bias_data_size == 0)
1059
#pragma omp parallel for num_threads(opt.num_threads)
1060
for (int q = 0; q < channels; q++)
1062
const int* intptr = bottom_blob.channel(q);
1063
signed char* ptr0 = top_blob.channel(q * 4);
1064
signed char* ptr1 = top_blob.channel(q * 4 + 1);
1065
signed char* ptr2 = top_blob.channel(q * 4 + 2);
1066
signed char* ptr3 = top_blob.channel(q * 4 + 3);
1068
v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 4, 0);
1069
v4f32 _scale_out = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 4, 0);
1071
for (int i = 0; i < size; i++)
1073
__builtin_prefetch(intptr + 16);
1074
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
1075
_v = __msa_fmul_w(_v, _scale_in);
1076
_v = activation_ps(_v, activation_type, activation_params);
1077
_v = __msa_fmul_w(_v, _scale_out);
1078
v16i8 v = float2int8(_v);
1094
#pragma omp parallel for num_threads(opt.num_threads)
1095
for (int q = 0; q < channels; q++)
1097
const int* intptr = bottom_blob.channel(q);
1098
signed char* ptr0 = top_blob.channel(q * 4);
1099
signed char* ptr1 = top_blob.channel(q * 4 + 1);
1100
signed char* ptr2 = top_blob.channel(q * 4 + 2);
1101
signed char* ptr3 = top_blob.channel(q * 4 + 3);
1103
v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 4, 0);
1104
v4f32 _scale_out = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 4, 0);
1105
v4f32 _bias = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 4, 0);
1107
for (int i = 0; i < size; i++)
1109
__builtin_prefetch(intptr + 16);
1110
v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0));
1111
_v = __msa_fmadd_w(_bias, _v, _scale_in);
1112
_v = activation_ps(_v, activation_type, activation_params);
1113
_v = __msa_fmul_w(_v, _scale_out);
1114
v16i8 v = float2int8(_v);
1137
int w = bottom_blob.w;
1139
top_blob.create(w, (size_t)1u, opt.blob_allocator);
1140
if (top_blob.empty())
1143
const int* intptr = bottom_blob;
1144
signed char* ptr = top_blob;
1146
if (scale_in_data_size == 1 && scale_out_data_size == 1)
1148
const float scale_in = scale_in_data[0];
1149
const float scale_out = scale_out_data[0];
1151
if (bias_data_size == 0)
1153
#pragma omp parallel for num_threads(opt.num_threads)
1154
for (int i = 0; i < w; i++)
1156
float v = intptr[i] * scale_in;
1157
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1160
else if (bias_data_size == 1)
1162
const float bias = bias_data[0];
1164
#pragma omp parallel for num_threads(opt.num_threads)
1165
for (int i = 0; i < w; i++)
1167
float v = intptr[i] * scale_in + bias;
1168
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1173
#pragma omp parallel for num_threads(opt.num_threads)
1174
for (int i = 0; i < w; i++)
1176
float v = intptr[i] * scale_in + bias_data[i];
1177
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1181
else if (scale_in_data_size == 1 && scale_out_data_size > 1)
1183
const float scale_in = scale_in_data[0];
1185
if (bias_data_size == 0)
1187
#pragma omp parallel for num_threads(opt.num_threads)
1188
for (int i = 0; i < w; i++)
1190
float v = intptr[i] * scale_in;
1191
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1194
else if (bias_data_size == 1)
1196
const float bias = bias_data[0];
1198
#pragma omp parallel for num_threads(opt.num_threads)
1199
for (int i = 0; i < w; i++)
1201
float v = intptr[i] * scale_in + bias;
1202
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1207
#pragma omp parallel for num_threads(opt.num_threads)
1208
for (int i = 0; i < w; i++)
1210
float v = intptr[i] * scale_in + bias_data[i];
1211
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1215
else if (scale_in_data_size > 1 && scale_out_data_size == 1)
1217
const float scale_out = scale_out_data[0];
1219
if (bias_data_size == 0)
1221
#pragma omp parallel for num_threads(opt.num_threads)
1222
for (int i = 0; i < w; i++)
1224
float v = intptr[i] * scale_in_data[i];
1225
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1228
else if (bias_data_size == 1)
1230
const float bias = bias_data[0];
1232
#pragma omp parallel for num_threads(opt.num_threads)
1233
for (int i = 0; i < w; i++)
1235
float v = intptr[i] * scale_in_data[i] + bias;
1236
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1241
#pragma omp parallel for num_threads(opt.num_threads)
1242
for (int i = 0; i < w; i++)
1244
float v = intptr[i] * scale_in_data[i] + bias_data[i];
1245
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1251
if (bias_data_size == 0)
1253
#pragma omp parallel for num_threads(opt.num_threads)
1254
for (int i = 0; i < w; i++)
1256
float v = intptr[i] * scale_in_data[i];
1257
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1260
else if (bias_data_size == 1)
1262
const float bias = bias_data[0];
1264
#pragma omp parallel for num_threads(opt.num_threads)
1265
for (int i = 0; i < w; i++)
1267
float v = intptr[i] * scale_in_data[i] + bias;
1268
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1273
#pragma omp parallel for num_threads(opt.num_threads)
1274
for (int i = 0; i < w; i++)
1276
float v = intptr[i] * scale_in_data[i] + bias_data[i];
1277
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1285
int w = bottom_blob.w;
1286
int h = bottom_blob.h;
1288
top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
1289
if (top_blob.empty())
1292
if (bias_data_size == 0)
1294
#pragma omp parallel for num_threads(opt.num_threads)
1295
for (int i = 0; i < h; i++)
1297
const int* intptr = bottom_blob.row<const int>(i);
1298
signed char* ptr = top_blob.row<signed char>(i);
1300
const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
1301
const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];
1303
for (int j = 0; j < w; j++)
1305
float v = intptr[j] * scale_in;
1306
ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1312
#pragma omp parallel for num_threads(opt.num_threads)
1313
for (int i = 0; i < h; i++)
1315
const int* intptr = bottom_blob.row<const int>(i);
1316
signed char* ptr = top_blob.row<signed char>(i);
1318
const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
1319
const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];
1320
const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i];
1322
for (int j = 0; j < w; j++)
1324
float v = intptr[j] * scale_in + bias;
1325
ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1333
int w = bottom_blob.w;
1334
int h = bottom_blob.h;
1335
int channels = bottom_blob.c;
1338
top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
1339
if (top_blob.empty())
1342
if (bias_data_size == 0)
1344
#pragma omp parallel for num_threads(opt.num_threads)
1345
for (int q = 0; q < channels; q++)
1347
const int* intptr = bottom_blob.channel(q);
1348
signed char* ptr = top_blob.channel(q);
1350
const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
1351
const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];
1353
for (int i = 0; i < size; i++)
1355
float v = intptr[i] * scale_in;
1356
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1362
#pragma omp parallel for num_threads(opt.num_threads)
1363
for (int q = 0; q < channels; q++)
1365
const int* intptr = bottom_blob.channel(q);
1366
signed char* ptr = top_blob.channel(q);
1368
const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
1369
const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];
1370
const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];
1372
for (int i = 0; i < size; i++)
1374
float v = intptr[i] * scale_in + bias;
1375
ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);