ncnn

convolution_mips.cpp
971 строка · 36.6 Кб
Перенос по словам
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#include "convolution_mips.h"
16

17
#include "benchmark.h"
18
#include "cpu.h"
19
#include "layer_type.h"
20

21
#if __mips_msa
22
#include <msa.h>
23
#endif // __mips_msa
24

25
#include "mips_activation.h"
26
#include "mips_usability.h"
27

28
#include "cpu.h"
29

30
namespace ncnn {
31

32
#include "convolution_sgemm.h"
33
#include "convolution_winograd_transform.h"
34
#include "convolution_winograd_dot.h"
35
#include "convolution_1x1.h"
36
#include "convolution_3x3.h"
37

38
#if NCNN_INT8
39
#include "convolution_sgemm_int8.h"
40
#include "convolution_winograd_transform_int8.h"
41
#include "convolution_winograd_dot_int8.h"
42
#include "convolution_1x1_int8.h"
43
#include "convolution_3x3_int8.h"
44
#include "convolution_int8.h"
45
#endif // NCNN_INT8
46

47
#if __mips_msa
48
#include "convolution_pack4.h"
49
#include "convolution_pack1to4.h"
50
#include "convolution_pack4to1.h"
51

52
#include "convolution_sgemm_pack4.h"
53
#include "convolution_sgemm_pack4to1.h"
54
#include "convolution_winograd_transform_pack4.h"
55
#include "convolution_winograd_dot_pack4.h"
56
#include "convolution_1x1_pack4.h"
57
#include "convolution_1x1_pack4to1.h"
58
#include "convolution_3x3_pack4.h"
59
#include "convolution_3x3_pack1to4.h"
60
#include "convolution_7x7_pack1to4.h"
61

62
#if NCNN_INT8
63
#include "convolution_pack8to4_int8.h"
64
#include "convolution_pack1to4_int8.h"
65
#include "convolution_pack8to1_int8.h"
66
#include "convolution_sgemm_pack8to4_int8.h"
67
#include "convolution_sgemm_pack1to4_int8.h"
68
#include "convolution_sgemm_pack8to1_int8.h"
69
#include "convolution_winograd_transform_pack4_int8.h"
70
#include "convolution_winograd_transform_pack8_int8.h"
71
#include "convolution_winograd_dot_pack8to4_int8.h"
72
#include "convolution_winograd_dot_pack8to1_int8.h"
73
#include "convolution_1x1_pack8to4_int8.h"
74
#include "convolution_1x1_pack1to4_int8.h"
75
#include "convolution_1x1_pack8to1_int8.h"
76
#include "convolution_3x3_pack8to4_int8.h"
77
#include "convolution_3x3_pack8to1_int8.h"
78
#endif // NCNN_INT8
79
#endif // __mips_msa
80

81
// Construct the MIPS convolution layer.
// The fused activation layer pointer starts out null (create_pipeline() fills
// it in), and packed layouts are advertised only when built with MSA.
Convolution_mips::Convolution_mips()
{
    // no activation layer until create_pipeline() builds one
    activation = 0;

#if __mips_msa
    support_packing = true;
#endif // __mips_msa
}
89

90
static void convolution_transform_kernel_packed_msa(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
91
{
92
    const int maxk = kernel_w * kernel_h;
93

94
    // src = kw-kh-inch-outch
95
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
96
    {
97
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
98

99
        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
100

101
        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
102
        {
103
            float* g00 = weight_data_tm.channel(q / out_elempack);
104

105
            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
106
            {
107
                for (int k = 0; k < maxk; k++)
108
                {
109
                    for (int i = 0; i < elempack; i++)
110
                    {
111
                        for (int j = 0; j < out_elempack; j++)
112
                        {
113
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);
114

115
                            g00[0] = k00[k];
116

117
                            g00++;
118
                        }
119
                    }
120
                }
121
            }
122
        }
123
    }
124
}
125

126
int Convolution_mips::create_pipeline(const Option& opt)
127
{
128
    if (dynamic_weight)
129
        return 0;
130

131
    activation = create_activation_layer(activation_type, activation_params, opt);
132

133
#if NCNN_INT8
134
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
135
    {
136
        return create_pipeline_int8_mips(opt);
137
    }
138
#endif
139

140
    const int maxk = kernel_w * kernel_h;
141
    const int num_input = weight_data_size / maxk / num_output;
142

143
    int elempack = 1;
144
    int out_elempack = 1;
145
#if __mips_msa
146
    if (opt.use_packing_layout)
147
    {
148
        elempack = num_input % 4 == 0 ? 4 : 1;
149
        out_elempack = num_output % 4 == 0 ? 4 : 1;
150
    }
151
#endif
152

153
#if __mips_msa
154
    // pack4
155
    if (elempack == 4 && out_elempack == 4)
156
    {
157
        if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
158
        {
159
            if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
160
                conv3x3s1_winograd63_transform_kernel_pack4_msa(weight_data, weight_winograd63_data, num_input, num_output, opt);
161
            else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
162
                conv3x3s1_winograd43_transform_kernel_pack4_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
163
            else // if (opt.use_winograd23_convolution)
164
                conv3x3s1_winograd23_transform_kernel_pack4_msa(weight_data, weight_winograd23_data, num_input, num_output, opt);
165
        }
166
        else
167
        {
168
            convolution_transform_kernel_packed_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
169
        }
170
    }
171

172
    // pack1ton
173
    if (elempack == 1 && out_elempack == 4)
174
    {
175
        convolution_transform_kernel_packed_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
176
    }
177

178
    // pack4to1
179
    if (elempack == 4 && out_elempack == 1)
180
    {
181
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
182
        {
183
            convolution_im2col_sgemm_transform_kernel_pack4to1_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
184
        }
185
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
186
        {
187
            convolution_im2col_sgemm_transform_kernel_pack4to1_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
188
        }
189
        else if (opt.use_sgemm_convolution)
190
        {
191
            convolution_im2col_sgemm_transform_kernel_pack4to1_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
192
        }
193
        else
194
        {
195
            convolution_transform_kernel_packed_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
196
        }
197
    }
198
#endif // __mips_msa
199

200
    // pack1
201
    if (elempack == 1 && out_elempack == 1)
202
    {
203
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
204
        {
205
            convolution_im2col_sgemm_transform_kernel_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
206
        }
207
        if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
208
        {
209
            if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution)
210
            {
211
                conv3x3s1_winograd43_transform_kernel_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
212
            }
213
            else if (opt.use_winograd23_convolution)
214
            {
215
                conv3x3s1_winograd23_transform_kernel_msa(weight_data, weight_winograd23_data, num_input, num_output, opt);
216
            }
217
        }
218
        else if (opt.use_sgemm_convolution)
219
        {
220
            convolution_im2col_sgemm_transform_kernel_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
221
        }
222
        else
223
        {
224
            weight_data_tm = weight_data;
225
        }
226
    }
227

228
    if (opt.lightmode)
229
        weight_data.release();
230

231
    return 0;
232
}
233

234
int Convolution_mips::destroy_pipeline(const Option& opt)
235
{
236
    if (activation)
237
    {
238
        activation->destroy_pipeline(opt);
239
        delete activation;
240
        activation = 0;
241
    }
242

243
    return 0;
244
}
245

246
int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
247
{
248
#if NCNN_INT8
249
    if (opt.use_int8_inference && int8_scale_term)
250
    {
251
        return forward_int8_mips(bottom_blob, top_blob, opt);
252
    }
253
#endif
254

255
    // flattened blob, implement as InnerProduct
256
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
257
    {
258
        Mat bottom_blob_3d;
259
        if (bottom_blob.elemsize % 16 == 0)
260
        {
261
            bottom_blob_3d = bottom_blob;
262
            bottom_blob_3d.dims = 3;
263
            bottom_blob_3d.w = 1;
264
            bottom_blob_3d.h = 1;
265
            bottom_blob_3d.c = bottom_blob.w;
266
            bottom_blob_3d.cstep = 1;
267
        }
268
        else
269
        {
270
            bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator);
271
        }
272

273
        Mat top_blob_3d;
274
        int ret = forward(bottom_blob_3d, top_blob_3d, opt);
275
        if (ret != 0)
276
            return ret;
277

278
        if (top_blob_3d.elemsize % 16 == 0)
279
        {
280
            top_blob = top_blob_3d;
281
            top_blob.dims = 1;
282
            top_blob.w = top_blob_3d.c;
283
            top_blob.h = 1;
284
            top_blob.c = 1;
285
            bottom_blob_3d.cstep = top_blob_3d.c;
286
        }
287
        else
288
        {
289
            top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator);
290
        }
291

292
        return 0;
293
    }
294

295
    int w = bottom_blob.w;
296
    int h = bottom_blob.h;
297
    int channels = bottom_blob.c;
298
    size_t elemsize = bottom_blob.elemsize;
299
    int elempack = bottom_blob.elempack;
300

301
    //     NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
302

303
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
304
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
305

306
    Mat bottom_blob_bordered;
307
    make_padding(bottom_blob, bottom_blob_bordered, opt);
308
    if (bottom_blob_bordered.empty())
309
        return -100;
310

311
    w = bottom_blob_bordered.w;
312
    h = bottom_blob_bordered.h;
313

314
    int outw = (w - kernel_extent_w) / stride_w + 1;
315
    int outh = (h - kernel_extent_h) / stride_h + 1;
316
    int out_elempack = 1;
317
#if __mips_msa
318
    if (opt.use_packing_layout)
319
    {
320
        out_elempack = num_output % 4 == 0 ? 4 : 1;
321
    }
322
#endif
323
    size_t out_elemsize = elemsize / elempack * out_elempack;
324

325
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
326
    if (top_blob.empty())
327
        return -100;
328

329
    const int num_input = channels * elempack;
330

331
#if __mips_msa
332
    if (elempack == 4 && out_elempack == 4)
333
    {
334
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
335
        {
336
            conv1x1s1_sgemm_pack4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
337

338
            if (activation)
339
            {
340
                activation->forward_inplace(top_blob, opt);
341
            }
342
        }
343
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
344
        {
345
            conv1x1s2_sgemm_pack4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
346

347
            if (activation)
348
            {
349
                activation->forward_inplace(top_blob, opt);
350
            }
351
        }
352
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
353
        {
354
            if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
355
                conv3x3s1_winograd63_pack4_msa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt);
356
            else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
357
                conv3x3s1_winograd43_pack4_msa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
358
            else // if (opt.use_winograd23_convolution)
359
                conv3x3s1_winograd23_pack4_msa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);
360

361
            if (activation)
362
            {
363
                activation->forward_inplace(top_blob, opt);
364
            }
365
        }
366
        else if (opt.use_sgemm_convolution)
367
        {
368
            convolution_im2col_sgemm_pack4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
369

370
            if (activation)
371
            {
372
                activation->forward_inplace(top_blob, opt);
373
            }
374
        }
375
        else
376
        {
377
            convolution_pack4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
378
        }
379
    }
380

381
    if (elempack == 1 && out_elempack == 4)
382
    {
383
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
384
        {
385
            conv3x3s1_pack1to4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
386

387
            if (activation)
388
            {
389
                activation->forward_inplace(top_blob, opt);
390
            }
391
        }
392
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
393
        {
394
            conv3x3s2_pack1to4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
395

396
            if (activation)
397
            {
398
                activation->forward_inplace(top_blob, opt);
399
            }
400
        }
401
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
402
        {
403
            conv7x7s2_pack1to4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
404

405
            if (activation)
406
            {
407
                activation->forward_inplace(top_blob, opt);
408
            }
409
        }
410
        else
411
        {
412
            convolution_pack1to4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
413
        }
414
    }
415

416
    if (elempack == 4 && out_elempack == 1)
417
    {
418
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
419
        {
420
            conv1x1s1_sgemm_pack4to1_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);
421

422
            if (activation)
423
            {
424
                activation->forward_inplace(top_blob, opt);
425
            }
426
        }
427
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
428
        {
429
            conv1x1s2_sgemm_pack4to1_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);
430

431
            if (activation)
432
            {
433
                activation->forward_inplace(top_blob, opt);
434
            }
435
        }
436
        else if (opt.use_sgemm_convolution)
437
        {
438
            convolution_im2col_sgemm_pack4to1_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
439

440
            if (activation)
441
            {
442
                activation->forward_inplace(top_blob, opt);
443
            }
444
        }
445
        else
446
        {
447
            convolution_pack4to1_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
448
        }
449
    }
450
#endif // __mips_msa
451

452
    if (elempack == 1 && out_elempack == 1)
453
    {
454
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
455
        {
456
            conv1x1s1_sgemm_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);
457

458
            if (activation)
459
            {
460
                activation->forward_inplace(top_blob, opt);
461
            }
462
        }
463
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
464
        {
465
            if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution)
466
            {
467
                conv3x3s1_winograd43_msa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
468
            }
469
            else if (opt.use_winograd23_convolution)
470
            {
471
                conv3x3s1_winograd23_msa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);
472
            }
473

474
            if (activation)
475
            {
476
                activation->forward_inplace(top_blob, opt);
477
            }
478
        }
479
        else if (opt.use_sgemm_convolution)
480
        {
481
            convolution_im2col_sgemm_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
482

483
            if (activation)
484
            {
485
                activation->forward_inplace(top_blob, opt);
486
            }
487
        }
488
        else
489
        {
490
            const int maxk = kernel_w * kernel_h;
491

492
            // kernel offsets
493
            std::vector<int> _space_ofs(maxk);
494
            int* space_ofs = &_space_ofs[0];
495
            {
496
                int p1 = 0;
497
                int p2 = 0;
498
                int gap = w * dilation_h - kernel_w * dilation_w;
499
                for (int i = 0; i < kernel_h; i++)
500
                {
501
                    for (int j = 0; j < kernel_w; j++)
502
                    {
503
                        space_ofs[p1] = p2;
504
                        p1++;
505
                        p2 += dilation_w;
506
                    }
507
                    p2 += gap;
508
                }
509
            }
510

511
            // num_output
512
            #pragma omp parallel for num_threads(opt.num_threads)
513
            for (int p = 0; p < num_output; p++)
514
            {
515
                float* outptr = top_blob.channel(p);
516

517
                for (int i = 0; i < outh; i++)
518
                {
519
                    for (int j = 0; j < outw; j++)
520
                    {
521
                        float sum = 0.f;
522

523
                        if (bias_term)
524
                        {
525
                            sum = bias_data[p];
526
                        }
527

528
                        const float* kptr = (const float*)weight_data_tm + maxk * channels * p;
529

530
                        // channels
531
                        for (int q = 0; q < channels; q++)
532
                        {
533
                            const Mat m = bottom_blob_bordered.channel(q);
534
                            const float* sptr = m.row(i * stride_h) + j * stride_w;
535

536
                            for (int k = 0; k < maxk; k++)
537
                            {
538
                                float val = sptr[space_ofs[k]];
539
                                float wt = kptr[k];
540
                                sum += val * wt;
541
                            }
542

543
                            kptr += maxk;
544
                        }
545

546
                        sum = activation_ss(sum, activation_type, activation_params);
547

548
                        outptr[j] = sum;
549
                    }
550

551
                    outptr += outw;
552
                }
553
            }
554
        }
555
    }
556

557
    return 0;
558
}
559

560
int Convolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
561
{
562
    const Mat& bottom_blob = bottom_blobs[0];
563
    const Mat& _weight_data = bottom_blobs[1];
564
    Mat& top_blob = top_blobs[0];
565

566
    const int _kernel_w = _weight_data.w;
567
    const int _kernel_h = _weight_data.h;
568
    const int _num_output = _weight_data.c * _weight_data.elempack;
569

570
    Mat weight_data_flattened;
571
    flatten(_weight_data, weight_data_flattened, opt);
572
    if (weight_data_flattened.empty())
573
        return -100;
574

575
    // weight_data_flattened as pack1
576
    weight_data_flattened.w *= weight_data_flattened.elempack;
577
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
578
    weight_data_flattened.elempack = 1;
579

580
    Mat bias_data_flattened;
581
    if (bias_term)
582
    {
583
        const Mat& _bias_data = bottom_blobs[2];
584
        flatten(_bias_data, bias_data_flattened, opt);
585
        if (bias_data_flattened.empty())
586
            return -100;
587

588
        // bias_data_flattened as pack1
589
        bias_data_flattened.w *= bias_data_flattened.elempack;
590
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
591
        bias_data_flattened.elempack = 1;
592
    }
593

594
    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
595

596
    ncnn::ParamDict pd;
597
    pd.set(0, _num_output);
598
    pd.set(1, _kernel_w);
599
    pd.set(11, _kernel_h);
600
    pd.set(2, dilation_w);
601
    pd.set(12, dilation_h);
602
    pd.set(3, stride_w);
603
    pd.set(13, stride_h);
604
    pd.set(4, pad_left);
605
    pd.set(15, pad_right);
606
    pd.set(14, pad_top);
607
    pd.set(16, pad_bottom);
608
    pd.set(18, pad_value);
609
    pd.set(5, bias_term);
610
    pd.set(6, weight_data_flattened.w);
611
    pd.set(8, int8_scale_term);
612
    pd.set(9, activation_type);
613
    pd.set(10, activation_params);
614

615
    op->load_param(pd);
616

617
    ncnn::Mat weights[2];
618
    weights[0] = weight_data_flattened;
619
    weights[1] = bias_data_flattened;
620

621
    op->load_model(ncnn::ModelBinFromMatArray(weights));
622

623
    op->create_pipeline(opt);
624

625
    op->forward(bottom_blob, top_blob, opt);
626

627
    op->destroy_pipeline(opt);
628

629
    delete op;
630

631
    return 0;
632
}
633

634
#if NCNN_INT8
635
static void convolution_transform_kernel_packed_int8_msa(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
636
{
637
    const int maxk = kernel_w * kernel_h;
638

639
    // src = kw-kh-inch-outch
640
    // dst = pa-pb-kw-kh-inch/pa-outch/pb
641
    {
642
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
643

644
        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);
645

646
        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
647
        {
648
            signed char* g00 = weight_data_tm.channel(q / out_elempack);
649

650
            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
651
            {
652
                for (int k = 0; k < maxk; k++)
653
                {
654
                    for (int i = 0; i < out_elempack; i++)
655
                    {
656
                        for (int j = 0; j < elempack; j++)
657
                        {
658
                            const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);
659

660
                            g00[0] = k00[k];
661

662
                            g00++;
663
                        }
664
                    }
665
                }
666
            }
667
        }
668
    }
669
}
670

671
int Convolution_mips::create_pipeline_int8_mips(const Option& opt)
672
{
673
    const int maxk = kernel_w * kernel_h;
674
    const int num_input = weight_data_size / maxk / num_output;
675

676
    int elempack = 1;
677
    int out_elempack = 1;
678
#if __mips_msa
679
    if (opt.use_packing_layout)
680
    {
681
        elempack = num_input % 8 == 0 ? 8 : 1;
682
        out_elempack = num_output % 4 == 0 ? 4 : 1;
683
    }
684
#endif // __mips_msa
685

686
#if __mips_msa
687
    if (elempack == 8 && out_elempack == 4)
688
    {
689
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
690
        {
691
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
692
        }
693
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
694
        {
695
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
696
        }
697
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
698
        {
699
            conv3x3s1_winograd43_transform_kernel_pack8to4_int8_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
700
        }
701
        else if (opt.use_sgemm_convolution)
702
        {
703
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
704
        }
705
        else
706
        {
707
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
708
        }
709
    }
710

711
    if (elempack == 1 && out_elempack == 4)
712
    {
713
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
714
        {
715
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
716
        }
717
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
718
        {
719
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
720
        }
721
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
722
        {
723
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
724
        }
725
        else
726
        {
727
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
728
        }
729
    }
730

731
    if (elempack == 8 && out_elempack == 1)
732
    {
733
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
734
        {
735
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
736
        }
737
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
738
        {
739
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
740
        }
741
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
742
        {
743
            conv3x3s1_winograd43_transform_kernel_pack8to1_int8_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
744
        }
745
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
746
        {
747
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
748
        }
749
        else
750
        {
751
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
752
        }
753
    }
754
#endif // __mips_msa
755

756
    if (elempack == 1 && out_elempack == 1)
757
    {
758
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
759
        {
760
            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
761
        }
762
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
763
        {
764
            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
765
        }
766
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
767
        {
768
            conv3x3s1_winograd43_transform_kernel_int8_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
769
        }
770
        else if (opt.use_sgemm_convolution)
771
        {
772
            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
773
        }
774
        else
775
        {
776
            weight_data_tm = weight_data;
777
        }
778
    }
779

780
    scale_in_data.create(num_output);
781
    for (int p = 0; p < num_output; p++)
782
    {
783
        // requantize and relu
784
        float scale_in;
785
        if (weight_data_int8_scales[p] == 0)
786
            scale_in = 0;
787
        else
788
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
789

790
        scale_in_data[p] = scale_in;
791
    }
792

793
    if (opt.lightmode)
794
        weight_data.release();
795

796
    return 0;
797
}
798

799
int Convolution_mips::forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
800
{
801
    int elembits = bottom_blob.elembits();
802

803
    Mat bottom_blob_int8 = bottom_blob;
804
    if (elembits != 8)
805
    {
806
        Option opt_q = opt;
807
        opt_q.blob_allocator = opt.workspace_allocator;
808
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
809
    }
810

811
    Mat bottom_blob_bordered;
812
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
813
    if (bottom_blob_bordered.empty())
814
        return -100;
815

816
    int w = bottom_blob_bordered.w;
817
    int h = bottom_blob_bordered.h;
818
    int channels = bottom_blob_bordered.c;
819
    int elempack = bottom_blob_bordered.elempack;
820

821
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
822
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
823

824
    int outw = (w - kernel_extent_w) / stride_w + 1;
825
    int outh = (h - kernel_extent_h) / stride_h + 1;
826

827
    bool use_int8_requantize = int8_scale_term > 100;
828
    int out_elempack = 1;
829
#if __mips_msa
830
    if (opt.use_packing_layout)
831
    {
832
        if (use_int8_requantize)
833
            out_elempack = num_output % 8 == 0 ? 8 : 1;
834
        else
835
            out_elempack = num_output % 4 == 0 ? 4 : 1;
836
    }
837
#endif // __mips_msa
838
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
839

840
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
841
    if (top_blob.empty())
842
        return -100;
843

844
    const int num_input = channels * elempack;
845

846
    int out_elempack_int32 = 1;
847
#if __mips_msa
848
    if (opt.use_packing_layout)
849
    {
850
        out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
851
    }
852
#endif // __mips_msa
853

854
    Mat top_blob_int32;
855
    top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
856
    if (top_blob_int32.empty())
857
        return -100;
858

859
#if __mips_msa
860
    if (elempack == 8 && out_elempack_int32 == 4)
861
    {
862
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
863
        {
864
            conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
865
        }
866
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
867
        {
868
            conv1x1s2_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
869
        }
870
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
871
        {
872
            conv3x3s1_winograd43_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
873
        }
874
        else if (opt.use_sgemm_convolution)
875
        {
876
            convolution_im2col_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
877
        }
878
        else
879
        {
880
            convolution_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
881
        }
882
    }
883

884
    if (elempack == 1 && out_elempack_int32 == 4)
885
    {
886
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
887
        {
888
            conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
889
        }
890
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
891
        {
892
            conv1x1s2_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
893
        }
894
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
895
        {
896
            convolution_im2col_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
897
        }
898
        else
899
        {
900
            convolution_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
901
        }
902
    }
903

904
    if (elempack == 8 && out_elempack_int32 == 1)
905
    {
906
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
907
        {
908
            conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
909
        }
910
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
911
        {
912
            conv1x1s2_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
913
        }
914
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
915
        {
916
            conv3x3s1_winograd43_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
917
        }
918
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
919
        {
920
            convolution_im2col_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
921
        }
922
        else
923
        {
924
            convolution_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
925
        }
926
    }
927
#endif // __mips_msa
928

929
    if (elempack == 1 && out_elempack_int32 == 1)
930
    {
931
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
932
        {
933
            conv1x1s1_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
934
        }
935
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
936
        {
937
            conv1x1s2_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
938
        }
939
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
940
        {
941
            conv3x3s1_winograd43_int8_msa(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
942
        }
943
        else if (opt.use_sgemm_convolution)
944
        {
945
            convolution_im2col_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
946
        }
947
        else
948
        {
949
            convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
950
        }
951
    }
952

953
    if (use_int8_requantize)
954
    {
955
        requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
956
    }
957
    else
958
    {
959
        dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);
960

961
        if (activation)
962
        {
963
            activation->forward_inplace(top_blob, opt);
964
        }
965
    }
966

967
    return 0;
968
}
969
#endif // NCNN_INT8
970

971
} // namespace ncnn
972
ncnn

Использование cookies