#include "convolutiondepthwise.h"

#include "layer_type.h"

#include "fused_activation.h"

namespace ncnn {

ConvolutionDepthWise::ConvolutionDepthWise()
{
    one_blob_only = true;
    support_inplace = false;
}
int ConvolutionDepthWise::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_value = pd.get(18, 0.f);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    group = pd.get(7, 1);
    int8_scale_term = pd.get(8, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());
    dynamic_weight = pd.get(19, 0);

    if (dynamic_weight)
    {
        // weight is supplied as an extra input blob at runtime
        one_blob_only = false;
    }

    if (num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    if (int8_scale_term)
    {
#if NCNN_INT8
        support_int8_storage = true;
#else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
#endif
    }

    return 0;
}
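// Illustrative example (not part of this file): in an ncnn .param file, a
// 3x3 stride-2 depth-wise layer over 32 channels with bias could be declared
// with the ids parsed above, e.g.
//   ConvolutionDepthWise dw1 1 1 input output 0=32 1=3 3=2 4=1 5=1 6=288 7=32
// where 6=288 is the weight count: kernel_w * kernel_h * num_output = 3*3*32
// when group == num_output. The exact layer name and blob names are made up.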
int ConvolutionDepthWise::load_model(const ModelBin& mb)
{
    if (dynamic_weight)
        return 0;

    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }
#if NCNN_INT8
    if (int8_scale_term == 1 || int8_scale_term == 101)
    {
        weight_data_int8_scales = mb.load(group, 1);
        bottom_blob_int8_scales = mb.load(1, 1);

        // expand the single input scale to all groups
        float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
        bottom_blob_int8_scales = Mat(group);
        bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
    }
    else if (int8_scale_term == 2 || int8_scale_term == 102)
    {
        weight_data_int8_scales = mb.load(1, 1);
        bottom_blob_int8_scales = mb.load(1, 1);

        // expand the single weight scale to all groups
        float weight_data_int8_scale = weight_data_int8_scales[0];
        weight_data_int8_scales = Mat(group);
        weight_data_int8_scales.fill(weight_data_int8_scale);

        // expand the single input scale to all groups
        float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
        bottom_blob_int8_scales = Mat(group);
        bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
    }

    if (int8_scale_term > 100)
    {
        top_blob_int8_scales = mb.load(1, 1);

        // expand the single output scale to all groups
        float top_blob_int8_scale = top_blob_int8_scales[0];
        top_blob_int8_scales = Mat(group);
        top_blob_int8_scales.fill(top_blob_int8_scale);
    }
#endif // NCNN_INT8
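    // A rough reading of the int8_scale_term values handled above, inferred
    // from this loading logic: 1/101 carry per-group weight scales, 2/102
    // carry a single weight scale broadcast to all groups; values above 100
    // additionally carry an output scale, which enables int8 requantization
    // in forward_int8() below.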
#if NCNN_INT8
    if (weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        // quantize fp32 weight to int8 on the fly, one group at a time
        Mat int8_weight_data(weight_data_size, (size_t)1u);
        if (int8_weight_data.empty())
            return -100;

        const int weight_data_size_g = weight_data_size / group;

        for (int g = 0; g < group; g++)
        {
            Option opt_q;
            opt_q.num_threads = 1;
            opt_q.blob_allocator = int8_weight_data.allocator;
            opt_q.use_packing_layout = false;

            const Mat weight_data_g = weight_data.range(weight_data_size_g * g, weight_data_size_g);
            Mat int8_weight_data_g = int8_weight_data.range(weight_data_size_g * g, weight_data_size_g);
            const Mat weight_data_int8_scales_g = weight_data_int8_scales.range(g, 1);
            quantize_to_int8(weight_data_g, int8_weight_data_g, weight_data_int8_scales_g, opt_q);
        }

        weight_data = int8_weight_data;
    }
#endif // NCNN_INT8

    return 0;
}
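// For reference, the quantization above maps each fp32 weight w to roughly
// saturate(round(w * weight_data_int8_scale), -127, 127) using the per-group
// scale loaded earlier; the exact rounding behavior is whatever the Quantize
// layer invoked by quantize_to_int8() implements.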
static int convolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int bias_term = bias_data.empty() ? 0 : 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets: flat offsets of the dilated taps within an input plane
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }
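    // Worked example (illustrative): for a 3x3 kernel, dilation 1 and an
    // input plane of width w = 5, gap = 5 - 3 = 2 and space_ofs becomes
    //   { 0, 1, 2,  5, 6, 7,  10, 11, 12 }
    // i.e. the flat offsets of the nine taps relative to the window origin.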
    // depth-wise: one input channel convolved with one kernel plane per group
    if (inch == group && group == outch)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            float* outptr = top_blob.channel(g);
            const float* kptr = (const float*)weight_data + maxk * g;
            const Mat m = bottom_blob.channel(g);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                        sum = bias_data[g];

                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[space_ofs[k]];
                        float w = kptr[k];
                        sum += val * w;
                    }

                    outptr[j] = activation_ss(sum, activation_type, activation_params);
                }

                outptr += outw;
            }
        }
    }
    else
    {
        // general grouped convolution
        const int inch_g = inch / group;
        const int outch_g = outch / group;

#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else // _WIN32
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif // _WIN32
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < outch_g; p++)
            {
                float* outptr = top_blob.channel(g * outch_g + p);
                const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;

                // shadowed variable for less openmp task args
                const int outw = top_blob.w;
                const int outh = top_blob.h;

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                            sum = bias_data[outch_g * g + p];

                        const float* kptr = weight_data_ptr + maxk * inch_g * p;

                        for (int q = 0; q < inch_g; q++)
                        {
                            const Mat m = bottom_blob.channel(inch_g * g + q);
                            const float* sptr = m.row(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = sptr[space_ofs[k]];
                                float w = kptr[k];
                                sum += val * w;
                            }

                            kptr += maxk;
                        }

                        outptr[j] = activation_ss(sum, activation_type, activation_params);
                    }

                    outptr += outw;
                }
            }
        }
    }

    return 0;
}
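// Summary of the reference kernel above: each output element is the
// activated dot product
//   out(c, i, j) = act( bias(c) + sum_k in(c', i * stride_h + dy_k, j * stride_w + dx_k) * w(k) )
// where (dx_k, dy_k) are the dilated tap positions encoded in space_ofs and
// c' ranges over the input channels belonging to the same group as c.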
int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const int h = bottom_blob_bordered.h;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = (h - kernel_extent_h) / stride_h + 1;
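    // Worked example (illustrative): a 224-wide input padded by 1 on each
    // side gives w = 226; with a 3x3 kernel, dilation 1 and stride 2,
    // kernel_extent_w = 3 and outw = (226 - 3) / 2 + 1 = 112.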
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolutiondepthwise(bottom_blob_bordered, top_blob, weight_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}
int ConvolutionDepthWise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    // dynamic weight path: kernel shape and weights come in as extra blobs
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, _kernel_w, _kernel_h, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const int h = bottom_blob_bordered.h;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (_kernel_h - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = (h - kernel_extent_h) / stride_h + 1;

    top_blob.create(outw, outh, _num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolutiondepthwise(bottom_blob_bordered, top_blob, weight_data_flattened, bias_data_flattened, _kernel_w, _kernel_h, stride_w, stride_h, dilation_w, dilation_h, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}
void ConvolutionDepthWise::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    make_padding(bottom_blob, bottom_blob_bordered, kernel_w, kernel_h, opt);
}
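// Padding conventions handled below: explicit non-negative pads are applied
// as given; the sentinel -233 requests "SAME"-style auto padding with the
// extra pixel placed at the bottom/right (tensorflow SAME / onnx SAME_UPPER),
// and -234 places it at the top/left (onnx SAME_LOWER). Illustrative
// arithmetic: w = 5, kernel 3, stride 2 gives wpad = 3 + 4 / 2 * 2 - 5 = 2,
// split 1 left / 1 right in the -233 case.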
void ConvolutionDepthWise::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int _kernel_w, int _kernel_h, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (_kernel_h - 1) + 1;

    bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
    {
        // onnx padding=SAME_LOWER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
}
#if NCNN_INT8
static inline signed char float2int8(float v)
{
    int int32 = static_cast<int>(round(v));
    if (int32 > 127) return 127;
    if (int32 < -127) return -127;
    return (signed char)int32;
}
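// Note: the clamp above saturates to [-127, 127] rather than [-128, 127],
// presumably to keep the int8 range symmetric around zero so that one scale
// maps positive and negative magnitudes identically.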
int ConvolutionDepthWise::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (channels % group != 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // quantize the input to int8 first if it is not already
    Mat bottom_blob_int8 = bottom_blob;
    if (elemsize != 1)
    {
        const int channels_g = channels / group;

        // build a per-channel scale table by repeating each group scale
        Mat scales(channels);
        {
            float* ps = scales;
            for (int g = 0; g < group; g++)
            {
                float scale = bottom_blob_int8_scales[g];
                for (int q = 0; q < channels_g; q++)
                {
                    *ps++ = scale;
                }
            }
        }

        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets (same construction as the fp32 path)
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // with an output scale available the result can be requantized to int8,
    // otherwise it is dequantized back to fp32
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u : 4u;

    top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;
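    // Sketch of the int8 pipeline implemented below: accumulate val * w in
    // int32, then sumfp32 = sum * scale_in with
    //   scale_in = 1 / (bottom_blob_int8_scale * weight_data_int8_scale),
    // add the fp32 bias, apply the activation, and finally either requantize
    // with top_blob_int8_scale (int8 output) or store the fp32 value as-is.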
    // depth-wise
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            signed char* outptr = top_blob.channel(g);
            const signed char* kptr = (const signed char*)weight_data + maxk * g;
            const Mat m = bottom_blob_bordered.channel(g);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    int sum = 0;

                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        signed char val = sptr[space_ofs[k]];
                        signed char w = kptr[k];
                        sum += val * w;
                    }

                    float scale_in;
                    if (weight_data_int8_scales[g] == 0)
                        scale_in = 0;
                    else
                        scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                    float sumfp32 = sum * scale_in;

                    if (bias_term)
                        sumfp32 += bias_data[g];

                    sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                    if (use_int8_requantize)
                    {
                        // requantize
                        float scale_out = top_blob_int8_scales[g];
                        signed char sums8 = float2int8(sumfp32 * scale_out);
                        outptr[0] = sums8;
                        outptr += 1;
                    }
                    else
                    {
                        // dequantize
                        ((float*)outptr)[0] = sumfp32;
                        outptr += 4;
                    }
                }
            }
        }
    }
    else
    {
        // group convolution
        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else // _WIN32
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif // _WIN32
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < num_output_g; p++)
            {
                signed char* outptr = top_blob.channel(g * num_output_g + p);
                const signed char* weight_data_ptr = (const signed char*)weight_data + maxk * channels_g * num_output_g * g;

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        int sum = 0;

                        const signed char* kptr = weight_data_ptr + maxk * channels_g * p;

                        for (int q = 0; q < channels_g; q++)
                        {
                            const Mat m = bottom_blob_bordered.channel(channels_g * g + q);
                            const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                signed char val = sptr[space_ofs[k]];
                                signed char w = kptr[k];
                                sum += val * w;
                            }

                            kptr += maxk;
                        }

                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float sumfp32 = sum * scale_in;

                        if (bias_term)
                            sumfp32 += bias_data[g * num_output_g + p];

                        sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                        if (use_int8_requantize)
                        {
                            // requantize
                            float scale_out = top_blob_int8_scales[g];
                            signed char sums8 = float2int8(sumfp32 * scale_out);
                            outptr[0] = sums8;
                            outptr += 1;
                        }
                        else
                        {
                            // dequantize
                            ((float*)outptr)[0] = sumfp32;
                            outptr += 4;
                        }
                    }
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn