ncnn

convolutiondepthwise_riscv.cpp
1160 строк · 36.9 Кб
Перенос по словам
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#include "convolutiondepthwise_riscv.h"
16

17
#include "cpu.h"
18
#include "layer_type.h"
19

20
#if __riscv_vector
21
#include <riscv_vector.h>
22
#endif // __riscv_vector
23

24
#include "riscv_activation.h"
25
#include "riscv_usability.h"
26

27
namespace ncnn {
28

29
#include "convolutiondepthwise_3x3.h"
30

31
#if __riscv_vector
32
#include "convolutiondepthwise_3x3_packn.h"
33
#include "convolutiondepthwise_5x5_packn.h"
34

35
#if __riscv_zfh
36
#include "convolutiondepthwise_3x3_packn_fp16s.h"
37
#include "convolutiondepthwise_5x5_packn_fp16s.h"
38
#endif
39
#endif // __riscv_vector
40

41
ConvolutionDepthWise_riscv::ConvolutionDepthWise_riscv()
42
{
43
#if __riscv_vector
44
    support_packing = true;
45
#if __riscv_zfh
46
    support_fp16_storage = true;
47
#endif
48
#endif // __riscv_vector
49

50
    activation = 0;
51
}
52

53
int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
54
{
55
    if (dynamic_weight)
56
        return 0;
57

58
    activation = create_activation_layer(activation_type, activation_params, opt);
59

60
#if NCNN_INT8
61
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
62
    {
63
        // TODO implement int8
64
        return 0;
65
    }
66
#endif
67

68
#if __riscv_vector && __riscv_zfh
69
    if (opt.use_fp16_storage)
70
    {
71
        return create_pipeline_fp16s(opt);
72
    }
73
#endif
74

75
#if __riscv_vector
76
    const int packn = csrr_vlenb() / 4;
77
#endif
78

79
    const int maxk = kernel_w * kernel_h;
80
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
81

82
    // depth-wise
83
    if (channels == group && group == num_output)
84
    {
85
        int elempack = 1;
86
#if __riscv_vector
87
        if (opt.use_packing_layout)
88
        {
89
            elempack = channels % packn == 0 ? packn : 1;
90
        }
91
#endif
92

93
#if __riscv_vector
94
        // packn
95
        if (elempack == packn)
96
        {
97
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
98
            convert_packing(weight_data_r2, weight_data_tm, packn, opt);
99
        }
100
#endif // __riscv_vector
101

102
        if (elempack == 1)
103
        {
104
            weight_data_tm = weight_data;
105
        }
106

107
        if (opt.lightmode)
108
            weight_data.release();
109

110
        return 0;
111
    }
112

113
    // group convolution
114
    create_group_ops(opt);
115

116
    if (opt.lightmode)
117
        weight_data.release();
118

119
    return 0;
120
}
121

122
int ConvolutionDepthWise_riscv::create_group_ops(const Option& opt)
123
{
124
    // create Convolution op for each group
125
    const int maxk = kernel_w * kernel_h;
126
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
127

128
    for (int i = 0; i < (int)group_ops.size(); i++)
129
        delete group_ops[i];
130

131
    group_ops.clear();
132

133
    const int channels_g = channels / group;
134
    const int num_output_g = num_output / group;
135

136
    group_ops.resize(group);
137

138
    for (int g = 0; g < group; g++)
139
    {
140
        Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
141
        Mat bias_data_g;
142
        if (bias_term)
143
            bias_data_g = bias_data.range(num_output_g * g, num_output_g);
144

145
        ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
146

147
        // set param
148
        ncnn::ParamDict pd;
149
        pd.set(0, num_output_g); // num_output
150
        pd.set(1, kernel_w);
151
        pd.set(11, kernel_h);
152
        pd.set(2, dilation_w);
153
        pd.set(12, dilation_h);
154
        pd.set(3, stride_w);
155
        pd.set(13, stride_h);
156
        pd.set(4, 0);  // pad_w
157
        pd.set(14, 0); // pad_h
158
        pd.set(5, bias_term);
159
        pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
160
        pd.set(8, int8_scale_term);
161
        pd.set(9, activation_type);
162
        pd.set(10, activation_params);
163

164
        op->load_param(pd);
165

166
        // set weights
167
        if (bias_term)
168
        {
169
            ncnn::Mat weights[5];
170
            weights[0] = weight_data_g;
171
            weights[1] = bias_data_g;
172

173
#if NCNN_INT8
174
            if (int8_scale_term)
175
            {
176
                Mat weight_data_int8_scales_g(num_output_g);
177
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
178
                weights[2] = weight_data_int8_scales_g;
179
                weights[3] = bottom_blob_int8_scales.range(g, 1);
180
            }
181
            if (int8_scale_term > 100)
182
            {
183
                weights[4] = top_blob_int8_scales.range(g, 1);
184
            }
185
#endif
186

187
            op->load_model(ModelBinFromMatArray(weights));
188
        }
189
        else
190
        {
191
            ncnn::Mat weights[4];
192
            weights[0] = weight_data_g;
193

194
#if NCNN_INT8
195
            if (int8_scale_term)
196
            {
197
                Mat weight_data_int8_scales_g(num_output_g);
198
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
199
                weights[1] = weight_data_int8_scales_g;
200
                weights[2] = bottom_blob_int8_scales.range(g, 1);
201
            }
202
            if (int8_scale_term > 100)
203
            {
204
                weights[3] = top_blob_int8_scales.range(g, 1);
205
            }
206
#endif
207

208
            op->load_model(ModelBinFromMatArray(weights));
209
        }
210

211
        op->create_pipeline(opt);
212

213
        group_ops[g] = op;
214
    }
215

216
    return 0;
217
}
218

219
int ConvolutionDepthWise_riscv::destroy_pipeline(const Option& opt)
220
{
221
    if (activation)
222
    {
223
        activation->destroy_pipeline(opt);
224
        delete activation;
225
        activation = 0;
226
    }
227

228
    for (int i = 0; i < (int)group_ops.size(); i++)
229
    {
230
        group_ops[i]->destroy_pipeline(opt);
231
        delete group_ops[i];
232
    }
233
    group_ops.clear();
234

235
    return 0;
236
}
237

238
int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
239
{
240
#if NCNN_INT8
241
    if (opt.use_int8_inference && int8_scale_term)
242
    {
243
        Mat bottom_blob_unpacked = bottom_blob;
244
        if (bottom_blob.elempack != 1)
245
        {
246
            Option opt_pack1 = opt;
247
            opt_pack1.blob_allocator = opt.workspace_allocator;
248

249
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
250
        }
251

252
        Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked;
253
        if (bottom_blob_unpacked.elembits() == 16)
254
        {
255
            Option opt_pack1 = opt;
256
            opt_pack1.blob_allocator = opt.workspace_allocator;
257

258
            cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1);
259
        }
260

261
        Option opt_unpacked = opt;
262
        opt_unpacked.use_packing_layout = false;
263
        return ConvolutionDepthWise::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked);
264
    }
265
#endif
266

267
    int elembits = bottom_blob.elembits();
268

269
#if __riscv_vector && __riscv_zfh
270
    if (opt.use_fp16_storage && elembits == 16)
271
    {
272
        if (opt.use_fp16_arithmetic)
273
            return forward_fp16sa(bottom_blob, top_blob, opt);
274
        else
275
            return forward_fp16s(bottom_blob, top_blob, opt);
276
    }
277
#endif
278

279
#if __riscv_vector
280
    const int packn = csrr_vlenb() / 4;
281
    const size_t vl = vsetvl_e32m1(packn);
282
#endif
283

284
    int w = bottom_blob.w;
285
    int h = bottom_blob.h;
286
    int channels = bottom_blob.c;
287
    size_t elemsize = bottom_blob.elemsize;
288
    int elempack = bottom_blob.elempack;
289

290
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
291
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
292

293
    Mat bottom_blob_bordered;
294
    make_padding(bottom_blob, bottom_blob_bordered, opt);
295
    if (bottom_blob_bordered.empty())
296
        return -100;
297

298
    w = bottom_blob_bordered.w;
299
    h = bottom_blob_bordered.h;
300

301
    int outw = (w - kernel_extent_w) / stride_w + 1;
302
    int outh = (h - kernel_extent_h) / stride_h + 1;
303
    int out_elempack = 1;
304
#if __riscv_vector
305
    if (opt.use_packing_layout)
306
    {
307
        out_elempack = num_output % packn == 0 ? packn : 1;
308
    }
309
#endif
310
    size_t out_elemsize = elemsize / elempack * out_elempack;
311

312
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
313
    if (top_blob.empty())
314
        return -100;
315

316
    // depth-wise
317
    if (channels * elempack == group && group == num_output)
318
    {
319
#if __riscv_vector
320
        if (elempack == packn)
321
        {
322
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
323
            {
324
                convdw3x3s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
325

326
                if (activation)
327
                {
328
                    activation->forward_inplace(top_blob, opt);
329
                }
330
            }
331
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
332
            {
333
                convdw3x3s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
334

335
                if (activation)
336
                {
337
                    activation->forward_inplace(top_blob, opt);
338
                }
339
            }
340
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
341
            {
342
                convdw5x5s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
343

344
                if (activation)
345
                {
346
                    activation->forward_inplace(top_blob, opt);
347
                }
348
            }
349
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
350
            {
351
                convdw5x5s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
352

353
                if (activation)
354
                {
355
                    activation->forward_inplace(top_blob, opt);
356
                }
357
            }
358
            else
359
            {
360
                const int maxk = kernel_w * kernel_h;
361

362
                // kernel offsets
363
                std::vector<int> _space_ofs(maxk);
364
                int* space_ofs = &_space_ofs[0];
365
                {
366
                    int p1 = 0;
367
                    int p2 = 0;
368
                    int gap = w * dilation_h - kernel_w * dilation_w;
369
                    for (int i = 0; i < kernel_h; i++)
370
                    {
371
                        for (int j = 0; j < kernel_w; j++)
372
                        {
373
                            space_ofs[p1] = p2;
374
                            p1++;
375
                            p2 += dilation_w;
376
                        }
377
                        p2 += gap;
378
                    }
379
                }
380

381
                #pragma omp parallel for num_threads(opt.num_threads)
382
                for (int g = 0; g < channels; g++)
383
                {
384
                    float* outptr = top_blob.channel(g);
385
                    const float* kptr = (const float*)weight_data_tm + maxk * g * packn;
386
                    const Mat m = bottom_blob_bordered.channel(g);
387

388
                    for (int i = 0; i < outh; i++)
389
                    {
390
                        for (int j = 0; j < outw; j++)
391
                        {
392
                            vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
393

394
                            if (bias_term)
395
                            {
396
                                _sum = vle32_v_f32m1((const float*)bias_data + g * packn, vl);
397
                            }
398

399
                            const float* sptr = m.row(i * stride_h) + j * stride_w * packn;
400

401
                            for (int k = 0; k < maxk; k++)
402
                            {
403
                                vfloat32m1_t _val = vle32_v_f32m1(sptr + space_ofs[k] * packn, vl);
404
                                vfloat32m1_t _w = vle32_v_f32m1(kptr + k * packn, vl);
405
                                _sum = vfmacc_vv_f32m1(_sum, _val, _w, vl);
406
                            }
407

408
                            _sum = activation_ps(_sum, activation_type, activation_params, vl);
409

410
                            vse32_v_f32m1(outptr + j * packn, _sum, vl);
411
                        }
412

413
                        outptr += outw * packn;
414
                    }
415
                }
416
            }
417
        }
418
#endif // __riscv_vector
419

420
        if (elempack == 1)
421
        {
422
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
423
            {
424
                convdw3x3s1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
425

426
                if (activation)
427
                {
428
                    activation->forward_inplace(top_blob, opt);
429
                }
430
            }
431
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
432
            {
433
                convdw3x3s2_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
434

435
                if (activation)
436
                {
437
                    activation->forward_inplace(top_blob, opt);
438
                }
439
            }
440
            else
441
            {
442
                const int maxk = kernel_w * kernel_h;
443

444
                // kernel offsets
445
                std::vector<int> _space_ofs(maxk);
446
                int* space_ofs = &_space_ofs[0];
447
                {
448
                    int p1 = 0;
449
                    int p2 = 0;
450
                    int gap = w * dilation_h - kernel_w * dilation_w;
451
                    for (int i = 0; i < kernel_h; i++)
452
                    {
453
                        for (int j = 0; j < kernel_w; j++)
454
                        {
455
                            space_ofs[p1] = p2;
456
                            p1++;
457
                            p2 += dilation_w;
458
                        }
459
                        p2 += gap;
460
                    }
461
                }
462

463
                #pragma omp parallel for num_threads(opt.num_threads)
464
                for (int g = 0; g < group; g++)
465
                {
466
                    float* outptr = top_blob.channel(g);
467
                    const float* kptr = (const float*)weight_data_tm + maxk * g;
468
                    const Mat m = bottom_blob_bordered.channel(g);
469

470
                    for (int i = 0; i < outh; i++)
471
                    {
472
                        for (int j = 0; j < outw; j++)
473
                        {
474
                            float sum = 0.f;
475

476
                            if (bias_term)
477
                                sum = bias_data[g];
478

479
                            const float* sptr = m.row(i * stride_h) + j * stride_w;
480

481
                            for (int k = 0; k < maxk; k++)
482
                            {
483
                                float val = (float)sptr[space_ofs[k]];
484
                                float w = (float)kptr[k];
485
                                sum += val * w;
486
                            }
487

488
                            sum = activation_ss(sum, activation_type, activation_params);
489

490
                            outptr[j] = sum;
491
                        }
492

493
                        outptr += outw;
494
                    }
495
                }
496
            }
497
        }
498

499
        return 0;
500
    }
501

502
    // group convolution
503
    const int channels_g = channels * elempack / group;
504
    const int num_output_g = num_output / group;
505

506
    int g_elempack = 1;
507
    int out_g_elempack = 1;
508
#if __riscv_vector
509
    if (opt.use_packing_layout)
510
    {
511
        g_elempack = channels_g % packn == 0 ? packn : 1;
512
        out_g_elempack = num_output_g % packn == 0 ? packn : 1;
513
    }
514
#endif
515

516
    // unpacking
517
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
518
    if (elempack > g_elempack)
519
    {
520
        Option opt_p = opt;
521
        opt_p.blob_allocator = opt.workspace_allocator;
522
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
523
    }
524

525
    Mat top_blob_unpacked = top_blob;
526
    if (out_g_elempack < out_elempack)
527
    {
528
        top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
529
        if (top_blob_unpacked.empty())
530
            return -100;
531
    }
532

533
    for (int g = 0; g < group; g++)
534
    {
535
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
536
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
537

538
        const ncnn::Layer* op = group_ops[g];
539

540
        Option opt_g = opt;
541
        opt_g.blob_allocator = top_blob_unpacked.allocator;
542

543
        // forward
544
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
545
    }
546

547
    // packing
548
    if (out_g_elempack < out_elempack)
549
    {
550
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
551
    }
552
    else
553
    {
554
        top_blob = top_blob_unpacked;
555
    }
556

557
    return 0;
558
}
559

560
int ConvolutionDepthWise_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
561
{
562
    const Mat& bottom_blob = bottom_blobs[0];
563
    const Mat& _weight_data = bottom_blobs[1];
564
    Mat& top_blob = top_blobs[0];
565

566
    const int _kernel_w = _weight_data.w;
567
    const int _kernel_h = _weight_data.h;
568
    const int _num_output = _weight_data.c * _weight_data.elempack;
569

570
    Mat weight_data_flattened;
571
    flatten(_weight_data, weight_data_flattened, opt);
572
    if (weight_data_flattened.empty())
573
        return -100;
574

575
#if NCNN_RVV
576
    if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16)
577
    {
578
        Mat weight_data_flattened_fp32;
579
        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
580
        weight_data_flattened = weight_data_flattened_fp32;
581
    }
582
#endif // NCNN_RVV
583

584
    // weight_data_flattened as pack1
585
    weight_data_flattened.w *= weight_data_flattened.elempack;
586
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
587
    weight_data_flattened.elempack = 1;
588

589
    Mat bias_data_flattened;
590
    if (bias_term)
591
    {
592
        const Mat& _bias_data = bottom_blobs[2];
593
        flatten(_bias_data, bias_data_flattened, opt);
594
        if (bias_data_flattened.empty())
595
            return -100;
596

597
#if NCNN_RVV
598
        if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16)
599
        {
600
            Mat bias_data_flattened_fp32;
601
            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
602
            bias_data_flattened = bias_data_flattened_fp32;
603
        }
604
#endif // NCNN_RVV
605

606
        // bias_data_flattened as pack1
607
        bias_data_flattened.w *= bias_data_flattened.elempack;
608
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
609
        bias_data_flattened.elempack = 1;
610
    }
611

612
    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
613

614
    ncnn::ParamDict pd;
615
    pd.set(0, _num_output);
616
    pd.set(1, _kernel_w);
617
    pd.set(11, _kernel_h);
618
    pd.set(2, dilation_w);
619
    pd.set(12, dilation_h);
620
    pd.set(3, stride_w);
621
    pd.set(13, stride_h);
622
    pd.set(4, pad_left);
623
    pd.set(15, pad_right);
624
    pd.set(14, pad_top);
625
    pd.set(16, pad_bottom);
626
    pd.set(18, pad_value);
627
    pd.set(5, bias_term);
628
    pd.set(6, weight_data_flattened.w);
629
    pd.set(7, group);
630
    pd.set(8, int8_scale_term);
631
    pd.set(9, activation_type);
632
    pd.set(10, activation_params);
633

634
    op->load_param(pd);
635

636
    ncnn::Mat weights[2];
637
    weights[0] = weight_data_flattened;
638
    weights[1] = bias_data_flattened;
639

640
    op->load_model(ncnn::ModelBinFromMatArray(weights));
641

642
    op->create_pipeline(opt);
643

644
    op->forward(bottom_blob, top_blob, opt);
645

646
    op->destroy_pipeline(opt);
647

648
    delete op;
649

650
    return 0;
651
}
652

653
#if __riscv_vector && __riscv_zfh
654
int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
655
{
656
    const int packn = csrr_vlenb() / 2;
657

658
    const int maxk = kernel_w * kernel_h;
659
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
660

661
    // depth-wise
662
    if (channels == group && group == num_output)
663
    {
664
        int elempack = 1;
665
        if (opt.use_packing_layout)
666
        {
667
            elempack = channels % packn == 0 ? packn : 1;
668
        }
669

670
        // packn
671
        if (elempack == packn)
672
        {
673
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
674
            Mat weight_data_r2_packed;
675
            convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt);
676

677
            ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
678
        }
679

680
        if (elempack == 1)
681
        {
682
            ncnn::cast_float32_to_float16(weight_data, weight_data_tm, opt);
683
        }
684

685
        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
686

687
        if (opt.lightmode)
688
            weight_data.release();
689

690
        return 0;
691
    }
692

693
    // group convolution
694
    create_group_ops(opt);
695

696
    if (opt.lightmode)
697
        weight_data.release();
698

699
    return 0;
700
}
701

702
int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
703
{
704
    const int packn = csrr_vlenb() / 2;
705
    const size_t vl = vsetvl_e16m1(packn);
706

707
    int w = bottom_blob.w;
708
    int h = bottom_blob.h;
709
    int channels = bottom_blob.c;
710
    size_t elemsize = bottom_blob.elemsize;
711
    int elempack = bottom_blob.elempack;
712

713
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
714
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
715

716
    Mat bottom_blob_bordered;
717
    make_padding(bottom_blob, bottom_blob_bordered, opt);
718
    if (bottom_blob_bordered.empty())
719
        return -100;
720

721
    w = bottom_blob_bordered.w;
722
    h = bottom_blob_bordered.h;
723

724
    int outw = (w - kernel_extent_w) / stride_w + 1;
725
    int outh = (h - kernel_extent_h) / stride_h + 1;
726
    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
727
    size_t out_elemsize = elemsize / elempack * out_elempack;
728

729
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
730
    if (top_blob.empty())
731
        return -100;
732

733
    // depth-wise
734
    if (channels * elempack == group && group == num_output)
735
    {
736
        if (elempack == packn)
737
        {
738
            {
739
                const int maxk = kernel_w * kernel_h;
740

741
                // kernel offsets
742
                std::vector<int> _space_ofs(maxk);
743
                int* space_ofs = &_space_ofs[0];
744
                {
745
                    int p1 = 0;
746
                    int p2 = 0;
747
                    int gap = w * dilation_h - kernel_w * dilation_w;
748
                    for (int i = 0; i < kernel_h; i++)
749
                    {
750
                        for (int j = 0; j < kernel_w; j++)
751
                        {
752
                            space_ofs[p1] = p2;
753
                            p1++;
754
                            p2 += dilation_w;
755
                        }
756
                        p2 += gap;
757
                    }
758
                }
759

760
                #pragma omp parallel for num_threads(opt.num_threads)
761
                for (int g = 0; g < channels; g++)
762
                {
763
                    __fp16* outptr = top_blob.channel(g);
764
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
765
                    const Mat m = bottom_blob_bordered.channel(g);
766

767
                    for (int i = 0; i < outh; i++)
768
                    {
769
                        for (int j = 0; j < outw; j++)
770
                        {
771
                            vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
772

773
                            if (bias_term)
774
                            {
775
                                _sum = vle32_v_f32m2((const float*)bias_data + g * packn, vl);
776
                            }
777

778
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
779

780
                            for (int k = 0; k < maxk; k++)
781
                            {
782
                                vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
783
                                vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl);
784
                                _sum = vfwmacc_vv_f32m2(_sum, _val, _w, vl);
785
                            }
786

787
                            _sum = activation_ps(_sum, activation_type, activation_params, vl);
788

789
                            vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl);
790
                        }
791

792
                        outptr += outw * packn;
793
                    }
794
                }
795
            }
796
        }
797

798
        if (elempack == 1)
799
        {
800
            {
801
                const int maxk = kernel_w * kernel_h;
802

803
                // kernel offsets
804
                std::vector<int> _space_ofs(maxk);
805
                int* space_ofs = &_space_ofs[0];
806
                {
807
                    int p1 = 0;
808
                    int p2 = 0;
809
                    int gap = w * dilation_h - kernel_w * dilation_w;
810
                    for (int i = 0; i < kernel_h; i++)
811
                    {
812
                        for (int j = 0; j < kernel_w; j++)
813
                        {
814
                            space_ofs[p1] = p2;
815
                            p1++;
816
                            p2 += dilation_w;
817
                        }
818
                        p2 += gap;
819
                    }
820
                }
821

822
                #pragma omp parallel for num_threads(opt.num_threads)
823
                for (int g = 0; g < group; g++)
824
                {
825
                    __fp16* outptr = top_blob.channel(g);
826
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
827
                    const Mat m = bottom_blob_bordered.channel(g);
828

829
                    for (int i = 0; i < outh; i++)
830
                    {
831
                        for (int j = 0; j < outw; j++)
832
                        {
833
                            float sum = 0.f;
834

835
                            if (bias_term)
836
                                sum = bias_data[g];
837

838
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
839

840
                            for (int k = 0; k < maxk; k++)
841
                            {
842
                                float val = (float)sptr[space_ofs[k]];
843
                                float w = (float)kptr[k];
844
                                sum += val * w;
845
                            }
846

847
                            sum = activation_ss(sum, activation_type, activation_params);
848

849
                            outptr[j] = (__fp16)sum;
850
                        }
851

852
                        outptr += outw;
853
                    }
854
                }
855
            }
856
        }
857

858
        return 0;
859
    }
860

861
    // group convolution
862
    const int channels_g = channels * elempack / group;
863
    const int num_output_g = num_output / group;
864

865
    int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1;
866
    int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1;
867

868
    // unpacking
869
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
870
    if (elempack > g_elempack)
871
    {
872
        Option opt_p = opt;
873
        opt_p.blob_allocator = opt.workspace_allocator;
874
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
875
    }
876

877
    Mat top_blob_unpacked = top_blob;
878
    if (out_g_elempack < out_elempack)
879
    {
880
        top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
881
        if (top_blob_unpacked.empty())
882
            return -100;
883
    }
884

885
    for (int g = 0; g < group; g++)
886
    {
887
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
888
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
889

890
        const ncnn::Layer* op = group_ops[g];
891

892
        Option opt_g = opt;
893
        opt_g.blob_allocator = top_blob_unpacked.allocator;
894

895
        // forward
896
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
897
    }
898

899
    // packing
900
    if (out_g_elempack < out_elempack)
901
    {
902
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
903
    }
904
    else
905
    {
906
        top_blob = top_blob_unpacked;
907
    }
908

909
    return 0;
910
}
911

912
int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
913
{
914
    const int packn = csrr_vlenb() / 2;
915
    const size_t vl = vsetvl_e16m1(packn);
916

917
    int w = bottom_blob.w;
918
    int h = bottom_blob.h;
919
    int channels = bottom_blob.c;
920
    size_t elemsize = bottom_blob.elemsize;
921
    int elempack = bottom_blob.elempack;
922

923
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
924
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
925

926
    Mat bottom_blob_bordered;
927
    make_padding(bottom_blob, bottom_blob_bordered, opt);
928
    if (bottom_blob_bordered.empty())
929
        return -100;
930

931
    w = bottom_blob_bordered.w;
932
    h = bottom_blob_bordered.h;
933

934
    int outw = (w - kernel_extent_w) / stride_w + 1;
935
    int outh = (h - kernel_extent_h) / stride_h + 1;
936
    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
937
    size_t out_elemsize = elemsize / elempack * out_elempack;
938

939
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
940
    if (top_blob.empty())
941
        return -100;
942

943
    // depth-wise
944
    if (channels * elempack == group && group == num_output)
945
    {
946
        if (elempack == packn)
947
        {
948
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
949
            {
950
                convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);
951

952
                if (activation)
953
                {
954
                    activation->forward_inplace(top_blob, opt);
955
                }
956
            }
957
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
958
            {
959
                convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);
960

961
                if (activation)
962
                {
963
                    activation->forward_inplace(top_blob, opt);
964
                }
965
            }
966
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
967
            {
968
                convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);
969

970
                if (activation)
971
                {
972
                    activation->forward_inplace(top_blob, opt);
973
                }
974
            }
975
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
976
            {
977
                convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);
978

979
                if (activation)
980
                {
981
                    activation->forward_inplace(top_blob, opt);
982
                }
983
            }
984
            else
985
            {
986
                const int maxk = kernel_w * kernel_h;
987

988
                // kernel offsets
989
                std::vector<int> _space_ofs(maxk);
990
                int* space_ofs = &_space_ofs[0];
991
                {
992
                    int p1 = 0;
993
                    int p2 = 0;
994
                    int gap = w * dilation_h - kernel_w * dilation_w;
995
                    for (int i = 0; i < kernel_h; i++)
996
                    {
997
                        for (int j = 0; j < kernel_w; j++)
998
                        {
999
                            space_ofs[p1] = p2;
1000
                            p1++;
1001
                            p2 += dilation_w;
1002
                        }
1003
                        p2 += gap;
1004
                    }
1005
                }
1006

1007
                #pragma omp parallel for num_threads(opt.num_threads)
1008
                for (int g = 0; g < channels; g++)
1009
                {
1010
                    __fp16* outptr = top_blob.channel(g);
1011
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
1012
                    const Mat m = bottom_blob_bordered.channel(g);
1013

1014
                    for (int i = 0; i < outh; i++)
1015
                    {
1016
                        for (int j = 0; j < outw; j++)
1017
                        {
1018
                            vfloat16m1_t _sum = vfmv_v_f_f16m1((__fp16)0.f, vl);
1019

1020
                            if (bias_term)
1021
                            {
1022
                                _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + g * packn, vl);
1023
                            }
1024

1025
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
1026

1027
                            for (int k = 0; k < maxk; k++)
1028
                            {
1029
                                vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
1030
                                vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl);
1031
                                _sum = vfmacc_vv_f16m1(_sum, _val, _w, vl);
1032
                            }
1033

1034
                            _sum = activation_ps(_sum, activation_type, activation_params, vl);
1035

1036
                            vse16_v_f16m1(outptr + j * packn, _sum, vl);
1037
                        }
1038

1039
                        outptr += outw * packn;
1040
                    }
1041
                }
1042
            }
1043
        }
1044

1045
        if (elempack == 1)
1046
        {
1047
            {
1048
                const int maxk = kernel_w * kernel_h;
1049

1050
                // kernel offsets
1051
                std::vector<int> _space_ofs(maxk);
1052
                int* space_ofs = &_space_ofs[0];
1053
                {
1054
                    int p1 = 0;
1055
                    int p2 = 0;
1056
                    int gap = w * dilation_h - kernel_w * dilation_w;
1057
                    for (int i = 0; i < kernel_h; i++)
1058
                    {
1059
                        for (int j = 0; j < kernel_w; j++)
1060
                        {
1061
                            space_ofs[p1] = p2;
1062
                            p1++;
1063
                            p2 += dilation_w;
1064
                        }
1065
                        p2 += gap;
1066
                    }
1067
                }
1068

1069
                #pragma omp parallel for num_threads(opt.num_threads)
1070
                for (int g = 0; g < group; g++)
1071
                {
1072
                    __fp16* outptr = top_blob.channel(g);
1073
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
1074
                    const Mat m = bottom_blob_bordered.channel(g);
1075

1076
                    for (int i = 0; i < outh; i++)
1077
                    {
1078
                        for (int j = 0; j < outw; j++)
1079
                        {
1080
                            float sum = 0.f;
1081

1082
                            if (bias_term)
1083
                                sum = bias_data[g];
1084

1085
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
1086

1087
                            for (int k = 0; k < maxk; k++)
1088
                            {
1089
                                __fp16 val = sptr[space_ofs[k]];
1090
                                __fp16 w = kptr[k];
1091
                                sum += val * w;
1092
                            }
1093

1094
                            sum = activation_ss(sum, activation_type, activation_params);
1095

1096
                            outptr[j] = (__fp16)sum;
1097
                        }
1098

1099
                        outptr += outw;
1100
                    }
1101
                }
1102
            }
1103
        }
1104

1105
        return 0;
1106
    }
1107

1108
    // group convolution
1109
    const int channels_g = channels * elempack / group;
1110
    const int num_output_g = num_output / group;
1111

1112
    int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1;
1113
    int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1;
1114

1115
    // unpacking
1116
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
1117
    if (elempack > g_elempack)
1118
    {
1119
        Option opt_p = opt;
1120
        opt_p.blob_allocator = opt.workspace_allocator;
1121
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
1122
    }
1123

1124
    Mat top_blob_unpacked = top_blob;
1125
    if (out_g_elempack < out_elempack)
1126
    {
1127
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
1128
        if (top_blob_unpacked.empty())
1129
            return -100;
1130
    }
1131

1132
    for (int g = 0; g < group; g++)
1133
    {
1134
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
1135
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
1136

1137
        const ncnn::Layer* op = group_ops[g];
1138

1139
        Option opt_g = opt;
1140
        opt_g.blob_allocator = top_blob_unpacked.allocator;
1141

1142
        // forward
1143
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
1144
    }
1145

1146
    // packing
1147
    if (out_g_elempack < out_elempack)
1148
    {
1149
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
1150
    }
1151
    else
1152
    {
1153
        top_blob = top_blob_unpacked;
1154
    }
1155

1156
    return 0;
1157
}
1158
#endif // __riscv_vector && __riscv_zfh
1159

1160
} // namespace ncnn
1161
ncnn

Использование cookies