15
#include "concat_riscv.h"
18
#include <riscv_vector.h>
21
#include "riscv_usability.h"
25
// Constructor: advertise which storage layouts this RISC-V implementation
// supports so the framework routes the matching blobs to this layer.
// NOTE(review): the preprocessor guards were reconstructed from the upstream
// layout (support_fp16_storage only makes sense under __riscv_zfh).
Concat_riscv::Concat_riscv()
{
#if __riscv_vector
    support_packing = true;
#if __riscv_zfh
    support_fp16_storage = true;
#endif
#endif

#if NCNN_BF16
    support_bf16_storage = true;
#endif
}
39
int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
41
int elembits = bottom_blobs[0].elembits();
43
#if __riscv_vector && __riscv_zfh
44
if (opt.use_fp16_storage && elembits == 16)
45
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
49
if (opt.use_bf16_storage && elembits == 16)
50
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
54
const int packn = csrr_vlenb() / 4;
57
int dims = bottom_blobs[0].dims;
58
int positive_axis = axis < 0 ? dims + axis : axis;
64
size_t elemsize = bottom_blobs[0].elemsize;
65
int elempack = bottom_blobs[0].elempack;
67
for (size_t b = 0; b < bottom_blobs.size(); b++)
69
const Mat& bottom_blob = bottom_blobs[b];
70
top_w += bottom_blob.w * bottom_blob.elempack;
75
if (opt.use_packing_layout)
77
out_elempack = top_w % packn == 0 ? packn : 1;
80
size_t out_elemsize = elemsize / elempack * out_elempack;
82
Mat& top_blob = top_blobs[0];
83
top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
87
float* outptr = top_blob;
88
for (size_t b = 0; b < bottom_blobs.size(); b++)
90
const Mat& bottom_blob = bottom_blobs[b];
92
const float* ptr = bottom_blob;
93
memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
95
outptr += bottom_blob.w * bottom_blob.elempack;
99
if (dims == 2 && positive_axis == 0)
102
int w = bottom_blobs[0].w;
105
size_t elemsize = bottom_blobs[0].elemsize;
106
int elempack = bottom_blobs[0].elempack;
108
for (size_t b = 0; b < bottom_blobs.size(); b++)
110
const Mat& bottom_blob = bottom_blobs[b];
111
elemsize = std::min(elemsize, bottom_blob.elemsize);
112
elempack = std::min(elempack, bottom_blob.elempack);
113
top_h += bottom_blob.h * bottom_blob.elempack;
116
int out_elempack = 1;
118
if (opt.use_packing_layout)
120
out_elempack = top_h % packn == 0 ? packn : 1;
123
size_t out_elemsize = elemsize / elempack * out_elempack;
125
Mat& top_blob = top_blobs[0];
126
top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
127
if (top_blob.empty())
130
Mat top_blob_unpacked = top_blob;
131
if (elempack < out_elempack)
133
top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
134
if (top_blob_unpacked.empty())
138
float* outptr = top_blob_unpacked;
139
for (size_t b = 0; b < bottom_blobs.size(); b++)
141
const Mat& bottom_blob = bottom_blobs[b];
144
if (bottom_blob.elempack == packn && elempack == 1)
146
const size_t vl = vsetvl_e32m1(packn);
148
for (int i = 0; i < bottom_blob.h; i++)
150
const float* r0 = bottom_blob.row(i);
152
float* outptr0 = outptr;
154
for (int j = 0; j < w; j++)
156
vfloat32m1_t _p = vle32_v_f32m1(r0, vl);
157
vsse32_v_f32m1(outptr0, w * sizeof(float), _p, vl);
169
int size = w * bottom_blob.h;
171
const float* ptr = bottom_blob;
172
memcpy(outptr, ptr, size * bottom_blob.elemsize);
174
outptr += size * bottom_blob.elempack;
179
if (elempack < out_elempack)
181
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
185
if (dims == 2 && positive_axis == 1)
188
int h = bottom_blobs[0].h;
189
size_t elemsize = bottom_blobs[0].elemsize;
190
int elempack = bottom_blobs[0].elempack;
194
for (size_t b = 0; b < bottom_blobs.size(); b++)
196
const Mat& bottom_blob = bottom_blobs[b];
197
top_w += bottom_blob.w;
200
Mat& top_blob = top_blobs[0];
201
top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
202
if (top_blob.empty())
205
#pragma omp parallel for num_threads(opt.num_threads)
206
for (int i = 0; i < h; i++)
208
float* outptr = top_blob.row(i);
209
for (size_t b = 0; b < bottom_blobs.size(); b++)
211
const Mat& bottom_blob = bottom_blobs[b];
213
const float* ptr = bottom_blob.row(i);
214
memcpy(outptr, ptr, bottom_blob.w * elemsize);
216
outptr += bottom_blob.w * elempack;
221
if ((dims == 3 || dims == 4) && positive_axis == 0)
224
int w = bottom_blobs[0].w;
225
int h = bottom_blobs[0].h;
226
int d = bottom_blobs[0].d;
229
size_t elemsize = bottom_blobs[0].elemsize;
230
int elempack = bottom_blobs[0].elempack;
231
int top_channels = 0;
232
for (size_t b = 0; b < bottom_blobs.size(); b++)
234
const Mat& bottom_blob = bottom_blobs[b];
235
elemsize = std::min(elemsize, bottom_blob.elemsize);
236
elempack = std::min(elempack, bottom_blob.elempack);
237
top_channels += bottom_blob.c * bottom_blob.elempack;
240
int out_elempack = 1;
242
if (opt.use_packing_layout)
244
out_elempack = top_channels % packn == 0 ? packn : 1;
247
size_t out_elemsize = elemsize / elempack * out_elempack;
249
Mat& top_blob = top_blobs[0];
250
top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
251
if (top_blob.empty())
254
top_blob.dims = dims;
256
Mat top_blob_unpacked = top_blob;
257
if (elempack < out_elempack)
259
top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
260
if (top_blob_unpacked.empty())
263
top_blob_unpacked.dims = dims;
267
for (size_t b = 0; b < bottom_blobs.size(); b++)
269
const Mat& bottom_blob = bottom_blobs[b];
272
if (bottom_blob.elempack == packn && elempack == 1)
274
const size_t vl = vsetvl_e32m1(packn);
276
int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
278
for (int q = 0; q < bottom_blob.c; q++)
280
const float* r0 = bottom_blob.channel(q);
282
float* outptr0 = top_blob_unpacked.channel(p);
284
for (int i = 0; i < size; i++)
286
vfloat32m1_t _p = vle32_v_f32m1(r0, vl);
287
vsse32_v_f32m1(outptr0, top_blob_unpacked.cstep * sizeof(float), _p, vl);
299
int size = bottom_blob.total();
301
const float* ptr = bottom_blob;
302
float* outptr = top_blob_unpacked.channel(p);
303
memcpy(outptr, ptr, size * bottom_blob.elemsize);
310
if (elempack < out_elempack)
312
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
316
if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
319
int w = bottom_blobs[0].w;
320
int d = bottom_blobs[0].d;
321
int channels = bottom_blobs[0].c;
322
size_t elemsize = bottom_blobs[0].elemsize;
323
int elempack = bottom_blobs[0].elempack;
327
for (size_t b = 0; b < bottom_blobs.size(); b++)
329
const Mat& bottom_blob = bottom_blobs[b];
330
top_h += bottom_blob.h;
333
Mat& top_blob = top_blobs[0];
334
top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
335
if (top_blob.empty())
338
top_blob.dims = dims;
340
#pragma omp parallel for num_threads(opt.num_threads)
341
for (int q = 0; q < channels; q++)
343
float* outptr = top_blob.channel(q);
345
for (int i = 0; i < d; i++)
347
for (size_t b = 0; b < bottom_blobs.size(); b++)
349
const Mat& bottom_blob = bottom_blobs[b];
351
int size = bottom_blob.w * bottom_blob.h;
353
const float* ptr = bottom_blob.channel(q).depth(i);
354
memcpy(outptr, ptr, size * elemsize);
356
outptr += size * elempack;
362
if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
365
int h = bottom_blobs[0].h;
366
int d = bottom_blobs[0].d;
367
int channels = bottom_blobs[0].c;
368
size_t elemsize = bottom_blobs[0].elemsize;
369
int elempack = bottom_blobs[0].elempack;
373
for (size_t b = 0; b < bottom_blobs.size(); b++)
375
const Mat& bottom_blob = bottom_blobs[b];
376
top_w += bottom_blob.w;
379
Mat& top_blob = top_blobs[0];
380
top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
381
if (top_blob.empty())
384
top_blob.dims = dims;
386
#pragma omp parallel for num_threads(opt.num_threads)
387
for (int q = 0; q < channels; q++)
389
float* outptr = top_blob.channel(q);
391
for (int i = 0; i < d; i++)
393
for (int j = 0; j < h; j++)
395
for (size_t b = 0; b < bottom_blobs.size(); b++)
397
const Mat& bottom_blob = bottom_blobs[b];
399
const float* ptr = bottom_blob.channel(q).depth(i).row(j);
400
memcpy(outptr, ptr, bottom_blob.w * elemsize);
402
outptr += bottom_blob.w * elempack;
409
if (dims == 4 && positive_axis == 1)
412
int w = bottom_blobs[0].w;
413
int h = bottom_blobs[0].h;
414
int channels = bottom_blobs[0].c;
415
size_t elemsize = bottom_blobs[0].elemsize;
416
int elempack = bottom_blobs[0].elempack;
420
for (size_t b = 0; b < bottom_blobs.size(); b++)
422
const Mat& bottom_blob = bottom_blobs[b];
423
top_d += bottom_blob.d;
426
Mat& top_blob = top_blobs[0];
427
top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
428
if (top_blob.empty())
431
#pragma omp parallel for num_threads(opt.num_threads)
432
for (int q = 0; q < channels; q++)
434
float* outptr = top_blob.channel(q);
436
for (size_t b = 0; b < bottom_blobs.size(); b++)
438
const Mat& bottom_blob = bottom_blobs[b];
440
int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
442
const float* ptr = bottom_blob.channel(q);
443
memcpy(outptr, ptr, size * elemsize);
445
outptr += size * elempack;
453
int Concat_riscv::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
456
const int packn = csrr_vlenb() / 2;
459
int dims = bottom_blobs[0].dims;
460
int positive_axis = axis < 0 ? dims + axis : axis;
466
size_t elemsize = bottom_blobs[0].elemsize;
467
int elempack = bottom_blobs[0].elempack;
469
for (size_t b = 0; b < bottom_blobs.size(); b++)
471
const Mat& bottom_blob = bottom_blobs[b];
472
top_w += bottom_blob.w * bottom_blob.elempack;
475
int out_elempack = 1;
477
if (opt.use_packing_layout)
479
out_elempack = top_w % packn == 0 ? packn : 1;
482
size_t out_elemsize = elemsize / elempack * out_elempack;
484
Mat& top_blob = top_blobs[0];
485
top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
486
if (top_blob.empty())
489
unsigned short* outptr = top_blob;
490
for (size_t b = 0; b < bottom_blobs.size(); b++)
492
const Mat& bottom_blob = bottom_blobs[b];
494
const unsigned short* ptr = bottom_blob;
495
memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
497
outptr += bottom_blob.w * bottom_blob.elempack;
501
if (dims == 2 && positive_axis == 0)
504
int w = bottom_blobs[0].w;
507
size_t elemsize = bottom_blobs[0].elemsize;
508
int elempack = bottom_blobs[0].elempack;
510
for (size_t b = 0; b < bottom_blobs.size(); b++)
512
const Mat& bottom_blob = bottom_blobs[b];
513
elemsize = std::min(elemsize, bottom_blob.elemsize);
514
elempack = std::min(elempack, bottom_blob.elempack);
515
top_h += bottom_blob.h * bottom_blob.elempack;
518
int out_elempack = 1;
520
if (opt.use_packing_layout)
522
out_elempack = top_h % packn == 0 ? packn : 1;
525
size_t out_elemsize = elemsize / elempack * out_elempack;
527
Mat& top_blob = top_blobs[0];
528
top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
529
if (top_blob.empty())
532
Mat top_blob_unpacked = top_blob;
533
if (elempack < out_elempack)
535
top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
536
if (top_blob_unpacked.empty())
540
unsigned short* outptr = top_blob_unpacked;
541
for (size_t b = 0; b < bottom_blobs.size(); b++)
543
const Mat& bottom_blob = bottom_blobs[b];
546
if (bottom_blob.elempack == packn && elempack == 1)
548
const size_t vl = vsetvl_e16m1(packn);
550
for (int i = 0; i < bottom_blob.h; i++)
552
const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
554
unsigned short* outptr0 = outptr;
556
for (int j = 0; j < w; j++)
558
vuint16m1_t _p = vle16_v_u16m1(r0, vl);
559
vsse16_v_u16m1(outptr0, w * sizeof(unsigned short), _p, vl);
571
int size = w * bottom_blob.h;
573
const unsigned short* ptr = bottom_blob;
574
memcpy(outptr, ptr, size * bottom_blob.elemsize);
576
outptr += size * bottom_blob.elempack;
581
if (elempack < out_elempack)
583
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
587
if (dims == 2 && positive_axis == 1)
590
int h = bottom_blobs[0].h;
591
size_t elemsize = bottom_blobs[0].elemsize;
592
int elempack = bottom_blobs[0].elempack;
596
for (size_t b = 0; b < bottom_blobs.size(); b++)
598
const Mat& bottom_blob = bottom_blobs[b];
599
top_w += bottom_blob.w;
602
Mat& top_blob = top_blobs[0];
603
top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
604
if (top_blob.empty())
607
#pragma omp parallel for num_threads(opt.num_threads)
608
for (int i = 0; i < h; i++)
610
unsigned short* outptr = top_blob.row<unsigned short>(i);
611
for (size_t b = 0; b < bottom_blobs.size(); b++)
613
const Mat& bottom_blob = bottom_blobs[b];
615
const unsigned short* ptr = bottom_blob.row<unsigned short>(i);
616
memcpy(outptr, ptr, bottom_blob.w * elemsize);
618
outptr += bottom_blob.w * elempack;
623
if ((dims == 3 || dims == 4) && positive_axis == 0)
626
int w = bottom_blobs[0].w;
627
int h = bottom_blobs[0].h;
628
int d = bottom_blobs[0].d;
631
size_t elemsize = bottom_blobs[0].elemsize;
632
int elempack = bottom_blobs[0].elempack;
633
int top_channels = 0;
634
for (size_t b = 0; b < bottom_blobs.size(); b++)
636
const Mat& bottom_blob = bottom_blobs[b];
637
elemsize = std::min(elemsize, bottom_blob.elemsize);
638
elempack = std::min(elempack, bottom_blob.elempack);
639
top_channels += bottom_blob.c * bottom_blob.elempack;
642
int out_elempack = 1;
644
if (opt.use_packing_layout)
646
out_elempack = top_channels % packn == 0 ? packn : 1;
649
size_t out_elemsize = elemsize / elempack * out_elempack;
651
Mat& top_blob = top_blobs[0];
652
top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
653
if (top_blob.empty())
656
top_blob.dims = dims;
658
Mat top_blob_unpacked = top_blob;
659
if (elempack < out_elempack)
661
top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
662
if (top_blob_unpacked.empty())
665
top_blob_unpacked.dims = dims;
669
for (size_t b = 0; b < bottom_blobs.size(); b++)
671
const Mat& bottom_blob = bottom_blobs[b];
674
if (bottom_blob.elempack == packn && elempack == 1)
676
const size_t vl = vsetvl_e16m1(packn);
678
int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
680
for (int q = 0; q < bottom_blob.c; q++)
682
const unsigned short* r0 = bottom_blob.channel(q);
684
unsigned short* outptr0 = top_blob_unpacked.channel(p);
686
for (int i = 0; i < size; i++)
688
vuint16m1_t _p = vle16_v_u16m1(r0, vl);
689
vsse16_v_u16m1(outptr0, top_blob_unpacked.cstep * sizeof(unsigned short), _p, vl);
701
int size = bottom_blob.total();
703
const unsigned short* ptr = bottom_blob;
704
unsigned short* outptr = top_blob_unpacked.channel(p);
705
memcpy(outptr, ptr, size * bottom_blob.elemsize);
712
if (elempack < out_elempack)
714
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
718
if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
721
int w = bottom_blobs[0].w;
722
int d = bottom_blobs[0].d;
723
int channels = bottom_blobs[0].c;
724
size_t elemsize = bottom_blobs[0].elemsize;
725
int elempack = bottom_blobs[0].elempack;
729
for (size_t b = 0; b < bottom_blobs.size(); b++)
731
const Mat& bottom_blob = bottom_blobs[b];
732
top_h += bottom_blob.h;
735
Mat& top_blob = top_blobs[0];
736
top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
737
if (top_blob.empty())
740
top_blob.dims = dims;
742
#pragma omp parallel for num_threads(opt.num_threads)
743
for (int q = 0; q < channels; q++)
745
unsigned short* outptr = top_blob.channel(q);
747
for (int i = 0; i < d; i++)
749
for (size_t b = 0; b < bottom_blobs.size(); b++)
751
const Mat& bottom_blob = bottom_blobs[b];
753
int size = bottom_blob.w * bottom_blob.h;
755
const unsigned short* ptr = bottom_blob.channel(q).depth(i);
756
memcpy(outptr, ptr, size * elemsize);
758
outptr += size * elempack;
764
if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
767
int h = bottom_blobs[0].h;
768
int d = bottom_blobs[0].d;
769
int channels = bottom_blobs[0].c;
770
size_t elemsize = bottom_blobs[0].elemsize;
771
int elempack = bottom_blobs[0].elempack;
775
for (size_t b = 0; b < bottom_blobs.size(); b++)
777
const Mat& bottom_blob = bottom_blobs[b];
778
top_w += bottom_blob.w;
781
Mat& top_blob = top_blobs[0];
782
top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
783
if (top_blob.empty())
786
top_blob.dims = dims;
788
#pragma omp parallel for num_threads(opt.num_threads)
789
for (int q = 0; q < channels; q++)
791
unsigned short* outptr = top_blob.channel(q);
793
for (int i = 0; i < d; i++)
795
for (int j = 0; j < h; j++)
797
for (size_t b = 0; b < bottom_blobs.size(); b++)
799
const Mat& bottom_blob = bottom_blobs[b];
801
const unsigned short* ptr = bottom_blob.channel(q).depth(i).row<const unsigned short>(j);
802
memcpy(outptr, ptr, bottom_blob.w * elemsize);
804
outptr += bottom_blob.w * elempack;
811
if (dims == 4 && positive_axis == 1)
814
int w = bottom_blobs[0].w;
815
int h = bottom_blobs[0].h;
816
int channels = bottom_blobs[0].c;
817
size_t elemsize = bottom_blobs[0].elemsize;
818
int elempack = bottom_blobs[0].elempack;
822
for (size_t b = 0; b < bottom_blobs.size(); b++)
824
const Mat& bottom_blob = bottom_blobs[b];
825
top_d += bottom_blob.d;
828
Mat& top_blob = top_blobs[0];
829
top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
830
if (top_blob.empty())
833
#pragma omp parallel for num_threads(opt.num_threads)
834
for (int q = 0; q < channels; q++)
836
unsigned short* outptr = top_blob.channel(q);
838
for (size_t b = 0; b < bottom_blobs.size(); b++)
840
const Mat& bottom_blob = bottom_blobs[b];
842
int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
844
const unsigned short* ptr = bottom_blob.channel(q);
845
memcpy(outptr, ptr, size * elemsize);
847
outptr += size * elempack;