15
#include "concat_riscv.h"
18
#include <riscv_vector.h>
21
#include "riscv_usability.h"
25
// Constructor: advertise which storage layouts this RISC-V implementation
// supports so the framework routes the matching blobs to this layer.
// NOTE(review): the preprocessor guards were reconstructed from the upstream
// layout (support_fp16_storage only makes sense under __riscv_zfh).
Concat_riscv::Concat_riscv()
{
#if __riscv_vector
    support_packing = true;
#if __riscv_zfh
    support_fp16_storage = true;
#endif
#endif

#if NCNN_BF16
    support_bf16_storage = true;
#endif
}
39
int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
41
int elembits = bottom_blobs[0].elembits();
43
#if __riscv_vector && __riscv_zfh
44
if (opt.use_fp16_storage && elembits == 16)
45
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
49
if (opt.use_bf16_storage && elembits == 16)
50
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
54
const int packn = csrr_vlenb() / 4;
57
int dims = bottom_blobs[0].dims;
58
int positive_axis = axis < 0 ? dims + axis : axis;
64
size_t elemsize = bottom_blobs[0].elemsize;
65
int elempack = bottom_blobs[0].elempack;
67
for (size_t b = 0; b < bottom_blobs.size(); b++)
69
const Mat& bottom_blob = bottom_blobs[b];
70
top_w += bottom_blob.w * bottom_blob.elempack;
75
if (opt.use_packing_layout)
77
out_elempack = top_w % packn == 0 ? packn : 1;
80
size_t out_elemsize = elemsize / elempack * out_elempack;
82
Mat& top_blob = top_blobs[0];
83
top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
87
float* outptr = top_blob;
88
for (size_t b = 0; b < bottom_blobs.size(); b++)
90
const Mat& bottom_blob = bottom_blobs[b];
92
const float* ptr = bottom_blob;
93
memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
95
outptr += bottom_blob.w * bottom_blob.elempack;
99
if (dims == 2 && positive_axis == 0)
102
int w = bottom_blobs[0].w;
105
size_t elemsize = bottom_blobs[0].elemsize;
106
int elempack = bottom_blobs[0].elempack;
108
for (size_t b = 0; b < bottom_blobs.size(); b++)
110
const Mat& bottom_blob = bottom_blobs[b];
111
elemsize = std::min(elemsize, bottom_blob.elemsize);
112
elempack = std::min(elempack, bottom_blob.elempack);
113
top_h += bottom_blob.h * bottom_blob.elempack;
116
int out_elempack = 1;
118
if (opt.use_packing_layout)
120
out_elempack = top_h % packn == 0 ? packn : 1;
123
size_t out_elemsize = elemsize / elempack * out_elempack;
125
Mat& top_blob = top_blobs[0];
126
top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
127
if (top_blob.empty())
130
Mat top_blob_unpacked = top_blob;
131
if (elempack < out_elempack)
133
top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
134
if (top_blob_unpacked.empty())
138
float* outptr = top_blob_unpacked;
139
for (size_t b = 0; b < bottom_blobs.size(); b++)
141
const Mat& bottom_blob = bottom_blobs[b];
144
if (bottom_blob.elempack == packn && elempack == 1)
146
const size_t vl = vsetvl_e32m1(packn);
148
for (int i = 0; i < bottom_blob.h; i++)
150
const float* r0 = bottom_blob.row(i);
152
float* outptr0 = outptr;
154
for (int j = 0; j < w; j++)
156
vfloat32m1_t _p = vle32_v_f32m1(r0, vl);
157
vsse32_v_f32m1(outptr0, w * sizeof(float), _p, vl);
169
int size = w * bottom_blob.h;
171
const float* ptr = bottom_blob;
172
memcpy(outptr, ptr, size * bottom_blob.elemsize);
174
outptr += size * bottom_blob.elempack;
179
if (elempack < out_elempack)
181
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
185
if (dims == 2 && positive_axis == 1)
188
int h = bottom_blobs[0].h;
189
size_t elemsize = bottom_blobs[0].elemsize;
190
int elempack = bottom_blobs[0].elempack;
194
for (size_t b = 0; b < bottom_blobs.size(); b++)
196
const Mat& bottom_blob = bottom_blobs[b];
197
top_w += bottom_blob.w;
200
Mat& top_blob = top_blobs[0];
201
top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
202
if (top_blob.empty())
205
#pragma omp parallel for num_threads(opt.num_threads)
206
for (int i = 0; i < h; i++)
208
float* outptr = top_blob.row(i);
209
for (size_t b = 0; b < bottom_blobs.size(); b++)
211
const Mat& bottom_blob = bottom_blobs[b];
213
const float* ptr = bottom_blob.row(i);
214
memcpy(outptr, ptr, bottom_blob.w * elemsize);
216
outptr += bottom_blob.w * elempack;
221
if ((dims == 3 || dims == 4) && positive_axis == 0)
224
int w = bottom_blobs[0].w;
225
int h = bottom_blobs[0].h;
226
int d = bottom_blobs[0].d;
229
size_t elemsize = bottom_blobs[0].elemsize;
230
int elempack = bottom_blobs[0].elempack;
231
int top_channels = 0;
232
for (size_t b = 0; b < bottom_blobs.size(); b++)
234
const Mat& bottom_blob = bottom_blobs[b];
235
elemsize = std::min(elemsize, bottom_blob.elemsize);
236
elempack = std::min(elempack, bottom_blob.elempack);
237
top_channels += bottom_blob.c * bottom_blob.elempack;
240
int out_elempack = 1;
242
if (opt.use_packing_layout)
244
out_elempack = top_channels % packn == 0 ? packn : 1;
247
size_t out_elemsize = elemsize / elempack * out_elempack;
249
Mat& top_blob = top_blobs[0];
250
top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
251
if (top_blob.empty())
254
top_blob.dims = dims;
256
Mat top_blob_unpacked = top_blob;
257
if (elempack < out_elempack)
259
top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
260
if (top_blob_unpacked.empty())
263
top_blob_unpacked.dims = dims;
267
for (size_t b = 0; b < bottom_blobs.size(); b++)
269
const Mat& bottom_blob = bottom_blobs[b];
272
if (bottom_blob.elempack == packn && elempack == 1)
274
const size_t vl = vsetvl_e32m1(packn);
276
int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
278
for (int q = 0; q < bottom_blob.c; q++)
280
const float* r0 = bottom_blob.channel(q);
282
float* outptr0 = top_blob_unpacked.channel(p);
284
for (int i = 0; i < size; i++)
286
vfloat32m1_t _p = vle32_v_f32m1(r0, vl);
287
vsse32_v_f32m1(outptr0, top_blob_unpacked.cstep * sizeof(float), _p, vl);
299
int size = bottom_blob.total();
301
const float* ptr = bottom_blob;
302
float* outptr = top_blob_unpacked.channel(p);
303
memcpy(outptr, ptr, size * bottom_blob.elemsize);
310
if (elempack < out_elempack)
312
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
316
if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
319
int w = bottom_blobs[0].w;
320
int d = bottom_blobs[0].d;
321
int channels = bottom_blobs[0].c;
322
size_t elemsize = bottom_blobs[0].elemsize;
323
int elempack = bottom_blobs[0].elempack;
327
for (size_t b = 0; b < bottom_blobs.size(); b++)
329
const Mat& bottom_blob = bottom_blobs[b];
330
top_h += bottom_blob.h;
333
Mat& top_blob = top_blobs[0];
334
top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
335
if (top_blob.empty())
338
top_blob.dims = dims;
340
#pragma omp parallel for num_threads(opt.num_threads)
341
for (int q = 0; q < channels; q++)
343
float* outptr = top_blob.channel(q);
345
for (int i = 0; i < d; i++)
347
for (size_t b = 0; b < bottom_blobs.size(); b++)
349
const Mat& bottom_blob = bottom_blobs[b];
351
int size = bottom_blob.w * bottom_blob.h;
353
const float* ptr = bottom_blob.channel(q).depth(i);
354
memcpy(outptr, ptr, size * elemsize);
356
outptr += size * elempack;
362
if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
365
int h = bottom_blobs[0].h;
366
int d = bottom_blobs[0].d;
367
int channels = bottom_blobs[0].c;
368
size_t elemsize = bottom_blobs[0].elemsize;
369
int elempack = bottom_blobs[0].elempack;
373
for (size_t b = 0; b < bottom_blobs.size(); b++)
375
const Mat& bottom_blob = bottom_blobs[b];
376
top_w += bottom_blob.w;
379
Mat& top_blob = top_blobs[0];
380
top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
381
if (top_blob.empty())
384
top_blob.dims = dims;
386
#pragma omp parallel for num_threads(opt.num_threads)
387
for (int q = 0; q < channels; q++)
389
float* outptr = top_blob.channel(q);
391
for (int i = 0; i < d; i++)
393
for (int j = 0; j < h; j++)
395
for (size_t b = 0; b < bottom_blobs.size(); b++)
397
const Mat& bottom_blob = bottom_blobs[b];
399
const float* ptr = bottom_blob.channel(q).depth(i).row(j);
400
memcpy(outptr, ptr, bottom_blob.w * elemsize);
402
outptr += bottom_blob.w * elempack;
409
if (dims == 4 && positive_axis == 1)
412
int w = bottom_blobs[0].w;
413
int h = bottom_blobs[0].h;
414
int channels = bottom_blobs[0].c;
415
size_t elemsize = bottom_blobs[0].elemsize;
416
int elempack = bottom_blobs[0].elempack;
420
for (size_t b = 0; b < bottom_blobs.size(); b++)
422
const Mat& bottom_blob = bottom_blobs[b];
423
top_d += bottom_blob.d;
426
Mat& top_blob = top_blobs[0];
427
top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
428
if (top_blob.empty())
431
#pragma omp parallel for num_threads(opt.num_threads)
432
for (int q = 0; q < channels; q++)
434
float* outptr = top_blob.channel(q);
436
for (size_t b = 0; b < bottom_blobs.size(); b++)
438
const Mat& bottom_blob = bottom_blobs[b];
440
int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
442
const float* ptr = bottom_blob.channel(q);
443
memcpy(outptr, ptr, size * elemsize);
445
outptr += size * elempack;
453
int Concat_riscv::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
456
const int packn = csrr_vlenb() / 2;
459
int dims = bottom_blobs[0].dims;
460
int positive_axis = axis < 0 ? dims + axis : axis;
466
size_t elemsize = bottom_blobs[0].elemsize;
467
int elempack = bottom_blobs[0].elempack;
469
for (size_t b = 0; b < bottom_blobs.size(); b++)
471
const Mat& bottom_blob = bottom_blobs[b];
472
top_w += bottom_blob.w * bottom_blob.elempack;
475
int out_elempack = 1;
477
if (opt.use_packing_layout)
479
out_elempack = top_w % packn == 0 ? packn : 1;
482
size_t out_elemsize = elemsize / elempack * out_elempack;
484
Mat& top_blob = top_blobs[0];
485
top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
486
if (top_blob.empty())
489
unsigned short* outptr = top_blob;
490
for (size_t b = 0; b < bottom_blobs.size(); b++)
492
const Mat& bottom_blob = bottom_blobs[b];
494
const unsigned short* ptr = bottom_blob;
495
memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
497
outptr += bottom_blob.w * bottom_blob.elempack;
501
if (dims == 2 && positive_axis == 0)
504
int w = bottom_blobs[0].w;
507
size_t elemsize = bottom_blobs[0].elemsize;
508
int elempack = bottom_blobs[0].elempack;
510
for (size_t b = 0; b < bottom_blobs.size(); b++)
512
const Mat& bottom_blob = bottom_blobs[b];
513
elemsize = std::min(elemsize, bottom_blob.elemsize);
514
elempack = std::min(elempack, bottom_blob.elempack);
515
top_h += bottom_blob.h * bottom_blob.elempack;
518
int out_elempack = 1;
520
if (opt.use_packing_layout)
522
out_elempack = top_h % packn == 0 ? packn : 1;
525
size_t out_elemsize = elemsize / elempack * out_elempack;
527
Mat& top_blob = top_blobs[0];
528
top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
529
if (top_blob.empty())
532
Mat top_blob_unpacked = top_blob;
533
if (elempack < out_elempack)
535
top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
536
if (top_blob_unpacked.empty())
540
unsigned short* outptr = top_blob_unpacked;
541
for (size_t b = 0; b < bottom_blobs.size(); b++)
543
const Mat& bottom_blob = bottom_blobs[b];
546
if (bottom_blob.elempack == packn && elempack == 1)
548
const size_t vl = vsetvl_e16m1(packn);
550
for (int i = 0; i < bottom_blob.h; i++)
552
const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
554
unsigned short* outptr0 = outptr;
556
for (int j = 0; j < w; j++)
558
vuint16m1_t _p = vle16_v_u16m1(r0, vl);
559
vsse16_v_u16m1(outptr0, w * sizeof(unsigned short), _p, vl);
571
int size = w * bottom_blob.h;
573
const unsigned short* ptr = bottom_blob;
574
memcpy(outptr, ptr, size * bottom_blob.elemsize);
576
outptr += size * bottom_blob.elempack;
581
if (elempack < out_elempack)
583
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
587
if (dims == 2 && positive_axis == 1)
590
int h = bottom_blobs[0].h;
591
size_t elemsize = bottom_blobs[0].elemsize;
592
int elempack = bottom_blobs[0].elempack;
596
for (size_t b = 0; b < bottom_blobs.size(); b++)
598
const Mat& bottom_blob = bottom_blobs[b];
599
top_w += bottom_blob.w;
602
Mat& top_blob = top_blobs[0];
603
top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
604
if (top_blob.empty())
607
#pragma omp parallel for num_threads(opt.num_threads)
608
for (int i = 0; i < h; i++)
610
unsigned short* outptr = top_blob.row<unsigned short>(i);
611
for (size_t b = 0; b < bottom_blobs.size(); b++)
613
const Mat& bottom_blob = bottom_blobs[b];
615
const unsigned short* ptr = bottom_blob.row<unsigned short>(i);
616
memcpy(outptr, ptr, bottom_blob.w * elemsize);
618
outptr += bottom_blob.w * elempack;
623
if ((dims == 3 || dims == 4) && positive_axis == 0)
626
int w = bottom_blobs[0].w;
627
int h = bottom_blobs[0].h;
628
int d = bottom_blobs[0].d;
631
size_t elemsize = bottom_blobs[0].elemsize;
632
int elempack = bottom_blobs[0].elempack;
633
int top_channels = 0;
634
for (size_t b = 0; b < bottom_blobs.size(); b++)
636
const Mat& bottom_blob = bottom_blobs[b];
637
elemsize = std::min(elemsize, bottom_blob.elemsize);
638
elempack = std::min(elempack, bottom_blob.elempack);
639
top_channels += bottom_blob.c * bottom_blob.elempack;
642
int out_elempack = 1;
644
if (opt.use_packing_layout)
646
out_elempack = top_channels % packn == 0 ? packn : 1;
649
size_t out_elemsize = elemsize / elempack * out_elempack;
651
Mat& top_blob = top_blobs[0];
652
top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
653
if (top_blob.empty())
656
top_blob.dims = dims;
658
Mat top_blob_unpacked = top_blob;
659
if (elempack < out_elempack)
661
top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
662
if (top_blob_unpacked.empty())
665
top_blob_unpacked.dims = dims;
669
for (size_t b = 0; b < bottom_blobs.size(); b++)
671
const Mat& bottom_blob = bottom_blobs[b];
674
if (bottom_blob.elempack == packn && elempack == 1)
676
const size_t vl = vsetvl_e16m1(packn);
678
int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
680
for (int q = 0; q < bottom_blob.c; q++)
682
const unsigned short* r0 = bottom_blob.channel(q);
684
unsigned short* outptr0 = top_blob_unpacked.channel(p);
686
for (int i = 0; i < size; i++)
688
vuint16m1_t _p = vle16_v_u16m1(r0, vl);
689
vsse16_v_u16m1(outptr0, top_blob_unpacked.cstep * sizeof(unsigned short), _p, vl);
701
int size = bottom_blob.total();
703
const unsigned short* ptr = bottom_blob;
704
unsigned short* outptr = top_blob_unpacked.channel(p);
705
memcpy(outptr, ptr, size * bottom_blob.elemsize);
712
if (elempack < out_elempack)
714
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
718
if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
721
int w = bottom_blobs[0].w;
722
int d = bottom_blobs[0].d;
723
int channels = bottom_blobs[0].c;
724
size_t elemsize = bottom_blobs[0].elemsize;
725
int elempack = bottom_blobs[0].elempack;
729
for (size_t b = 0; b < bottom_blobs.size(); b++)
731
const Mat& bottom_blob = bottom_blobs[b];
732
top_h += bottom_blob.h;
735
Mat& top_blob = top_blobs[0];
736
top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
737
if (top_blob.empty())
740
top_blob.dims = dims;
742
#pragma omp parallel for num_threads(opt.num_threads)
743
for (int q = 0; q < channels; q++)
745
unsigned short* outptr = top_blob.channel(q);
747
for (int i = 0; i < d; i++)
749
for (size_t b = 0; b < bottom_blobs.size(); b++)
751
const Mat& bottom_blob = bottom_blobs[b];
753
int size = bottom_blob.w * bottom_blob.h;
755
const unsigned short* ptr = bottom_blob.channel(q).depth(i);
756
memcpy(outptr, ptr, size * elemsize);
758
outptr += size * elempack;
764
if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
767
int h = bottom_blobs[0].h;
768
int d = bottom_blobs[0].d;
769
int channels = bottom_blobs[0].c;
770
size_t elemsize = bottom_blobs[0].elemsize;
771
int elempack = bottom_blobs[0].elempack;
775
for (size_t b = 0; b < bottom_blobs.size(); b++)
777
const Mat& bottom_blob = bottom_blobs[b];
778
top_w += bottom_blob.w;
781
Mat& top_blob = top_blobs[0];
782
top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
783
if (top_blob.empty())
786
top_blob.dims = dims;
788
#pragma omp parallel for num_threads(opt.num_threads)
789
for (int q = 0; q < channels; q++)
791
unsigned short* outptr = top_blob.channel(q);
793
for (int i = 0; i < d; i++)
795
for (int j = 0; j < h; j++)
797
for (size_t b = 0; b < bottom_blobs.size(); b++)
799
const Mat& bottom_blob = bottom_blobs[b];
801
const unsigned short* ptr = bottom_blob.channel(q).depth(i).row<const unsigned short>(j);
802
memcpy(outptr, ptr, bottom_blob.w * elemsize);
804
outptr += bottom_blob.w * elempack;
811
if (dims == 4 && positive_axis == 1)
814
int w = bottom_blobs[0].w;
815
int h = bottom_blobs[0].h;
816
int channels = bottom_blobs[0].c;
817
size_t elemsize = bottom_blobs[0].elemsize;
818
int elempack = bottom_blobs[0].elempack;
822
for (size_t b = 0; b < bottom_blobs.size(); b++)
824
const Mat& bottom_blob = bottom_blobs[b];
825
top_d += bottom_blob.d;
828
Mat& top_blob = top_blobs[0];
829
top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
830
if (top_blob.empty())
833
#pragma omp parallel for num_threads(opt.num_threads)
834
for (int q = 0; q < channels; q++)
836
unsigned short* outptr = top_blob.channel(q);
838
for (size_t b = 0; b < bottom_blobs.size(); b++)
840
const Mat& bottom_blob = bottom_blobs[b];
842
int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
844
const unsigned short* ptr = bottom_blob.channel(q);
845
memcpy(outptr, ptr, size * elemsize);
847
outptr += size * elempack;