// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// should be a kanna ascii art here in my local branch
// but we shall ask the original art author for permission first ...
// https://www.reddit.com/r/anime/comments/5uxjn4/i_recreated_the_kanna_ascii_art_from_kobayashisan/
static void kanna_rotate_1_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw;
    const int wgap = stride - w;
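    // srcwgap / wgap are the per-row padding in bytes: the row stride minus
    // the packed payload (width x channels). Adding the gap at the end of a
    // row moves a pointer to the start of the next row.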
    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    for (; y + 1 < srch; y += 2)
    int remain = srcw - (nn << 5);
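    // The usual NEON body/tail split: nn counts 32-byte vector iterations
    // per row (declared as nn = srcw >> 5 in the full source when
    // __ARM_NEON is enabled) and remain is the scalar leftover.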
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x16_t _src0 = vld1q_u8(src0);
    uint8x16_t _src0n = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src0);
    vst1q_u8(dst0 + 16, _src0n);

    uint8x16_t _src1 = vld1q_u8(src1);
    uint8x16_t _src1n = vld1q_u8(src1 + 16);
    vst1q_u8(dst1, _src1);
    vst1q_u8(dst1 + 16, _src1n);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vld1.u8 {d4-d7}, [%2]! \n"
    "vst1.u8 {d0-d3}, [%3]! \n"
    "vst1.u8 {d4-d7}, [%4]! \n"
    : "cc", "memory", "q0", "q1", "q2", "q3");

    for (; remain > 0; remain--)

    src0 += srcwgap + srcstride;
    src1 += srcwgap + srcstride;
    dst0 += wgap + stride;
    dst1 += wgap + stride;
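    // After a row pair the pointers sit at the end of row y (resp. y + 1);
    // adding the row gap plus one extra stride skips the partner row, so
    // the next iteration starts at row y + 2.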
    for (; y < srch; y++)
    int remain = srcw - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x16_t _src = vld1q_u8(src0);
    uint8x16_t _src2 = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src);
    vst1q_u8(dst0 + 16, _src2);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vst1.u8 {d0-d3}, [%2]! \n"
    : "cc", "memory", "q0", "q1");

    for (; remain > 0; remain--)

static void kanna_rotate_1_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 2;
    const int wgap = stride - w * 2;

    int size = srcw * 2;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    for (; y + 1 < srch; y += 2)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x16_t _src0 = vld1q_u8(src0);
    uint8x16_t _src0n = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src0);
    vst1q_u8(dst0 + 16, _src0n);

    uint8x16_t _src1 = vld1q_u8(src1);
    uint8x16_t _src1n = vld1q_u8(src1 + 16);
    vst1q_u8(dst1, _src1);
    vst1q_u8(dst1 + 16, _src1n);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vld1.u8 {d4-d7}, [%2]! \n"
    "vst1.u8 {d0-d3}, [%3]! \n"
    "vst1.u8 {d4-d7}, [%4]! \n"
    : "cc", "memory", "q0", "q1", "q2", "q3");

    for (; remain > 0; remain--)

    src0 += srcwgap + srcstride;
    src1 += srcwgap + srcstride;
    dst0 += wgap + stride;
    dst1 += wgap + stride;

    for (; y < srch; y++)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x16_t _src = vld1q_u8(src0);
    uint8x16_t _src2 = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src);
    vst1q_u8(dst0 + 16, _src2);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vst1.u8 {d0-d3}, [%2]! \n"
    : "cc", "memory", "q0", "q1");

    for (; remain > 0; remain--)

static void kanna_rotate_1_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 3;
    const int wgap = stride - w * 3;

    int size = srcw * 3;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    for (; y + 1 < srch; y += 2)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x16_t _src0 = vld1q_u8(src0);
    uint8x16_t _src0n = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src0);
    vst1q_u8(dst0 + 16, _src0n);

    uint8x16_t _src1 = vld1q_u8(src1);
    uint8x16_t _src1n = vld1q_u8(src1 + 16);
    vst1q_u8(dst1, _src1);
    vst1q_u8(dst1 + 16, _src1n);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vld1.u8 {d4-d7}, [%2]! \n"
    "vst1.u8 {d0-d3}, [%3]! \n"
    "vst1.u8 {d4-d7}, [%4]! \n"
    : "cc", "memory", "q0", "q1", "q2", "q3");

    for (; remain > 0; remain--)

    src0 += srcwgap + srcstride;
    src1 += srcwgap + srcstride;
    dst0 += wgap + stride;
    dst1 += wgap + stride;

    for (; y < srch; y++)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x16_t _src = vld1q_u8(src0);
    uint8x16_t _src2 = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src);
    vst1q_u8(dst0 + 16, _src2);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vst1.u8 {d0-d3}, [%2]! \n"
    : "cc", "memory", "q0", "q1");

    for (; remain > 0; remain--)

static void kanna_rotate_1_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 4;
    const int wgap = stride - w * 4;

    int size = srcw * 4;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    for (; y + 1 < srch; y += 2)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x16_t _src0 = vld1q_u8(src0);
    uint8x16_t _src0n = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src0);
    vst1q_u8(dst0 + 16, _src0n);

    uint8x16_t _src1 = vld1q_u8(src1);
    uint8x16_t _src1n = vld1q_u8(src1 + 16);
    vst1q_u8(dst1, _src1);
    vst1q_u8(dst1 + 16, _src1n);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vld1.u8 {d4-d7}, [%2]! \n"
    "vst1.u8 {d0-d3}, [%3]! \n"
    "vst1.u8 {d4-d7}, [%4]! \n"
    : "cc", "memory", "q0", "q1", "q2", "q3");

    for (; remain > 0; remain--)

    src0 += srcwgap + srcstride;
    src1 += srcwgap + srcstride;
    dst0 += wgap + stride;
    dst1 += wgap + stride;

    for (; y < srch; y++)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x16_t _src = vld1q_u8(src0);
    uint8x16_t _src2 = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src);
    vst1q_u8(dst0 + 16, _src2);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vst1.u8 {d0-d3}, [%2]! \n"
    : "cc", "memory", "q0", "q1");

    for (; remain > 0; remain--)

static void kanna_rotate_2_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw;
    const int wgap = stride + w;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst + w - 1;
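    // Horizontal mirror: dst0 starts at the last pixel of the output row
    // and the row is filled right-to-left while the source is read
    // left-to-right; wgap = stride + w compensates for the backward walk
    // when stepping to the next row.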
    for (; y < srch; y++)
    int remain = srcw - (nn << 4);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x8_t _src = vld1_u8(src0);
    uint8x8_t _src2 = vld1_u8(src0 + 8);

    _src = vrev64_u8(_src);
    _src2 = vrev64_u8(_src2);

    vst1_u8(dst0, _src2);
    vst1_u8(dst0 + 8, _src);
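    // vrev64_u8 only reverses bytes within each 8-byte vector, so the two
    // halves are stored swapped (_src2 first, then _src) to reverse the
    // full 16-byte chunk end to end.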
    "vld1.u8 {d0-d1}, [%1]! \n"
    "vrev64.u8 d3, d0 \n"
    "vrev64.u8 d2, d1 \n"
    "vst1.u8 {d2-d3}, [%2], r4 \n"
    : "cc", "memory", "q0", "q1", "r4");

    for (; remain > 0; remain--)

static void kanna_rotate_2_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 2;
    const int wgap = stride + w * 2;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst + w * 2 - 2;

    for (; y < srch; y++)
    int remain = srcw - (nn << 4);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x8x2_t _src = vld2_u8(src0);
    uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);

    _src.val[0] = vrev64_u8(_src.val[0]);
    _src.val[1] = vrev64_u8(_src.val[1]);

    _src2.val[0] = vrev64_u8(_src2.val[0]);
    _src2.val[1] = vrev64_u8(_src2.val[1]);

    vst2_u8(dst0, _src);
    vst2_u8(dst0 - 8 * 2, _src2);

    "vld2.u8 {d0-d1}, [%1]! \n"
    "vrev64.u8 d0, d0 \n"
    "vld2.u8 {d2-d3}, [%1]! \n"
    "vrev64.u8 d1, d1 \n"
    "vrev64.u8 d2, d2 \n"
    "vst2.u8 {d0-d1}, [%2], r4 \n"
    "vrev64.u8 d3, d3 \n"
    "vst2.u8 {d2-d3}, [%2], r4 \n"
    : "cc", "memory", "q0", "q1", "r4");

    for (; remain > 0; remain--)

static void kanna_rotate_2_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 3;
    const int wgap = stride + w * 3;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst + w * 3 - 3;

    for (; y < srch; y++)
    int remain = srcw - (nn << 4);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x8x3_t _src = vld3_u8(src0);
    uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);

    _src.val[0] = vrev64_u8(_src.val[0]);
    _src.val[1] = vrev64_u8(_src.val[1]);
    _src.val[2] = vrev64_u8(_src.val[2]);

    _src2.val[0] = vrev64_u8(_src2.val[0]);
    _src2.val[1] = vrev64_u8(_src2.val[1]);
    _src2.val[2] = vrev64_u8(_src2.val[2]);

    vst3_u8(dst0, _src);
    vst3_u8(dst0 - 8 * 3, _src2);

    "vld3.u8 {d0-d2}, [%1]! \n"
    "vrev64.u8 d0, d0 \n"
    "vrev64.u8 d1, d1 \n"
    "vld3.u8 {d4-d6}, [%1]! \n"
    "vrev64.u8 d2, d2 \n"
    "vrev64.u8 d4, d4 \n"
    "vst3.u8 {d0-d2}, [%2], r4 \n"
    "vrev64.u8 d5, d5 \n"
    "vrev64.u8 d6, d6 \n"
    "vst3.u8 {d4-d6}, [%2], r4 \n"
    : "cc", "memory", "q0", "q1", "q2", "q3", "r4");

    for (; remain > 0; remain--)

static void kanna_rotate_2_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 4;
    const int wgap = stride + w * 4;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst + w * 4 - 4;

    for (; y < srch; y++)
    int remain = srcw - (nn << 4);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x8x4_t _src = vld4_u8(src0);
    uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);

    _src.val[0] = vrev64_u8(_src.val[0]);
    _src.val[1] = vrev64_u8(_src.val[1]);
    _src.val[2] = vrev64_u8(_src.val[2]);
    _src.val[3] = vrev64_u8(_src.val[3]);

    _src2.val[0] = vrev64_u8(_src2.val[0]);
    _src2.val[1] = vrev64_u8(_src2.val[1]);
    _src2.val[2] = vrev64_u8(_src2.val[2]);
    _src2.val[3] = vrev64_u8(_src2.val[3]);

    vst4_u8(dst0, _src);
    vst4_u8(dst0 - 8 * 4, _src2);

    "vld4.u8 {d0-d3}, [%1]! \n"
    "vrev64.u8 d0, d0 \n"
    "vrev64.u8 d1, d1 \n"
    "vrev64.u8 d2, d2 \n"
    "vld4.u8 {d4-d7}, [%1]! \n"
    "vrev64.u8 d3, d3 \n"
    "vrev64.u8 d4, d4 \n"
    "vrev64.u8 d5, d5 \n"
    "vst4.u8 {d0-d3}, [%2], r4 \n"
    "vrev64.u8 d6, d6 \n"
    "vrev64.u8 d7, d7 \n"
    "vst4.u8 {d4-d7}, [%2], r4 \n"
    : "cc", "memory", "q0", "q1", "q2", "q3", "r4");

    for (; remain > 0; remain--)

static void kanna_rotate_3_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
    const int srcwgap = srcstride - srcw;
    const int wgap = stride - w;

    // point to the last dst pixel
    unsigned char* dstend = dst + stride * h - wgap;
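    // Rotate 180: writing starts at the very last destination pixel and
    // walks backwards, reusing the reversed-store trick of the horizontal
    // mirror while also stepping the rows bottom-up.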
    const unsigned char* src0 = src;
    unsigned char* dst0 = dstend - 1;

    for (; y < srch; y++)
    int remain = srcw - (nn << 4);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x8_t _src = vld1_u8(src0);
    uint8x8_t _src2 = vld1_u8(src0 + 8);

    _src = vrev64_u8(_src);
    _src2 = vrev64_u8(_src2);

    vst1_u8(dst0, _src2);
    vst1_u8(dst0 + 8, _src);

    "vld1.u8 {d0-d1}, [%1]! \n"
    "vrev64.u8 d3, d0 \n"
    "vrev64.u8 d2, d1 \n"
    "vst1.u8 {d2-d3}, [%2], r4 \n"
    : "cc", "memory", "q0", "q1", "r4");

    for (; remain > 0; remain--)

static void kanna_rotate_3_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
    const int srcwgap = srcstride - srcw * 2;
    const int wgap = stride - w * 2;

    // point to the last dst pixel
    unsigned char* dstend = dst + stride * h - wgap;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dstend - 2;

    for (; y < srch; y++)
    int remain = srcw - (nn << 4);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    uint8x8x2_t _src = vld2_u8(src0);
    uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);

    _src.val[0] = vrev64_u8(_src.val[0]);
    _src.val[1] = vrev64_u8(_src.val[1]);

    _src2.val[0] = vrev64_u8(_src2.val[0]);
    _src2.val[1] = vrev64_u8(_src2.val[1]);

    vst2_u8(dst0, _src);
    vst2_u8(dst0 - 8 * 2, _src2);

    "vld2.u8 {d0-d1}, [%1]! \n"
    "vrev64.u8 d0, d0 \n"
    "vld2.u8 {d2-d3}, [%1]! \n"
    "vrev64.u8 d1, d1 \n"
    "vrev64.u8 d2, d2 \n"
    "vst2.u8 {d0-d1}, [%2], r4 \n"
    "vrev64.u8 d3, d3 \n"
    "vst2.u8 {d2-d3}, [%2], r4 \n"
    : "cc", "memory", "q0", "q1", "r4");
#endif // __aarch64__

    for (; remain > 0; remain--)

static void kanna_rotate_3_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
    const int srcwgap = srcstride - srcw * 3;
    const int wgap = stride - w * 3;

    // point to the last dst pixel
    unsigned char* dstend = dst + stride * h - wgap;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dstend - 3;

    for (; y < srch; y++)
    int remain = srcw - (nn << 4);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x8x3_t _src = vld3_u8(src0);
    uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);

    _src.val[0] = vrev64_u8(_src.val[0]);
    _src.val[1] = vrev64_u8(_src.val[1]);
    _src.val[2] = vrev64_u8(_src.val[2]);

    _src2.val[0] = vrev64_u8(_src2.val[0]);
    _src2.val[1] = vrev64_u8(_src2.val[1]);
    _src2.val[2] = vrev64_u8(_src2.val[2]);

    vst3_u8(dst0, _src);
    vst3_u8(dst0 - 8 * 3, _src2);

    "vld3.u8 {d0-d2}, [%1]! \n"
    "vrev64.u8 d0, d0 \n"
    "vrev64.u8 d1, d1 \n"
    "vld3.u8 {d4-d6}, [%1]! \n"
    "vrev64.u8 d2, d2 \n"
    "vrev64.u8 d4, d4 \n"
    "vst3.u8 {d0-d2}, [%2], r4 \n"
    "vrev64.u8 d5, d5 \n"
    "vrev64.u8 d6, d6 \n"
    "vst3.u8 {d4-d6}, [%2], r4 \n"
    : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
#endif // __aarch64__

    for (; remain > 0; remain--)

static void kanna_rotate_3_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
    const int srcwgap = srcstride - srcw * 4;
    const int wgap = stride - w * 4;

    // point to the last dst pixel
    unsigned char* dstend = dst + stride * h - wgap;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dstend - 4;

    for (; y < srch; y++)
    int remain = srcw - (nn << 4);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x8x4_t _src = vld4_u8(src0);
    uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);

    _src.val[0] = vrev64_u8(_src.val[0]);
    _src.val[1] = vrev64_u8(_src.val[1]);
    _src.val[2] = vrev64_u8(_src.val[2]);
    _src.val[3] = vrev64_u8(_src.val[3]);

    _src2.val[0] = vrev64_u8(_src2.val[0]);
    _src2.val[1] = vrev64_u8(_src2.val[1]);
    _src2.val[2] = vrev64_u8(_src2.val[2]);
    _src2.val[3] = vrev64_u8(_src2.val[3]);

    vst4_u8(dst0, _src);
    vst4_u8(dst0 - 8 * 4, _src2);

    "vld4.u8 {d0-d3}, [%1]! \n"
    "vrev64.u8 d0, d0 \n"
    "vrev64.u8 d1, d1 \n"
    "vrev64.u8 d2, d2 \n"
    "vld4.u8 {d4-d7}, [%1]! \n"
    "vrev64.u8 d3, d3 \n"
    "vrev64.u8 d4, d4 \n"
    "vrev64.u8 d5, d5 \n"
    "vst4.u8 {d0-d3}, [%2], r4 \n"
    "vrev64.u8 d6, d6 \n"
    "vrev64.u8 d7, d7 \n"
    "vst4.u8 {d4-d7}, [%2], r4 \n"
    : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
#endif // __aarch64__

    for (; remain > 0; remain--)

static void kanna_rotate_4_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
    const int srcwgap = srcstride - srcw;
    const int wgap = stride + w;

    // point to the last dst pixel row
    unsigned char* dstend = dst + stride * (h - 1);

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dstend;
    unsigned char* dst1 = dstend - stride;
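    // Vertical flip: rows are copied unchanged, but the destination starts
    // at the last row (dstend) and moves upwards, two rows per iteration,
    // which is why dst0/dst1 are decremented by wgap + stride below.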
    for (; y + 1 < srch; y += 2)
    int remain = srcw - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x16_t _src0 = vld1q_u8(src0);
    uint8x16_t _src0n = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src0);
    vst1q_u8(dst0 + 16, _src0n);

    uint8x16_t _src1 = vld1q_u8(src1);
    uint8x16_t _src1n = vld1q_u8(src1 + 16);
    vst1q_u8(dst1, _src1);
    vst1q_u8(dst1 + 16, _src1n);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vld1.u8 {d4-d7}, [%2]! \n"
    "vst1.u8 {d0-d3}, [%3]! \n"
    "vst1.u8 {d4-d7}, [%4]! \n"
    : "cc", "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__

    for (; remain > 0; remain--)

    src0 += srcwgap + srcstride;
    src1 += srcwgap + srcstride;
    dst0 -= wgap + stride;
    dst1 -= wgap + stride;

    for (; y < srch; y++)
    int remain = srcw - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x16_t _src = vld1q_u8(src0);
    uint8x16_t _src2 = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src);
    vst1q_u8(dst0 + 16, _src2);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vst1.u8 {d0-d3}, [%2]! \n"
    : "cc", "memory", "q0", "q1");
#endif // __aarch64__

    for (; remain > 0; remain--)

static void kanna_rotate_4_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
    const int srcwgap = srcstride - srcw * 2;
    const int wgap = stride + w * 2;

    // point to the last dst pixel row
    unsigned char* dstend = dst + stride * (h - 1);

    int size = srcw * 2;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dstend;
    unsigned char* dst1 = dstend - stride;

    for (; y + 1 < srch; y += 2)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x16_t _src0 = vld1q_u8(src0);
    uint8x16_t _src0n = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src0);
    vst1q_u8(dst0 + 16, _src0n);

    uint8x16_t _src1 = vld1q_u8(src1);
    uint8x16_t _src1n = vld1q_u8(src1 + 16);
    vst1q_u8(dst1, _src1);
    vst1q_u8(dst1 + 16, _src1n);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vld1.u8 {d4-d7}, [%2]! \n"
    "vst1.u8 {d0-d3}, [%3]! \n"
    "vst1.u8 {d4-d7}, [%4]! \n"
    : "cc", "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__

    for (; remain > 0; remain--)

    src0 += srcwgap + srcstride;
    src1 += srcwgap + srcstride;
    dst0 -= wgap + stride;
    dst1 -= wgap + stride;

    for (; y < srch; y++)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x16_t _src = vld1q_u8(src0);
    uint8x16_t _src2 = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src);
    vst1q_u8(dst0 + 16, _src2);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vst1.u8 {d0-d3}, [%2]! \n"
    : "cc", "memory", "q0", "q1");
#endif // __aarch64__

    for (; remain > 0; remain--)

static void kanna_rotate_4_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
    const int srcwgap = srcstride - srcw * 3;
    const int wgap = stride + w * 3;

    // point to the last dst pixel row
    unsigned char* dstend = dst + stride * (h - 1);

    int size = srcw * 3;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dstend;
    unsigned char* dst1 = dstend - stride;

    for (; y + 1 < srch; y += 2)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x16_t _src0 = vld1q_u8(src0);
    uint8x16_t _src0n = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src0);
    vst1q_u8(dst0 + 16, _src0n);

    uint8x16_t _src1 = vld1q_u8(src1);
    uint8x16_t _src1n = vld1q_u8(src1 + 16);
    vst1q_u8(dst1, _src1);
    vst1q_u8(dst1 + 16, _src1n);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vld1.u8 {d4-d7}, [%2]! \n"
    "vst1.u8 {d0-d3}, [%3]! \n"
    "vst1.u8 {d4-d7}, [%4]! \n"
    : "cc", "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__

    for (; remain > 0; remain--)

    src0 += srcwgap + srcstride;
    src1 += srcwgap + srcstride;
    dst0 -= wgap + stride;
    dst1 -= wgap + stride;

    for (; y < srch; y++)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x16_t _src = vld1q_u8(src0);
    uint8x16_t _src2 = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src);
    vst1q_u8(dst0 + 16, _src2);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vst1.u8 {d0-d3}, [%2]! \n"
    : "cc", "memory", "q0", "q1");
#endif // __aarch64__

    for (; remain > 0; remain--)

static void kanna_rotate_4_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
    const int srcwgap = srcstride - srcw * 4;
    const int wgap = stride + w * 4;

    // point to the last dst pixel row
    unsigned char* dstend = dst + stride * (h - 1);

    int size = srcw * 4;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dstend;
    unsigned char* dst1 = dstend - stride;

    for (; y + 1 < srch; y += 2)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x16_t _src0 = vld1q_u8(src0);
    uint8x16_t _src0n = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src0);
    vst1q_u8(dst0 + 16, _src0n);

    uint8x16_t _src1 = vld1q_u8(src1);
    uint8x16_t _src1n = vld1q_u8(src1 + 16);
    vst1q_u8(dst1, _src1);
    vst1q_u8(dst1 + 16, _src1n);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vld1.u8 {d4-d7}, [%2]! \n"
    "vst1.u8 {d0-d3}, [%3]! \n"
    "vst1.u8 {d4-d7}, [%4]! \n"
    : "cc", "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__

    for (; remain > 0; remain--)

    src0 += srcwgap + srcstride;
    src1 += srcwgap + srcstride;
    dst0 -= wgap + stride;
    dst1 -= wgap + stride;

    for (; y < srch; y++)
    int remain = size - (nn << 5);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x16_t _src = vld1q_u8(src0);
    uint8x16_t _src2 = vld1q_u8(src0 + 16);
    vst1q_u8(dst0, _src);
    vst1q_u8(dst0 + 16, _src2);

    "vld1.u8 {d0-d3}, [%1]! \n"
    "vst1.u8 {d0-d3}, [%2]! \n"
    : "cc", "memory", "q0", "q1");
#endif // __aarch64__

    for (; remain > 0; remain--)

static void kanna_rotate_5_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)
    const unsigned char* src1 = src0 + srcstride;

    unsigned char* dst0 = dst + y;
    unsigned char* dst1 = dst + y + stride;

    int src_step = 2 * srcstride;
    int dst_step = 2 * stride;
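    // Transpose: 8 source rows are handled per pass. src0/src1 track two
    // consecutive rows, so src_step = 2 * srcstride advances both by a row
    // pair; dst0/dst1 likewise track two output rows at column y, stepped
    // by dst_step = 2 * stride.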
    int remain = srcw - (nn << 3);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x8_t _src0 = vld1_u8(src0);
    uint8x8_t _src1 = vld1_u8(src1);

    uint8x8_t _src2 = vld1_u8(src0 + src_step);
    uint8x8_t _src3 = vld1_u8(src1 + src_step);

    uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
    uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);

    uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
    uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
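    // 8x8 byte transpose in three vtrn stages: vtrn_u8 swaps single bytes
    // between vector pairs (distance 1), vtrn_u16 swaps 2-byte lanes
    // (distance 2) and vtrn_u32 swaps 4-byte lanes (distance 4); composing
    // the three stages moves byte (i, j) to position (j, i).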
    uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
    uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
    uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
    uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);

    uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
    uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
    uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
    uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

    uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
    uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
    uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
    uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

    uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
    uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
    uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
    uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
    uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
    uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
    uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
    uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);

    vst1_u8(dst0, _dst0);
    vst1_u8(dst1, _dst1);
    vst1_u8(dst0 + dst_step, _dst2);
    vst1_u8(dst1 + dst_step, _dst3);
    vst1_u8(dst0 + 2 * dst_step, _dst4);
    vst1_u8(dst1 + 2 * dst_step, _dst5);
    vst1_u8(dst0 + 3 * dst_step, _dst6);
    vst1_u8(dst1 + 3 * dst_step, _dst7);

    dst0 += 4 * dst_step;
    dst1 += 4 * dst_step;

    "vld1.u8 {d0}, [%1], %10 \n"
    "vld1.u8 {d1}, [%2], %10 \n"
    "vld1.u8 {d2}, [%1], %10 \n"
    "vtrn.u8 d0, d1 \n" // _src01t_r
    "vld1.u8 {d3}, [%2], %10 \n"
    "vld1.u8 {d4}, [%1], %10 \n"
    "vtrn.u8 d2, d3 \n" // _src23t_r
    "vld1.u8 {d5}, [%2], %10 \n"
    "vld1.u8 {d6}, [%1], %10 \n"
    "vtrn.u8 d4, d5 \n" // _src45t_r
    "vld1.u8 {d7}, [%2], %10 \n"
    "vtrn.u8 d6, d7 \n" // _src67t_r
    "sub %1, %1, %10, lsl #2 \n" // restore src0
    "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
    "sub %2, %2, %10, lsl #2 \n" // restore src1
    "vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r
    "add %1, #8 \n" // src0 += 8
    "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
    "add %2, #8 \n" // src1 += 8
    "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
    "vst1.u8 {d0}, [%3], %11 \n"
    "vst1.u8 {d1}, [%4], %11 \n"
    "vst1.u8 {d2}, [%3], %11 \n"
    "vst1.u8 {d3}, [%4], %11 \n"
    "vst1.u8 {d4}, [%3], %11 \n"
    "vst1.u8 {d5}, [%4], %11 \n"
    "vst1.u8 {d6}, [%3], %11 \n"
    "vst1.u8 {d7}, [%4], %11 \n"
    "r"(src_step), // %10
    "r"(dst_step) // %11
    : "cc", "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
    for (; remain > 0; remain--)
    dst0[0] = src0[0];
    dst0[1] = src1[0];
    dst0[2] = src0[0 + src_step];
    dst0[3] = src1[0 + src_step];
    dst0[4] = src0[0 + 2 * src_step];
    dst0[5] = src1[0 + 2 * src_step];
    dst0[6] = src0[0 + 3 * src_step];
    dst0[7] = src1[0 + 3 * src_step];

    src0 += srcwgap + 7 * srcstride;
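    // The inner loop left src0 at the end of the first row of the 8-row
    // block; skipping the row padding plus the remaining 7 rows lands at
    // the start of the next block.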
    for (; y < srch; y++)
    unsigned char* dst0 = dst + y;

    for (; x < srcw; x++)
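
// A minimal scalar sketch of the 8x8 tile transpose that the NEON vtrn
// ladder above computes. transpose8x8_u8_ref is a hypothetical helper for
// exposition only; it is not part of ncnn and is not called by this file.
static inline void transpose8x8_u8_ref(const unsigned char* s, int sstep, unsigned char* d, int dstep)
{
    // d(j, i) = s(i, j) over an 8x8 byte tile; sstep/dstep are row strides.
    for (int i = 0; i < 8; i++)
    {
        for (int j = 0; j < 8; j++)
        {
            d[j * dstep + i] = s[i * sstep + j];
        }
    }
}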
static void kanna_rotate_5_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 2;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)
    const unsigned char* src1 = src0 + srcstride;

    unsigned char* dst0 = dst + y * 2;
    unsigned char* dst1 = dst + y * 2 + stride;

    int src_step = 2 * srcstride;
    int dst_step = 2 * stride;

    int remain = srcw - (nn << 3);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x8x2_t _src0 = vld2_u8(src0);
    uint8x8x2_t _src1 = vld2_u8(src1);

    uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
    uint8x8x2_t _src3 = vld2_u8(src1 + src_step);

    uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
    uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);

    uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
    uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);

    uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
    uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
    uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
    uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);

    uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
    uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
    uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
    uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);

    uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
    uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
    uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
    uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

    uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
    uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
    uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
    uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));

    uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
    uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
    uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
    uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

    uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
    uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
    uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
    uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));

    uint8x8x2_t _dst0;
    uint8x8x2_t _dst1;
    uint8x8x2_t _dst2;
    uint8x8x2_t _dst3;
    uint8x8x2_t _dst4;
    uint8x8x2_t _dst5;
    uint8x8x2_t _dst6;
    uint8x8x2_t _dst7;

    _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
    _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
    _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
    _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
    _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
    _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
    _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
    _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);

    _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
    _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
    _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
    _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
    _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
    _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
    _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
    _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);

    vst2_u8(dst0, _dst0);
    vst2_u8(dst1, _dst1);
    vst2_u8(dst0 + dst_step, _dst2);
    vst2_u8(dst1 + dst_step, _dst3);
    vst2_u8(dst0 + 2 * dst_step, _dst4);
    vst2_u8(dst1 + 2 * dst_step, _dst5);
    vst2_u8(dst0 + 3 * dst_step, _dst6);
    vst2_u8(dst1 + 3 * dst_step, _dst7);

    dst0 += 4 * dst_step;
    dst1 += 4 * dst_step;

    "vld2.u8 {d0-d1}, [%1], %10 \n"
    "vld2.u8 {d2-d3}, [%2], %10 \n"
    "vld2.u8 {d4-d5}, [%1], %10 \n"
    "vtrn.u8 q0, q1 \n" // _src01t_r
    "vld2.u8 {d6-d7}, [%2], %10 \n"
    "vld2.u8 {d16-d17}, [%1], %10\n"
    "vtrn.u8 q2, q3 \n" // _src23t_r
    "vld2.u8 {d18-d19}, [%2], %10\n"
    "vld2.u8 {d20-d21}, [%1], %10\n"
    "vtrn.u8 q8, q9 \n" // _src45t_r
    "vld2.u8 {d22-d23}, [%2], %10\n"
    "vtrn.u8 q10, q11 \n" // _src67t_r
    "sub %1, %1, %10, lsl #2 \n" // restore src0
    "vtrn.u16 q0, q2 \n" // _src02tt_r
    "sub %2, %2, %10, lsl #2 \n" // restore src1
    "vtrn.u16 q1, q3 \n" // _src13tt_r
    "add %1, #16 \n" // src0 += 16
    "vtrn.u16 q8, q10 \n" // _src46tt_r
    "add %2, #16 \n" // src1 += 16
    "vtrn.u16 q9, q11 \n" // _src57tt_r
    "vtrn.u32 q0, q8 \n" // _src04ttt_r
    "vtrn.u32 q1, q9 \n" // _src15ttt_r
    "vst2.u8 {d0-d1}, [%3], %11 \n"
    "vtrn.u32 q2, q10 \n" // _src26ttt_r
    "vst2.u8 {d2-d3}, [%4], %11 \n"
    "vtrn.u32 q3, q11 \n" // _src37ttt_r
    "vst2.u8 {d4-d5}, [%3], %11 \n"
    "vst2.u8 {d6-d7}, [%4], %11 \n"
    "vst2.u8 {d16-d17}, [%3], %11\n"
    "vst2.u8 {d18-d19}, [%4], %11\n"
    "vst2.u8 {d20-d21}, [%3], %11\n"
    "vst2.u8 {d22-d23}, [%4], %11\n"
    "r"(src_step), // %10
    "r"(dst_step) // %11
    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
    for (; remain > 0; remain--)
    dst0[0] = src0[0];
    dst0[1] = src0[1];
    dst0[2] = src1[0];
    dst0[3] = src1[1];
    dst0[4] = src0[0 + src_step];
    dst0[5] = src0[1 + src_step];
    dst0[6] = src1[0 + src_step];
    dst0[7] = src1[1 + src_step];
    dst0[8] = src0[0 + 2 * src_step];
    dst0[9] = src0[1 + 2 * src_step];
    dst0[10] = src1[0 + 2 * src_step];
    dst0[11] = src1[1 + 2 * src_step];
    dst0[12] = src0[0 + 3 * src_step];
    dst0[13] = src0[1 + 3 * src_step];
    dst0[14] = src1[0 + 3 * src_step];
    dst0[15] = src1[1 + 3 * src_step];

    src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)
    unsigned char* dst0 = dst + y * 2;

    for (; x < srcw; x++)

static void kanna_rotate_5_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 3;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)
    const unsigned char* src1 = src0 + srcstride;

    unsigned char* dst0 = dst + y * 3;
    unsigned char* dst1 = dst + y * 3 + stride;

    int src_step = 2 * srcstride;
    int dst_step = 2 * stride;

    int remain = srcw - (nn << 3);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x8x3_t _src0 = vld3_u8(src0);
    uint8x8x3_t _src1 = vld3_u8(src1);

    uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
    uint8x8x3_t _src3 = vld3_u8(src1 + src_step);

    uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
    uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);

    uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
    uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);

    uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
    uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
    uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
    uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);

    uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
    uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
    uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
    uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);

    uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
    uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
    uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
    uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);

    uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
    uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
    uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
    uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

    uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
    uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
    uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
    uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));

    uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
    uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
    uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
    uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));

    uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
    uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
    uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
    uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

    uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
    uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
    uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
    uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));

    uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
    uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
    uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
    uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));

    uint8x8x3_t _dst0;
    uint8x8x3_t _dst1;
    uint8x8x3_t _dst2;
    uint8x8x3_t _dst3;
    uint8x8x3_t _dst4;
    uint8x8x3_t _dst5;
    uint8x8x3_t _dst6;
    uint8x8x3_t _dst7;

    _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
    _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
    _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
    _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
    _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
    _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
    _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
    _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);

    _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
    _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
    _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
    _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
    _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
    _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
    _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
    _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);

    _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
    _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
    _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
    _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
    _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
    _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
    _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
    _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);

    vst3_u8(dst0, _dst0);
    vst3_u8(dst1, _dst1);
    vst3_u8(dst0 + dst_step, _dst2);
    vst3_u8(dst1 + dst_step, _dst3);
    vst3_u8(dst0 + 2 * dst_step, _dst4);
    vst3_u8(dst1 + 2 * dst_step, _dst5);
    vst3_u8(dst0 + 3 * dst_step, _dst6);
    vst3_u8(dst1 + 3 * dst_step, _dst7);

    dst0 += 4 * dst_step;
    dst1 += 4 * dst_step;

    "vld3.u8 {d0-d2}, [%1], %10 \n"
    "vld3.u8 {d4-d6}, [%2], %10 \n"
    "vld3.u8 {d8-d10}, [%1], %10 \n"
    "vtrn.u8 q0, q2 \n" // _src01t_r
    "vtrn.u8 d2, d6 \n"
    "vld3.u8 {d12-d14}, [%2], %10\n"
    "vld3.u8 {d16-d18}, [%1], %10\n"
    "vtrn.u8 q4, q6 \n" // _src23t_r
    "vtrn.u8 d10, d14 \n"
    "vld3.u8 {d20-d22}, [%2], %10\n"
    "vld3.u8 {d24-d26}, [%1], %10\n"
    "vtrn.u8 q8, q10 \n" // _src45t_r
    "vtrn.u8 d18, d22 \n"
    "vld3.u8 {d28-d30}, [%2], %10\n"
    "vtrn.u8 q12, q14 \n" // _src67t_r
    "vtrn.u8 d26, d30 \n"
    "sub %1, %1, %10, lsl #2 \n" // restore src0
    "vtrn.u16 q0, q4 \n" // _src02tt_r
    "vtrn.u16 d2, d10 \n"
    "sub %2, %2, %10, lsl #2 \n" // restore src1
    "vtrn.u16 q2, q6 \n" // _src13tt_r
    "vtrn.u16 d6, d14 \n"
    "add %1, #24 \n" // src0 += 24
    "vtrn.u16 q8, q12 \n" // _src46tt_r
    "vtrn.u16 d18, d26 \n"
    "add %2, #24 \n" // src1 += 24
    "vtrn.u16 q10, q14 \n" // _src57tt_r
    "vtrn.u16 d22, d30 \n"
    "vtrn.u32 q0, q8 \n" // _src04ttt_r
    "vtrn.u32 d2, d18 \n"
    "vtrn.u32 q2, q10 \n" // _src15ttt_r
    "vst3.u8 {d0-d2}, [%3], %11 \n"
    "vtrn.u32 d6, d22 \n"
    "vtrn.u32 q4, q12 \n" // _src26ttt_r
    "vst3.u8 {d4-d6}, [%4], %11 \n"
    "vtrn.u32 d10, d26 \n"
    "vtrn.u32 q6, q14 \n" // _src37ttt_r
    "vst3.u8 {d8-d10}, [%3], %11 \n"
    "vtrn.u32 d14, d30 \n"
    "vst3.u8 {d16-d18}, [%3], %11\n"
    "vst3.u8 {d12-d14}, [%4], %11\n"
    "vst3.u8 {d20-d22}, [%4], %11\n"
    "vst3.u8 {d24-d26}, [%3], %11\n"
    "vst3.u8 {d28-d30}, [%4], %11\n"
    "r"(src_step), // %10
    "r"(dst_step) // %11
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
    for (; remain > 0; remain--)
    dst0[0] = src0[0];
    dst0[1] = src0[1];
    dst0[2] = src0[2];
    dst0[3] = src1[0];
    dst0[4] = src1[1];
    dst0[5] = src1[2];
    dst0[6] = src0[0 + src_step];
    dst0[7] = src0[1 + src_step];
    dst0[8] = src0[2 + src_step];
    dst0[9] = src1[0 + src_step];
    dst0[10] = src1[1 + src_step];
    dst0[11] = src1[2 + src_step];
    dst0[12] = src0[0 + 2 * src_step];
    dst0[13] = src0[1 + 2 * src_step];
    dst0[14] = src0[2 + 2 * src_step];
    dst0[15] = src1[0 + 2 * src_step];
    dst0[16] = src1[1 + 2 * src_step];
    dst0[17] = src1[2 + 2 * src_step];
    dst0[18] = src0[0 + 3 * src_step];
    dst0[19] = src0[1 + 3 * src_step];
    dst0[20] = src0[2 + 3 * src_step];
    dst0[21] = src1[0 + 3 * src_step];
    dst0[22] = src1[1 + 3 * src_step];
    dst0[23] = src1[2 + 3 * src_step];

    src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)
    unsigned char* dst0 = dst + y * 3;

    for (; x < srcw; x++)

static void kanna_rotate_5_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw * 4;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)
    const unsigned char* src1 = src0 + srcstride;

    unsigned char* dst0 = dst + y * 4;
    unsigned char* dst1 = dst + y * 4 + stride;

    int src_step = 2 * srcstride;
    int dst_step = 2 * stride;

    int remain = srcw - (nn << 3);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x8x4_t _src0 = vld4_u8(src0);
    uint8x8x4_t _src1 = vld4_u8(src1);

    uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
    uint8x8x4_t _src3 = vld4_u8(src1 + src_step);

    uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
    uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);

    uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
    uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);

    uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
    uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
    uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
    uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);

    uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
    uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
    uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
    uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);

    uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
    uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
    uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
    uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);

    uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
    uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
    uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
    uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);

    uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
    uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
    uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
    uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

    uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
    uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
    uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
    uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));

    uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
    uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
    uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
    uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));

    uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
    uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
    uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
    uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));

    uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
    uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
    uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
    uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

    uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
    uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
    uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
    uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));

    uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
    uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
    uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
    uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));

    uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
    uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
    uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
    uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));

    uint8x8x4_t _dst0;
    uint8x8x4_t _dst1;
    uint8x8x4_t _dst2;
    uint8x8x4_t _dst3;
    uint8x8x4_t _dst4;
    uint8x8x4_t _dst5;
    uint8x8x4_t _dst6;
    uint8x8x4_t _dst7;

    _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
    _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
    _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
    _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
    _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
    _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
    _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
    _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);

    _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
    _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
    _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
    _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
    _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
    _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
    _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
    _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);

    _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
    _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
    _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
    _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
    _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
    _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
    _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
    _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);

    _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
    _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
    _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
    _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
    _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
    _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
    _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
    _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);

    vst4_u8(dst0, _dst0);
    vst4_u8(dst1, _dst1);
    vst4_u8(dst0 + dst_step, _dst2);
    vst4_u8(dst1 + dst_step, _dst3);
    vst4_u8(dst0 + 2 * dst_step, _dst4);
    vst4_u8(dst1 + 2 * dst_step, _dst5);
    vst4_u8(dst0 + 3 * dst_step, _dst6);
    vst4_u8(dst1 + 3 * dst_step, _dst7);

    dst0 += 4 * dst_step;
    dst1 += 4 * dst_step;

    "vld4.u8 {d0-d3}, [%1], %10 \n"
    "vld4.u8 {d4-d7}, [%2], %10 \n"
    "vld4.u8 {d8-d11}, [%1], %10 \n"
    "vtrn.u8 q0, q2 \n" // _src01t_r
    "vtrn.u8 q1, q3 \n"
    "vld4.u8 {d12-d15}, [%2], %10\n"
    "vld4.u8 {d16-d19}, [%1], %10\n"
    "vtrn.u8 q4, q6 \n" // _src23t_r
    "vtrn.u8 q5, q7 \n"
    "vld4.u8 {d20-d23}, [%2], %10\n"
    "vld4.u8 {d24-d27}, [%1], %10\n"
    "vtrn.u8 q8, q10 \n" // _src45t_r
    "vtrn.u8 q9, q11 \n"
    "vld4.u8 {d28-d31}, [%2], %10\n"
    "vtrn.u8 q12, q14 \n" // _src67t_r
    "vtrn.u8 q13, q15 \n"
    "sub %1, %1, %10, lsl #2 \n" // restore src0
    "vtrn.u16 q0, q4 \n" // _src02tt_r
    "vtrn.u16 q1, q5 \n"
    "sub %2, %2, %10, lsl #2 \n" // restore src1
    "vtrn.u16 q2, q6 \n" // _src13tt_r
    "vtrn.u16 q3, q7 \n"
    "add %1, #32 \n" // src0 += 32
    "vtrn.u16 q8, q12 \n" // _src46tt_r
    "vtrn.u16 q9, q13 \n"
    "add %2, #32 \n" // src1 += 32
    "vtrn.u16 q10, q14 \n" // _src57tt_r
    "vtrn.u16 q11, q15 \n"
    "vtrn.u32 q0, q8 \n" // _src04ttt_r
    "vtrn.u32 q1, q9 \n"
    "vtrn.u32 q2, q10 \n" // _src15ttt_r
    "vst4.u8 {d0-d3}, [%3], %11 \n"
    "vtrn.u32 q3, q11 \n"
    "vtrn.u32 q4, q12 \n" // _src26ttt_r
    "vst4.u8 {d4-d7}, [%4], %11 \n"
    "vtrn.u32 q5, q13 \n"
    "vtrn.u32 q6, q14 \n" // _src37ttt_r
    "vst4.u8 {d8-d11}, [%3], %11 \n"
    "vtrn.u32 q7, q15 \n"
    "vst4.u8 {d16-d19}, [%3], %11\n"
    "vst4.u8 {d12-d15}, [%4], %11\n"
    "vst4.u8 {d20-d23}, [%4], %11\n"
    "vst4.u8 {d24-d27}, [%3], %11\n"
    "vst4.u8 {d28-d31}, [%4], %11\n"
    "r"(src_step), // %10
    "r"(dst_step) // %11
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
    for (; remain > 0; remain--)
    dst0[0] = src0[0];
    dst0[1] = src0[1];
    dst0[2] = src0[2];
    dst0[3] = src0[3];
    dst0[4] = src1[0];
    dst0[5] = src1[1];
    dst0[6] = src1[2];
    dst0[7] = src1[3];
    dst0[8] = src0[0 + src_step];
    dst0[9] = src0[1 + src_step];
    dst0[10] = src0[2 + src_step];
    dst0[11] = src0[3 + src_step];
    dst0[12] = src1[0 + src_step];
    dst0[13] = src1[1 + src_step];
    dst0[14] = src1[2 + src_step];
    dst0[15] = src1[3 + src_step];
    dst0[16] = src0[0 + 2 * src_step];
    dst0[17] = src0[1 + 2 * src_step];
    dst0[18] = src0[2 + 2 * src_step];
    dst0[19] = src0[3 + 2 * src_step];
    dst0[20] = src1[0 + 2 * src_step];
    dst0[21] = src1[1 + 2 * src_step];
    dst0[22] = src1[2 + 2 * src_step];
    dst0[23] = src1[3 + 2 * src_step];
    dst0[24] = src0[0 + 3 * src_step];
    dst0[25] = src0[1 + 3 * src_step];
    dst0[26] = src0[2 + 3 * src_step];
    dst0[27] = src0[3 + 3 * src_step];
    dst0[28] = src1[0 + 3 * src_step];
    dst0[29] = src1[1 + 3 * src_step];
    dst0[30] = src1[2 + 3 * src_step];
    dst0[31] = src1[3 + 3 * src_step];

    src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)
    unsigned char* dst0 = dst + y * 4;

    for (; x < srcw; x++)

static void kanna_rotate_6_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
    const int srcwgap = srcstride - srcw;

    // point to the last dst pixel in row
    unsigned char* dstend = dst + w;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)
    const unsigned char* src1 = src0 + srcstride;

    unsigned char* dst0 = dstend - y - 8;
    unsigned char* dst1 = dstend - y - 8 + stride;

    int src_step = 2 * srcstride;
    int dst_step = 2 * stride;

    int remain = srcw - (nn << 3);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
    for (; nn > 0; nn--)
    uint8x8_t _src0 = vld1_u8(src0);
    uint8x8_t _src1 = vld1_u8(src1);

    uint8x8_t _src2 = vld1_u8(src0 + src_step);
    uint8x8_t _src3 = vld1_u8(src1 + src_step);

    uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
    uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);

    uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
    uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
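    // Same vtrn ladder as the plain transpose, but every stage swaps its
    // operand pair and the tile is stored back in reverse (_dst7 first):
    // the transposed rows come out in reversed order, turning the
    // transpose into a 90-degree clockwise rotation.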
    uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
    uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
    uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
    uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);

    uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
    uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
    uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
    uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

    uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
    uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
    uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
    uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

    uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
    uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
    uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
    uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
    uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
    uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
    uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
    uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);

    vst1_u8(dst0, _dst7);
    vst1_u8(dst1, _dst6);
    vst1_u8(dst0 + dst_step, _dst5);
    vst1_u8(dst1 + dst_step, _dst4);
    vst1_u8(dst0 + 2 * dst_step, _dst3);
    vst1_u8(dst1 + 2 * dst_step, _dst2);
    vst1_u8(dst0 + 3 * dst_step, _dst1);
    vst1_u8(dst1 + 3 * dst_step, _dst0);
2848
dst0 += 4 * dst_step;
2849
dst1 += 4 * dst_step;
2857
"vld1.u8 {d0}, [%1], %10 \n"
2860
"vld1.u8 {d1}, [%2], %10 \n"
2863
"vld1.u8 {d2}, [%1], %10 \n"
2865
"vtrn.u8 d1, d0 \n" // _src01t_r
2868
"vld1.u8 {d3}, [%2], %10 \n"
2871
"vld1.u8 {d4}, [%1], %10 \n"
2873
"vtrn.u8 d3, d2 \n" // _src23t_r
2876
"vld1.u8 {d5}, [%2], %10 \n"
2879
"vld1.u8 {d6}, [%1], %10 \n"
2881
"vtrn.u8 d5, d4 \n" // _src45t_r
2884
"vld1.u8 {d7}, [%2], %10 \n"
2886
"vtrn.u8 d7, d6 \n" // _src67t_r
2888
"sub %1, %1, %10, lsl #2 \n" // restore src0
2890
"vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
2892
"sub %2, %2, %10, lsl #2 \n" // restore src1
2894
"vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
2896
"add %1, #8 \n" // src0 += 8
2898
"vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
2900
"add %2, #8 \n" // src1 += 8
2902
"vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
2903
"vst1.u8 {d6}, [%4], %11 \n"
2904
"vst1.u8 {d7}, [%3], %11 \n"
2908
"vst1.u8 {d4}, [%4], %11 \n"
2909
"vst1.u8 {d5}, [%3], %11 \n"
2910
"vst1.u8 {d2}, [%4], %11 \n"
2911
"vst1.u8 {d3}, [%3], %11 \n"
2912
"vst1.u8 {d0}, [%4], %11 \n"
2913
"vst1.u8 {d1}, [%3], %11 \n"
2926
"r"(src_step), // %10
2927
"r"(dst_step) // %11
2928
: "cc", "memory", "q0", "q1", "q2", "q3");
2930
#endif // __aarch64__
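
        // Scalar tail for the last (srcw & 7) columns: src1 is written before
        // src0 so later source rows land further left in the output row, as
        // the clockwise mapping requires.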
        for (; remain > 0; remain--)

            dst0[0] = src1[0 + 3 * src_step];
            dst0[1] = src0[0 + 3 * src_step];
            dst0[2] = src1[0 + 2 * src_step];
            dst0[3] = src0[0 + 2 * src_step];
            dst0[4] = src1[0 + src_step];
            dst0[5] = src0[0 + src_step];

        src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)

        unsigned char* dst0 = dstend - y - 1;

        for (; x < srcw; x++)
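
// 2-channel variant of the clockwise rotation: vld2_u8/vst2_u8 de-interleave
// the two channels into separate 8-byte planes, each plane goes through the
// same vtrn transpose cascade as the 1-channel path, and the planes are
// re-interleaved on store.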
static void kanna_rotate_6_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)

    const int srcwgap = srcstride - srcw * 2;

    // point to the last dst pixel in row
    unsigned char* dstend = dst + w * 2;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)

        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst0 = dstend - y * 2 - 8 * 2;
        unsigned char* dst1 = dstend - y * 2 - 8 * 2 + stride;

        int src_step = 2 * srcstride;
        int dst_step = 2 * stride;

        int remain = srcw - (nn << 3);

#if !NCNN_GNU_INLINE_ASM || __aarch64__
        for (; nn > 0; nn--)

            uint8x8x2_t _src0 = vld2_u8(src0);
            uint8x8x2_t _src1 = vld2_u8(src1);

            uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
            uint8x8x2_t _src3 = vld2_u8(src1 + src_step);

            uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
            uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);

            uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
            uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
            uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
            uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
            uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);

            uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
            uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
            uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
            uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);

            vst2_u8(dst0, _dst7);
            vst2_u8(dst1, _dst6);
            vst2_u8(dst0 + dst_step, _dst5);
            vst2_u8(dst1 + dst_step, _dst4);
            vst2_u8(dst0 + 2 * dst_step, _dst3);
            vst2_u8(dst1 + 2 * dst_step, _dst2);
            vst2_u8(dst0 + 3 * dst_step, _dst1);
            vst2_u8(dst1 + 3 * dst_step, _dst0);

            dst0 += 4 * dst_step;
            dst1 += 4 * dst_step;

            "vld2.u8 {d0-d1}, [%1], %10 \n"
            "vld2.u8 {d2-d3}, [%2], %10 \n"
            "vld2.u8 {d4-d5}, [%1], %10 \n"
            "vtrn.u8 q1, q0 \n" // _src01t_r
            "vld2.u8 {d6-d7}, [%2], %10 \n"
            "vld2.u8 {d16-d17}, [%1], %10\n"
            "vtrn.u8 q3, q2 \n" // _src23t_r
            "vld2.u8 {d18-d19}, [%2], %10\n"
            "vld2.u8 {d20-d21}, [%1], %10\n"
            "vtrn.u8 q9, q8 \n" // _src45t_r
            "vld2.u8 {d22-d23}, [%2], %10\n"
            "vtrn.u8 q11, q10 \n" // _src67t_r
            "sub %1, %1, %10, lsl #2 \n" // restore src0
            "vtrn.u16 q2, q0 \n" // _src02tt_r
            "sub %2, %2, %10, lsl #2 \n" // restore src1
            "vtrn.u16 q3, q1 \n" // _src13tt_r
            "add %1, #16 \n" // src0 += 16
            "vtrn.u16 q10, q8 \n" // _src46tt_r
            "add %2, #16 \n" // src1 += 16
            "vtrn.u16 q11, q9 \n" // _src57tt_r
            "vtrn.u32 q10, q2 \n" // _src26ttt_r
            "vtrn.u32 q11, q3 \n" // _src37ttt_r
            "vst2.u8 {d20-d21}, [%4], %11\n"
            "vtrn.u32 q8, q0 \n" // _src04ttt_r
            "vst2.u8 {d22-d23}, [%3], %11\n"
            "vtrn.u32 q9, q1 \n" // _src15ttt_r
            "vst2.u8 {d16-d17}, [%4], %11\n"
            "vst2.u8 {d18-d19}, [%3], %11\n"
            "vst2.u8 {d4-d5}, [%4], %11 \n"
            "vst2.u8 {d6-d7}, [%3], %11 \n"
            "vst2.u8 {d0-d1}, [%4], %11 \n"
            "vst2.u8 {d2-d3}, [%3], %11 \n"
            "r"(src_step), // %10
            "r"(dst_step)  // %11
            : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
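
        // Scalar tail: both bytes of each 2-channel pixel move together,
        // rows again consumed bottom-up (src1 before src0).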
        for (; remain > 0; remain--)

            dst0[0] = src1[0 + 3 * src_step];
            dst0[1] = src1[1 + 3 * src_step];
            dst0[2] = src0[0 + 3 * src_step];
            dst0[3] = src0[1 + 3 * src_step];
            dst0[4] = src1[0 + 2 * src_step];
            dst0[5] = src1[1 + 2 * src_step];
            dst0[6] = src0[0 + 2 * src_step];
            dst0[7] = src0[1 + 2 * src_step];
            dst0[8] = src1[0 + src_step];
            dst0[9] = src1[1 + src_step];
            dst0[10] = src0[0 + src_step];
            dst0[11] = src0[1 + src_step];

        src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)

        unsigned char* dst0 = dstend - y * 2 - 2;

        for (; x < srcw; x++)
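
// 3-channel variant: vld3_u8/vst3_u8 split each pixel into r/g/b planes and
// the transpose cascade is applied to all three. In the armv7 asm the extra
// d-register vtrn steps handle the third plane, which does not fill a whole
// q-register pair.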
static void kanna_rotate_6_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)

    const int srcwgap = srcstride - srcw * 3;

    // point to the last dst pixel in row
    unsigned char* dstend = dst + w * 3;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)

        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst0 = dstend - y * 3 - 8 * 3;
        unsigned char* dst1 = dstend - y * 3 - 8 * 3 + stride;

        int src_step = 2 * srcstride;
        int dst_step = 2 * stride;

        int remain = srcw - (nn << 3);

#if !NCNN_GNU_INLINE_ASM || __aarch64__
        for (; nn > 0; nn--)

            uint8x8x3_t _src0 = vld3_u8(src0);
            uint8x8x3_t _src1 = vld3_u8(src1);

            uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
            uint8x8x3_t _src3 = vld3_u8(src1 + src_step);

            uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
            uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);

            uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
            uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
            uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
            uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
            uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);

            uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
            uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
            uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
            uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);

            uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
            uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
            uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
            uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));

            uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
            uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
            uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
            uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));

            uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
            uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
            uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
            uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);

            _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
            _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
            _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
            _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
            _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
            _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
            _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
            _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);

            vst3_u8(dst0, _dst7);
            vst3_u8(dst1, _dst6);
            vst3_u8(dst0 + dst_step, _dst5);
            vst3_u8(dst1 + dst_step, _dst4);
            vst3_u8(dst0 + 2 * dst_step, _dst3);
            vst3_u8(dst1 + 2 * dst_step, _dst2);
            vst3_u8(dst0 + 3 * dst_step, _dst1);
            vst3_u8(dst1 + 3 * dst_step, _dst0);

            dst0 += 4 * dst_step;
            dst1 += 4 * dst_step;

            "vld3.u8 {d0-d2}, [%1], %10 \n"
            "vld3.u8 {d4-d6}, [%2], %10 \n"
            "vld3.u8 {d8-d10}, [%1], %10 \n"
            "vtrn.u8 q2, q0 \n" // _src01t_r
            "vld3.u8 {d12-d14}, [%2], %10\n"
            "vld3.u8 {d16-d18}, [%1], %10\n"
            "vtrn.u8 q6, q4 \n" // _src23t_r
            "vtrn.u8 d14, d10 \n"
            "vld3.u8 {d20-d22}, [%2], %10\n"
            "vld3.u8 {d24-d26}, [%1], %10\n"
            "vtrn.u8 q10, q8 \n" // _src45t_r
            "vtrn.u8 d22, d18 \n"
            "vld3.u8 {d28-d30}, [%2], %10\n"
            "vtrn.u8 q14, q12 \n" // _src67t_r
            "vtrn.u8 d30, d26 \n"
            "sub %1, %1, %10, lsl #2 \n" // restore src0
            "vtrn.u16 q4, q0 \n" // _src02tt_r
            "vtrn.u16 d10, d2 \n"
            "sub %2, %2, %10, lsl #2 \n" // restore src1
            "vtrn.u16 q6, q2 \n" // _src13tt_r
            "vtrn.u16 d14, d6 \n"
            "add %1, #24 \n" // src0 += 24
            "vtrn.u16 q12, q8 \n" // _src46tt_r
            "vtrn.u16 d26, d18 \n"
            "add %2, #24 \n" // src1 += 24
            "vtrn.u16 q14, q10 \n" // _src57tt_r
            "vtrn.u16 d30, d22 \n"
            "vtrn.u32 q12, q4 \n" // _src26ttt_r
            "vtrn.u32 d26, d10 \n"
            "vtrn.u32 q14, q6 \n" // _src37ttt_r
            "vst3.u8 {d24-d26}, [%4], %11\n"
            "vtrn.u32 d30, d14 \n"
            "vtrn.u32 q8, q0 \n" // _src04ttt_r
            "vst3.u8 {d28-d30}, [%3], %11\n"
            "vtrn.u32 d18, d2 \n"
            "vtrn.u32 q10, q2 \n" // _src15ttt_r
            "vst3.u8 {d16-d18}, [%4], %11\n"
            "vtrn.u32 d22, d6 \n"
            "vst3.u8 {d20-d22}, [%3], %11\n"
            "vst3.u8 {d8-d10}, [%4], %11 \n"
            "vst3.u8 {d12-d14}, [%3], %11\n"
            "vst3.u8 {d0-d2}, [%4], %11 \n"
            "vst3.u8 {d4-d6}, [%3], %11 \n"
            "r"(src_step), // %10
            "r"(dst_step)  // %11
            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
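
        // Scalar tail: three bytes per pixel, rows consumed bottom-up.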
        for (; remain > 0; remain--)

            dst0[0] = src1[0 + 3 * src_step];
            dst0[1] = src1[1 + 3 * src_step];
            dst0[2] = src1[2 + 3 * src_step];
            dst0[3] = src0[0 + 3 * src_step];
            dst0[4] = src0[1 + 3 * src_step];
            dst0[5] = src0[2 + 3 * src_step];
            dst0[6] = src1[0 + 2 * src_step];
            dst0[7] = src1[1 + 2 * src_step];
            dst0[8] = src1[2 + 2 * src_step];
            dst0[9] = src0[0 + 2 * src_step];
            dst0[10] = src0[1 + 2 * src_step];
            dst0[11] = src0[2 + 2 * src_step];
            dst0[12] = src1[0 + src_step];
            dst0[13] = src1[1 + src_step];
            dst0[14] = src1[2 + src_step];
            dst0[15] = src0[0 + src_step];
            dst0[16] = src0[1 + src_step];
            dst0[17] = src0[2 + src_step];

        src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)

        unsigned char* dst0 = dstend - y * 3 - 3;

        for (; x < srcw; x++)
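
// 4-channel variant: vld4_u8/vst4_u8 handle r/g/b/a planes; four planes fill
// q0-q15 exactly, so the armv7 asm transposes whole q-register pairs.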
static void kanna_rotate_6_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)

    const int srcwgap = srcstride - srcw * 4;

    // point to the last dst pixel in row
    unsigned char* dstend = dst + w * 4;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)

        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst0 = dstend - y * 4 - 8 * 4;
        unsigned char* dst1 = dstend - y * 4 - 8 * 4 + stride;

        int src_step = 2 * srcstride;
        int dst_step = 2 * stride;

        int remain = srcw - (nn << 3);

#if !NCNN_GNU_INLINE_ASM || __aarch64__
        for (; nn > 0; nn--)

            uint8x8x4_t _src0 = vld4_u8(src0);
            uint8x8x4_t _src1 = vld4_u8(src1);

            uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
            uint8x8x4_t _src3 = vld4_u8(src1 + src_step);

            uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
            uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);

            uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
            uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
            uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
            uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
            uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);

            uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
            uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
            uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
            uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);

            uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
            uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
            uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
            uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);

            uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
            uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
            uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
            uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));

            uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
            uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
            uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
            uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));

            uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
            uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
            uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
            uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));

            uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
            uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
            uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
            uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));

            uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
            uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
            uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
            uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);

            _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
            _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
            _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
            _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
            _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
            _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
            _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
            _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);

            _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
            _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
            _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
            _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
            _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
            _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
            _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
            _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);

            vst4_u8(dst0, _dst7);
            vst4_u8(dst1, _dst6);
            vst4_u8(dst0 + dst_step, _dst5);
            vst4_u8(dst1 + dst_step, _dst4);
            vst4_u8(dst0 + 2 * dst_step, _dst3);
            vst4_u8(dst1 + 2 * dst_step, _dst2);
            vst4_u8(dst0 + 3 * dst_step, _dst1);
            vst4_u8(dst1 + 3 * dst_step, _dst0);

            dst0 += 4 * dst_step;
            dst1 += 4 * dst_step;

            "vld4.u8 {d0-d3}, [%1], %10 \n"
            "vld4.u8 {d4-d7}, [%2], %10 \n"
            "vld4.u8 {d8-d11}, [%1], %10 \n"
            "vtrn.u8 q2, q0 \n" // _src01t_r
            "vld4.u8 {d12-d15}, [%2], %10\n"
            "vld4.u8 {d16-d19}, [%1], %10\n"
            "vtrn.u8 q6, q4 \n" // _src23t_r
            "vld4.u8 {d20-d23}, [%2], %10\n"
            "vld4.u8 {d24-d27}, [%1], %10\n"
            "vtrn.u8 q10, q8 \n" // _src45t_r
            "vtrn.u8 q11, q9 \n"
            "vld4.u8 {d28-d31}, [%2], %10\n"
            "vtrn.u8 q14, q12 \n" // _src67t_r
            "vtrn.u8 q15, q13 \n"
            "sub %1, %1, %10, lsl #2 \n" // restore src0
            "vtrn.u16 q4, q0 \n" // _src02tt_r
            "vtrn.u16 q5, q1 \n"
            "sub %2, %2, %10, lsl #2 \n" // restore src1
            "vtrn.u16 q6, q2 \n" // _src13tt_r
            "vtrn.u16 q7, q3 \n"
            "add %1, #32 \n" // src0 += 32
            "vtrn.u16 q12, q8 \n" // _src46tt_r
            "vtrn.u16 q13, q9 \n"
            "add %2, #32 \n" // src1 += 32
            "vtrn.u16 q14, q10 \n" // _src57tt_r
            "vtrn.u16 q15, q11 \n"
            "vtrn.u32 q12, q4 \n" // _src26ttt_r
            "vtrn.u32 q13, q5 \n"
            "vtrn.u32 q14, q6 \n" // _src37ttt_r
            "vst4.u8 {d24-d27}, [%4], %11\n"
            "vtrn.u32 q15, q7 \n"
            "vtrn.u32 q8, q0 \n" // _src04ttt_r
            "vst4.u8 {d28-d31}, [%3], %11\n"
            "vtrn.u32 q9, q1 \n"
            "vtrn.u32 q10, q2 \n" // _src15ttt_r
            "vst4.u8 {d16-d19}, [%4], %11\n"
            "vtrn.u32 q11, q3 \n"
            "vst4.u8 {d8-d11}, [%4], %11 \n"
            "vst4.u8 {d20-d23}, [%3], %11\n"
            "vst4.u8 {d12-d15}, [%3], %11\n"
            "vst4.u8 {d0-d3}, [%4], %11 \n"
            "vst4.u8 {d4-d7}, [%3], %11 \n"
            "r"(src_step), // %10
            "r"(dst_step)  // %11
            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
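
        // Scalar tail: four bytes per pixel, rows consumed bottom-up.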
        for (; remain > 0; remain--)

            dst0[0] = src1[0 + 3 * src_step];
            dst0[1] = src1[1 + 3 * src_step];
            dst0[2] = src1[2 + 3 * src_step];
            dst0[3] = src1[3 + 3 * src_step];
            dst0[4] = src0[0 + 3 * src_step];
            dst0[5] = src0[1 + 3 * src_step];
            dst0[6] = src0[2 + 3 * src_step];
            dst0[7] = src0[3 + 3 * src_step];
            dst0[8] = src1[0 + 2 * src_step];
            dst0[9] = src1[1 + 2 * src_step];
            dst0[10] = src1[2 + 2 * src_step];
            dst0[11] = src1[3 + 2 * src_step];
            dst0[12] = src0[0 + 2 * src_step];
            dst0[13] = src0[1 + 2 * src_step];
            dst0[14] = src0[2 + 2 * src_step];
            dst0[15] = src0[3 + 2 * src_step];
            dst0[16] = src1[0 + src_step];
            dst0[17] = src1[1 + src_step];
            dst0[18] = src1[2 + src_step];
            dst0[19] = src1[3 + src_step];
            dst0[20] = src0[0 + src_step];
            dst0[21] = src0[1 + src_step];
            dst0[22] = src0[2 + src_step];
            dst0[23] = src0[3 + src_step];

        src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)

        unsigned char* dst0 = dstend - y * 4 - 4;

        for (; x < srcw; x++)
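
// kanna_rotate_7_* implement the mirrored rotation: source pixel (x, y)
// moves to destination row h - 1 - x, column w - 1 - y, i.e. a transpose
// followed by a 180-degree rotation (the EXIF "transverse" orientation).
// The tile transpose is identical to kanna_rotate_6_*; the difference is
// that destination rows are walked bottom-up through dst6/dst7 with a
// negative dst_step. A scalar sketch of the mapping for the 1-channel case;
// the helper name is ours, for illustration only, and is not part of ncnn:
static inline void kanna_rotate_7_c1_ref(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    // for this rotation w == srch and h == srcw
    for (int y = 0; y < srch; y++)
        for (int x = 0; x < srcw; x++)
            dst[(h - 1 - x) * stride + (w - 1 - y)] = src[y * srcstride + x];
}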
static void kanna_rotate_7_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)

    const int srcwgap = srcstride - srcw;

    // point to the last dst pixel
    unsigned char* dstend = dst + stride * (h - 1) + w;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)

        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst6 = dstend - y - 8 - stride;
        unsigned char* dst7 = dstend - y - 8;

        int src_step = 2 * srcstride;
        int dst_step = -2 * stride;

        int remain = srcw - (nn << 3);

#if !NCNN_GNU_INLINE_ASM || __aarch64__
        for (; nn > 0; nn--)

            uint8x8_t _src0 = vld1_u8(src0);
            uint8x8_t _src1 = vld1_u8(src1);

            uint8x8_t _src2 = vld1_u8(src0 + src_step);
            uint8x8_t _src3 = vld1_u8(src1 + src_step);

            uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
            uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);

            uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
            uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
            uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
            uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
            uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

            uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
            uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);

            vst1_u8(dst7, _dst7);
            vst1_u8(dst6, _dst6);
            vst1_u8(dst7 + dst_step, _dst5);
            vst1_u8(dst6 + dst_step, _dst4);
            vst1_u8(dst7 + 2 * dst_step, _dst3);
            vst1_u8(dst6 + 2 * dst_step, _dst2);
            vst1_u8(dst7 + 3 * dst_step, _dst1);
            vst1_u8(dst6 + 3 * dst_step, _dst0);

            dst7 += 4 * dst_step;
            dst6 += 4 * dst_step;

            "vld1.u8 {d0}, [%1], %10 \n"
            "vld1.u8 {d1}, [%2], %10 \n"
            "vld1.u8 {d2}, [%1], %10 \n"
            "vtrn.u8 d1, d0 \n" // _src01t_r
            "vld1.u8 {d3}, [%2], %10 \n"
            "vld1.u8 {d4}, [%1], %10 \n"
            "vtrn.u8 d3, d2 \n" // _src23t_r
            "vld1.u8 {d5}, [%2], %10 \n"
            "vld1.u8 {d6}, [%1], %10 \n"
            "vtrn.u8 d5, d4 \n" // _src45t_r
            "vld1.u8 {d7}, [%2], %10 \n"
            "vtrn.u8 d7, d6 \n" // _src67t_r
            "sub %1, %1, %10, lsl #2 \n" // restore src0
            "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
            "sub %2, %2, %10, lsl #2 \n" // restore src1
            "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
            "add %1, #8 \n" // src0 += 8
            "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
            "add %2, #8 \n" // src1 += 8
            "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
            "vst1.u8 {d6}, [%4], %11 \n"
            "vst1.u8 {d7}, [%3], %11 \n"
            "vst1.u8 {d4}, [%4], %11 \n"
            "vst1.u8 {d5}, [%3], %11 \n"
            "vst1.u8 {d2}, [%4], %11 \n"
            "vst1.u8 {d3}, [%3], %11 \n"
            "vst1.u8 {d0}, [%4], %11 \n"
            "vst1.u8 {d1}, [%3], %11 \n"
            "r"(src_step), // %10
            "r"(dst_step)  // %11
            : "cc", "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
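
        // Scalar tail: one output row segment per leftover column, rows
        // consumed bottom-up (src1 before src0), addressed from dst7 since
        // destination rows run bottom-to-top here.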
        for (; remain > 0; remain--)

            dst7[0] = src1[0 + 3 * src_step];
            dst7[1] = src0[0 + 3 * src_step];
            dst7[2] = src1[0 + 2 * src_step];
            dst7[3] = src0[0 + 2 * src_step];
            dst7[4] = src1[0 + src_step];
            dst7[5] = src0[0 + src_step];

        src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)

        unsigned char* dst0 = dstend - y - 1;

        for (; x < srcw; x++)
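
// 2-channel variant of the transverse path: vld2_u8/vst2_u8 per-plane
// transpose, with stores walking upward via the negative dst_step.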
static void kanna_rotate_7_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)

    const int srcwgap = srcstride - srcw * 2;

    // point to the last dst pixel
    unsigned char* dstend = dst + stride * (h - 1) + w * 2;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)

        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst6 = dstend - y * 2 - 8 * 2 - stride;
        unsigned char* dst7 = dstend - y * 2 - 8 * 2;

        int src_step = 2 * srcstride;
        int dst_step = -2 * stride;

        int remain = srcw - (nn << 3);

#if !NCNN_GNU_INLINE_ASM || __aarch64__
        for (; nn > 0; nn--)

            uint8x8x2_t _src0 = vld2_u8(src0);
            uint8x8x2_t _src1 = vld2_u8(src1);

            uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
            uint8x8x2_t _src3 = vld2_u8(src1 + src_step);

            uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
            uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);

            uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
            uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
            uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
            uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
            uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);

            uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
            uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
            uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
            uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);

            vst2_u8(dst7, _dst7);
            vst2_u8(dst6, _dst6);
            vst2_u8(dst7 + dst_step, _dst5);
            vst2_u8(dst6 + dst_step, _dst4);
            vst2_u8(dst7 + 2 * dst_step, _dst3);
            vst2_u8(dst6 + 2 * dst_step, _dst2);
            vst2_u8(dst7 + 3 * dst_step, _dst1);
            vst2_u8(dst6 + 3 * dst_step, _dst0);

            dst7 += 4 * dst_step;
            dst6 += 4 * dst_step;

            "vld2.u8 {d0-d1}, [%1], %10 \n"
            "vld2.u8 {d2-d3}, [%2], %10 \n"
            "vld2.u8 {d4-d5}, [%1], %10 \n"
            "vtrn.u8 q1, q0 \n" // _src01t_r
            "vld2.u8 {d6-d7}, [%2], %10 \n"
            "vld2.u8 {d16-d17}, [%1], %10\n"
            "vtrn.u8 q3, q2 \n" // _src23t_r
            "vld2.u8 {d18-d19}, [%2], %10\n"
            "vld2.u8 {d20-d21}, [%1], %10\n"
            "vtrn.u8 q9, q8 \n" // _src45t_r
            "vld2.u8 {d22-d23}, [%2], %10\n"
            "vtrn.u8 q11, q10 \n" // _src67t_r
            "sub %1, %1, %10, lsl #2 \n" // restore src0
            "vtrn.u16 q2, q0 \n" // _src02tt_r
            "sub %2, %2, %10, lsl #2 \n" // restore src1
            "vtrn.u16 q3, q1 \n" // _src13tt_r
            "add %1, #16 \n" // src0 += 16
            "vtrn.u16 q10, q8 \n" // _src46tt_r
            "add %2, #16 \n" // src1 += 16
            "vtrn.u16 q11, q9 \n" // _src57tt_r
            "vtrn.u32 q10, q2 \n" // _src26ttt_r
            "vtrn.u32 q11, q3 \n" // _src37ttt_r
            "vst2.u8 {d20-d21}, [%4], %11\n"
            "vtrn.u32 q8, q0 \n" // _src04ttt_r
            "vst2.u8 {d22-d23}, [%3], %11\n"
            "vtrn.u32 q9, q1 \n" // _src15ttt_r
            "vst2.u8 {d16-d17}, [%4], %11\n"
            "vst2.u8 {d4-d5}, [%4], %11 \n"
            "vst2.u8 {d18-d19}, [%3], %11\n"
            "vst2.u8 {d6-d7}, [%3], %11 \n"
            "vst2.u8 {d0-d1}, [%4], %11 \n"
            "vst2.u8 {d2-d3}, [%3], %11 \n"
            "r"(src_step), // %10
            "r"(dst_step)  // %11
            : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
        for (; remain > 0; remain--)

            dst7[0] = src1[0 + 3 * src_step];
            dst7[1] = src1[1 + 3 * src_step];
            dst7[2] = src0[0 + 3 * src_step];
            dst7[3] = src0[1 + 3 * src_step];
            dst7[4] = src1[0 + 2 * src_step];
            dst7[5] = src1[1 + 2 * src_step];
            dst7[6] = src0[0 + 2 * src_step];
            dst7[7] = src0[1 + 2 * src_step];
            dst7[8] = src1[0 + src_step];
            dst7[9] = src1[1 + src_step];
            dst7[10] = src0[0 + src_step];
            dst7[11] = src0[1 + src_step];

        src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)

        unsigned char* dst0 = dstend - y * 2 - 2;

        for (; x < srcw; x++)
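
// 3-channel variant of the transverse path, mirroring kanna_rotate_6_c3.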
static void kanna_rotate_7_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)

    const int srcwgap = srcstride - srcw * 3;

    // point to the last dst pixel
    unsigned char* dstend = dst + stride * (h - 1) + w * 3;

    const unsigned char* src0 = src;

    for (; y + 7 < srch; y += 8)

        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst6 = dstend - y * 3 - 8 * 3 - stride;
        unsigned char* dst7 = dstend - y * 3 - 8 * 3;

        int src_step = 2 * srcstride;
        int dst_step = -2 * stride;

        int remain = srcw - (nn << 3);

#if !NCNN_GNU_INLINE_ASM || __aarch64__
        for (; nn > 0; nn--)

            uint8x8x3_t _src0 = vld3_u8(src0);
            uint8x8x3_t _src1 = vld3_u8(src1);

            uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
            uint8x8x3_t _src3 = vld3_u8(src1 + src_step);

            uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
            uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);

            uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
            uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
            uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
            uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
            uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);

            uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
            uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
            uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
            uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);

            uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
            uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
            uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
            uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));

            uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
            uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
            uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
            uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));

            uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
            uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
            uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
            uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);

            _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
            _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
            _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
            _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
            _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
            _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
            _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
            _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);

            vst3_u8(dst7, _dst7);
            vst3_u8(dst6, _dst6);
            vst3_u8(dst7 + dst_step, _dst5);
            vst3_u8(dst6 + dst_step, _dst4);
            vst3_u8(dst7 + 2 * dst_step, _dst3);
            vst3_u8(dst6 + 2 * dst_step, _dst2);
            vst3_u8(dst7 + 3 * dst_step, _dst1);
            vst3_u8(dst6 + 3 * dst_step, _dst0);

            dst7 += 4 * dst_step;
            dst6 += 4 * dst_step;

            "vld3.u8 {d0-d2}, [%1], %10 \n"
            "vld3.u8 {d4-d6}, [%2], %10 \n"
            "vld3.u8 {d8-d10}, [%1], %10 \n"
            "vtrn.u8 q2, q0 \n" // _src01t_r
            "vld3.u8 {d12-d14}, [%2], %10\n"
            "vld3.u8 {d16-d18}, [%1], %10\n"
            "vtrn.u8 q6, q4 \n" // _src23t_r
            "vtrn.u8 d14, d10 \n"
            "vld3.u8 {d20-d22}, [%2], %10\n"
            "vld3.u8 {d24-d26}, [%1], %10\n"
            "vtrn.u8 q10, q8 \n" // _src45t_r
            "vtrn.u8 d22, d18 \n"
            "vld3.u8 {d28-d30}, [%2], %10\n"
            "vtrn.u8 q14, q12 \n" // _src67t_r
            "vtrn.u8 d30, d26 \n"
            "sub %1, %1, %10, lsl #2 \n" // restore src0
            "vtrn.u16 q4, q0 \n" // _src02tt_r
            "vtrn.u16 d10, d2 \n"
            "sub %2, %2, %10, lsl #2 \n" // restore src1
            "vtrn.u16 q6, q2 \n" // _src13tt_r
            "vtrn.u16 d14, d6 \n"
            "add %1, #24 \n" // src0 += 24
            "vtrn.u16 q12, q8 \n" // _src46tt_r
            "vtrn.u16 d26, d18 \n"
            "add %2, #24 \n" // src1 += 24
            "vtrn.u16 q14, q10 \n" // _src57tt_r
            "vtrn.u16 d30, d22 \n"
            "vtrn.u32 q12, q4 \n" // _src26ttt_r
            "vtrn.u32 d26, d10 \n"
            "vtrn.u32 q14, q6 \n" // _src37ttt_r
            "vst3.u8 {d24-d26}, [%4], %11\n"
            "vtrn.u32 d30, d14 \n"
            "vtrn.u32 q8, q0 \n" // _src04ttt_r
            "vst3.u8 {d28-d30}, [%3], %11\n"
            "vtrn.u32 d18, d2 \n"
            "vtrn.u32 q10, q2 \n" // _src15ttt_r
            "vst3.u8 {d16-d18}, [%4], %11\n"
            "vtrn.u32 d22, d6 \n"
            "vst3.u8 {d8-d10}, [%4], %11 \n"
            "vst3.u8 {d20-d22}, [%3], %11\n"
            "vst3.u8 {d12-d14}, [%3], %11\n"
            "vst3.u8 {d0-d2}, [%4], %11 \n"
            "vst3.u8 {d4-d6}, [%3], %11 \n"
            "r"(src_step), // %10
            "r"(dst_step)  // %11
            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
        for (; remain > 0; remain--)

            dst7[0] = src1[0 + 3 * src_step];
            dst7[1] = src1[1 + 3 * src_step];
            dst7[2] = src1[2 + 3 * src_step];
            dst7[3] = src0[0 + 3 * src_step];
            dst7[4] = src0[1 + 3 * src_step];
            dst7[5] = src0[2 + 3 * src_step];
            dst7[6] = src1[0 + 2 * src_step];
            dst7[7] = src1[1 + 2 * src_step];
            dst7[8] = src1[2 + 2 * src_step];
            dst7[9] = src0[0 + 2 * src_step];
            dst7[10] = src0[1 + 2 * src_step];
            dst7[11] = src0[2 + 2 * src_step];
            dst7[12] = src1[0 + src_step];
            dst7[13] = src1[1 + src_step];
            dst7[14] = src1[2 + src_step];
            dst7[15] = src0[0 + src_step];
            dst7[16] = src0[1 + src_step];
            dst7[17] = src0[2 + src_step];

        src0 += srcwgap + 7 * srcstride;

    for (; y < srch; y++)

        unsigned char* dst0 = dstend - y * 3 - 3;

        for (; x < srcw; x++)
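
// 4-channel variant of the transverse path, mirroring kanna_rotate_6_c4.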
static void kanna_rotate_7_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
4558
const int srcwgap = srcstride - srcw * 4;
4560
// point to the last dst pixel
4561
unsigned char* dstend = dst + stride * (h - 1) + w * 4;
4563
const unsigned char* src0 = src;
4567
for (; y + 7 < srch; y += 8)
4569
const unsigned char* src1 = src0 + srcstride;
4571
unsigned char* dst6 = dstend - y * 4 - 8 * 4 - stride;
4572
unsigned char* dst7 = dstend - y * 4 - 8 * 4;
4574
int src_step = 2 * srcstride;
4575
int dst_step = -2 * stride;
4578
int remain = srcw - (nn << 3);
4580
#if !NCNN_GNU_INLINE_ASM || __aarch64__
4581
for (; nn > 0; nn--)
4583
uint8x8x4_t _src0 = vld4_u8(src0);
4584
uint8x8x4_t _src1 = vld4_u8(src1);
4586
uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
4587
uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
4589
uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
4590
uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
4592
uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
4593
uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
4595
uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
4596
uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
4597
uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
4598
uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
4600
uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
4601
uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
4602
uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
4603
uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
4605
uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
4606
uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
4607
uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
4608
uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
4610
uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
4611
uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
4612
uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
4613
uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);
4615
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
4616
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
4617
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
4618
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
4620
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
4621
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
4622
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
4623
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
4625
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
4626
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
4627
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
4628
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
4630
uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
4631
uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
4632
uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
4633
uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));
4635
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
4636
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
4637
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
4638
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
4640
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
4641
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
4642
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
4643
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
4645
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
4646
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
4647
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
4648
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
4650
uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
4651
uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
4652
uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
4653
uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
_dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
_dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
_dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
_dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
_dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
_dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
_dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
_dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
vst4_u8(dst7, _dst7);
vst4_u8(dst6, _dst6);
vst4_u8(dst7 + dst_step, _dst5);
vst4_u8(dst6 + dst_step, _dst4);
vst4_u8(dst7 + 2 * dst_step, _dst3);
vst4_u8(dst6 + 2 * dst_step, _dst2);
vst4_u8(dst7 + 3 * dst_step, _dst1);
vst4_u8(dst6 + 3 * dst_step, _dst0);
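// Note the ordering in this path: val[1] is picked before val[0] and
// _dst7 .. _dst0 are stored first-to-last, i.e. the transposed rows are
// emitted in reverse order compared with the kanna_rotate_8_* paths below.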
dst7 += 4 * dst_step;
dst6 += 4 * dst_step;
"vld4.u8 {d0-d3}, [%1], %10 \n"
"vld4.u8 {d4-d7}, [%2], %10 \n"
"vld4.u8 {d8-d11}, [%1], %10 \n"
"vtrn.u8 q2, q0 \n" // _src01t_r
"vld4.u8 {d12-d15}, [%2], %10\n"
"vld4.u8 {d16-d19}, [%1], %10\n"
"vtrn.u8 q6, q4 \n" // _src23t_r
"vld4.u8 {d20-d23}, [%2], %10\n"
"vld4.u8 {d24-d27}, [%1], %10\n"
"vtrn.u8 q10, q8 \n" // _src45t_r
"vtrn.u8 q11, q9 \n"
"vld4.u8 {d28-d31}, [%2], %10\n"
"vtrn.u8 q14, q12 \n" // _src67t_r
"vtrn.u8 q15, q13 \n"
"sub %1, %1, %10, lsl #2 \n" // restore src0
"vtrn.u16 q4, q0 \n" // _src02tt_r
"vtrn.u16 q5, q1 \n"
"sub %2, %2, %10, lsl #2 \n" // restore src1
"vtrn.u16 q6, q2 \n" // _src13tt_r
"vtrn.u16 q7, q3 \n"
"add %1, #32 \n" // src0 += 32
"vtrn.u16 q12, q8 \n" // _src46tt_r
"vtrn.u16 q13, q9 \n"
"add %2, #32 \n" // src1 += 32
"vtrn.u16 q14, q10 \n" // _src57tt_r
"vtrn.u16 q15, q11 \n"
"vtrn.u32 q12, q4 \n" // _src26ttt_r
"vtrn.u32 q13, q5 \n"
"vtrn.u32 q14, q6 \n" // _src37ttt_r
"vst4.u8 {d24-d27}, [%4], %11\n"
"vtrn.u32 q15, q7 \n"
"vtrn.u32 q8, q0 \n" // _src04ttt_r
"vst4.u8 {d28-d31}, [%3], %11\n"
"vtrn.u32 q9, q1 \n"
"vtrn.u32 q10, q2 \n" // _src15ttt_r
"vst4.u8 {d16-d19}, [%4], %11\n"
"vtrn.u32 q11, q3 \n"
"vst4.u8 {d8-d11}, [%4], %11 \n"
"vst4.u8 {d20-d23}, [%3], %11\n"
"vst4.u8 {d12-d15}, [%3], %11\n"
"vst4.u8 {d0-d3}, [%4], %11 \n"
"vst4.u8 {d4-d7}, [%3], %11 \n"
"r"(src_step), // %10
"r"(dst_step) // %11
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
for (; remain > 0; remain--)
dst7[0] = src1[0 + 3 * src_step];
dst7[1] = src1[1 + 3 * src_step];
dst7[2] = src1[2 + 3 * src_step];
dst7[3] = src1[3 + 3 * src_step];
dst7[4] = src0[0 + 3 * src_step];
dst7[5] = src0[1 + 3 * src_step];
dst7[6] = src0[2 + 3 * src_step];
dst7[7] = src0[3 + 3 * src_step];
dst7[8] = src1[0 + 2 * src_step];
dst7[9] = src1[1 + 2 * src_step];
dst7[10] = src1[2 + 2 * src_step];
dst7[11] = src1[3 + 2 * src_step];
dst7[12] = src0[0 + 2 * src_step];
dst7[13] = src0[1 + 2 * src_step];
dst7[14] = src0[2 + 2 * src_step];
dst7[15] = src0[3 + 2 * src_step];
dst7[16] = src1[0 + src_step];
dst7[17] = src1[1 + src_step];
dst7[18] = src1[2 + src_step];
dst7[19] = src1[3 + src_step];
dst7[20] = src0[0 + src_step];
dst7[21] = src0[1 + src_step];
dst7[22] = src0[2 + src_step];
dst7[23] = src0[3 + src_step];
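// Scalar tail: each iteration gathers one 4-byte pixel from each of the
// eight source rows, farthest row first, and packs them as consecutive
// RGBA bytes of a single output row.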
src0 += srcwgap + 7 * srcstride;
for (; y < srch; y++)
unsigned char* dst0 = dstend - y * 4 - 4;
for (; x < srcw; x++)
static void kanna_rotate_8_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
const int srcwgap = srcstride - srcw;
// point to the last dst pixel row
unsigned char* dstend = dst + stride * (h - 1);
const unsigned char* src0 = src;
for (; y + 7 < srch; y += 8)
const unsigned char* src1 = src0 + srcstride;
unsigned char* dst7 = dstend + y;
unsigned char* dst6 = dstend + y - stride;
int src_step = 2 * srcstride;
int dst_step = -2 * stride;
int remain = srcw - (nn << 3);
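// Eight source rows are processed per pass: src0/src1 walk forward two
// rows at a time via src_step, while dst7/dst6 start at the bottom of
// the output (dstend) and move upward via the negative dst_step. nn
// counts full blocks of 8 pixels; remain is the leftover column count.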
#if !NCNN_GNU_INLINE_ASM || __aarch64__
for (; nn > 0; nn--)
uint8x8_t _src0 = vld1_u8(src0);
uint8x8_t _src1 = vld1_u8(src1);
uint8x8_t _src2 = vld1_u8(src0 + src_step);
uint8x8_t _src3 = vld1_u8(src1 + src_step);
uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
vst1_u8(dst7, _dst0);
vst1_u8(dst6, _dst1);
vst1_u8(dst7 + dst_step, _dst2);
vst1_u8(dst6 + dst_step, _dst3);
vst1_u8(dst7 + 2 * dst_step, _dst4);
vst1_u8(dst6 + 2 * dst_step, _dst5);
vst1_u8(dst7 + 3 * dst_step, _dst6);
vst1_u8(dst6 + 3 * dst_step, _dst7);
dst7 += 4 * dst_step;
dst6 += 4 * dst_step;
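// After the transpose each _dstN holds one 8-pixel output row; the
// stores alternate between dst7 and dst6 so two rows are written per
// dst_step, eight rows per block in total.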
"vld1.u8 {d0}, [%1], %10 \n"
"vld1.u8 {d1}, [%2], %10 \n"
"vld1.u8 {d2}, [%1], %10 \n"
"vtrn.u8 d0, d1 \n" // _src01t_r
"vld1.u8 {d3}, [%2], %10 \n"
"vld1.u8 {d4}, [%1], %10 \n"
"vtrn.u8 d2, d3 \n" // _src23t_r
"vld1.u8 {d5}, [%2], %10 \n"
"vld1.u8 {d6}, [%1], %10 \n"
"vtrn.u8 d4, d5 \n" // _src45t_r
"vld1.u8 {d7}, [%2], %10 \n"
"vtrn.u8 d6, d7 \n" // _src67t_r
"sub %1, %1, %10, lsl #2 \n" // restore src0
"vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
"sub %2, %2, %10, lsl #2 \n" // restore src1
"vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r
"add %1, #8 \n" // src0 += 8
"vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
"add %2, #8 \n" // src1 += 8
"vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
"vst1.u8 {d0}, [%3], %11 \n"
"vst1.u8 {d1}, [%4], %11 \n"
"vst1.u8 {d2}, [%3], %11 \n"
"vst1.u8 {d3}, [%4], %11 \n"
"vst1.u8 {d4}, [%3], %11 \n"
"vst1.u8 {d5}, [%4], %11 \n"
"vst1.u8 {d6}, [%3], %11 \n"
"vst1.u8 {d7}, [%4], %11 \n"
"r"(src_step), // %10
"r"(dst_step) // %11
: "cc", "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
for (; remain > 0; remain--)
dst7[2] = src0[0 + src_step];
dst7[3] = src1[0 + src_step];
dst7[4] = src0[0 + 2 * src_step];
dst7[5] = src1[0 + 2 * src_step];
dst7[6] = src0[0 + 3 * src_step];
dst7[7] = src1[0 + 3 * src_step];
src0 += srcwgap + 7 * srcstride;
for (; y < srch; y++)
unsigned char* dst0 = dstend + y;
for (; x < srcw; x++)
static void kanna_rotate_8_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
const int srcwgap = srcstride - srcw * 2;
// point to the last dst pixel row
unsigned char* dstend = dst + stride * (h - 1);
const unsigned char* src0 = src;
for (; y + 7 < srch; y += 8)
const unsigned char* src1 = src0 + srcstride;
unsigned char* dst7 = dstend + y * 2;
unsigned char* dst6 = dstend + y * 2 - stride;
int src_step = 2 * srcstride;
int dst_step = -2 * stride;
int remain = srcw - (nn << 3);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
for (; nn > 0; nn--)
uint8x8x2_t _src0 = vld2_u8(src0);
uint8x8x2_t _src1 = vld2_u8(src1);
uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
vst2_u8(dst7, _dst0);
vst2_u8(dst6, _dst1);
vst2_u8(dst7 + dst_step, _dst2);
vst2_u8(dst6 + dst_step, _dst3);
vst2_u8(dst7 + 2 * dst_step, _dst4);
vst2_u8(dst6 + 2 * dst_step, _dst5);
vst2_u8(dst7 + 3 * dst_step, _dst6);
vst2_u8(dst6 + 3 * dst_step, _dst7);
dst7 += 4 * dst_step;
dst6 += 4 * dst_step;
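// For two interleaved channels, vld2_u8 deinterleaves each row into two
// planes, the 8x8 transpose runs on each plane independently, and
// vst2_u8 re-interleaves the result on store.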
"vld2.u8 {d0-d1}, [%1], %10 \n"
"vld2.u8 {d2-d3}, [%2], %10 \n"
"vld2.u8 {d4-d5}, [%1], %10 \n"
"vtrn.u8 q0, q1 \n" // _src01t_r
"vld2.u8 {d6-d7}, [%2], %10 \n"
"vld2.u8 {d16-d17}, [%1], %10\n"
"vtrn.u8 q2, q3 \n" // _src23t_r
"vld2.u8 {d18-d19}, [%2], %10\n"
"vld2.u8 {d20-d21}, [%1], %10\n"
"vtrn.u8 q8, q9 \n" // _src45t_r
"vld2.u8 {d22-d23}, [%2], %10\n"
"vtrn.u8 q10, q11 \n" // _src67t_r
"sub %1, %1, %10, lsl #2 \n" // restore src0
"vtrn.u16 q0, q2 \n" // _src02tt_r
"sub %2, %2, %10, lsl #2 \n" // restore src1
"vtrn.u16 q1, q3 \n" // _src13tt_r
"add %1, #16 \n" // src0 += 16
"vtrn.u16 q8, q10 \n" // _src46tt_r
"add %2, #16 \n" // src1 += 16
"vtrn.u16 q9, q11 \n" // _src57tt_r
"vtrn.u32 q0, q8 \n" // _src04ttt_r
"vtrn.u32 q1, q9 \n" // _src15ttt_r
"vst2.u8 {d0-d1}, [%3], %11 \n"
"vtrn.u32 q2, q10 \n" // _src26ttt_r
"vst2.u8 {d2-d3}, [%4], %11 \n"
"vtrn.u32 q3, q11 \n" // _src37ttt_r
"vst2.u8 {d4-d5}, [%3], %11 \n"
"vst2.u8 {d16-d17}, [%3], %11\n"
"vst2.u8 {d6-d7}, [%4], %11 \n"
"vst2.u8 {d18-d19}, [%4], %11\n"
"vst2.u8 {d20-d21}, [%3], %11\n"
"vst2.u8 {d22-d23}, [%4], %11\n"
"r"(src_step), // %10
"r"(dst_step) // %11
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
for (; remain > 0; remain--)
dst7[4] = src0[0 + src_step];
dst7[5] = src0[1 + src_step];
dst7[6] = src1[0 + src_step];
dst7[7] = src1[1 + src_step];
dst7[8] = src0[0 + 2 * src_step];
dst7[9] = src0[1 + 2 * src_step];
dst7[10] = src1[0 + 2 * src_step];
dst7[11] = src1[1 + 2 * src_step];
dst7[12] = src0[0 + 3 * src_step];
dst7[13] = src0[1 + 3 * src_step];
dst7[14] = src1[0 + 3 * src_step];
dst7[15] = src1[1 + 3 * src_step];
src0 += srcwgap + 7 * srcstride;
for (; y < srch; y++)
unsigned char* dst0 = dstend + y * 2;
for (; x < srcw; x++)
static void kanna_rotate_8_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
const int srcwgap = srcstride - srcw * 3;
// point to the last dst pixel row
unsigned char* dstend = dst + stride * (h - 1);
const unsigned char* src0 = src;
for (; y + 7 < srch; y += 8)
const unsigned char* src1 = src0 + srcstride;
unsigned char* dst7 = dstend + y * 3;
unsigned char* dst6 = dstend + y * 3 - stride;
int src_step = 2 * srcstride;
int dst_step = -2 * stride;
int remain = srcw - (nn << 3);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
for (; nn > 0; nn--)
uint8x8x3_t _src0 = vld3_u8(src0);
uint8x8x3_t _src1 = vld3_u8(src1);
uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
vst3_u8(dst7, _dst0);
vst3_u8(dst6, _dst1);
vst3_u8(dst7 + dst_step, _dst2);
vst3_u8(dst6 + dst_step, _dst3);
vst3_u8(dst7 + 2 * dst_step, _dst4);
vst3_u8(dst6 + 2 * dst_step, _dst5);
vst3_u8(dst7 + 3 * dst_step, _dst6);
vst3_u8(dst6 + 3 * dst_step, _dst7);
dst7 += 4 * dst_step;
dst6 += 4 * dst_step;
"vld3.u8 {d0-d2}, [%1], %10 \n"
"vld3.u8 {d4-d6}, [%2], %10 \n"
"vld3.u8 {d8-d10}, [%1], %10 \n"
"vtrn.u8 q0, q2 \n" // _src01t_r
"vld3.u8 {d12-d14}, [%2], %10\n"
"vld3.u8 {d16-d18}, [%1], %10\n"
"vtrn.u8 q4, q6 \n" // _src23t_r
"vtrn.u8 d10, d14 \n"
"vld3.u8 {d20-d22}, [%2], %10\n"
"vld3.u8 {d24-d26}, [%1], %10\n"
"vtrn.u8 q8, q10 \n" // _src45t_r
"vtrn.u8 d18, d22 \n"
"vld3.u8 {d28-d30}, [%2], %10\n"
"vtrn.u8 q12, q14 \n" // _src67t_r
"vtrn.u8 d26, d30 \n"
"sub %1, %1, %10, lsl #2 \n" // restore src0
"vtrn.u16 q0, q4 \n" // _src02tt_r
"vtrn.u16 d2, d10 \n"
"sub %2, %2, %10, lsl #2 \n" // restore src1
"vtrn.u16 q2, q6 \n" // _src13tt_r
"vtrn.u16 d6, d14 \n"
"add %1, #24 \n" // src0 += 24
"vtrn.u16 q8, q12 \n" // _src46tt_r
"vtrn.u16 d18, d26 \n"
"add %2, #24 \n" // src1 += 24
"vtrn.u16 q10, q14 \n" // _src57tt_r
"vtrn.u16 d22, d30 \n"
"vtrn.u32 q0, q8 \n" // _src04ttt_r
"vtrn.u32 d2, d18 \n"
"vtrn.u32 q2, q10 \n" // _src15ttt_r
"vst3.u8 {d0-d2}, [%3], %11 \n"
"vtrn.u32 d6, d22 \n"
"vtrn.u32 q4, q12 \n" // _src26ttt_r
"vst3.u8 {d4-d6}, [%4], %11 \n"
"vtrn.u32 d10, d26 \n"
"vtrn.u32 q6, q14 \n" // _src37ttt_r
"vst3.u8 {d8-d10}, [%3], %11 \n"
"vtrn.u32 d14, d30 \n"
"vst3.u8 {d16-d18}, [%3], %11\n"
"vst3.u8 {d12-d14}, [%4], %11\n"
"vst3.u8 {d20-d22}, [%4], %11\n"
"vst3.u8 {d24-d26}, [%3], %11\n"
"vst3.u8 {d28-d30}, [%4], %11\n"
"r"(src_step), // %10
"r"(dst_step) // %11
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
for (; remain > 0; remain--)
dst7[6] = src0[0 + src_step];
dst7[7] = src0[1 + src_step];
dst7[8] = src0[2 + src_step];
dst7[9] = src1[0 + src_step];
dst7[10] = src1[1 + src_step];
dst7[11] = src1[2 + src_step];
dst7[12] = src0[0 + 2 * src_step];
dst7[13] = src0[1 + 2 * src_step];
dst7[14] = src0[2 + 2 * src_step];
dst7[15] = src1[0 + 2 * src_step];
dst7[16] = src1[1 + 2 * src_step];
dst7[17] = src1[2 + 2 * src_step];
dst7[18] = src0[0 + 3 * src_step];
dst7[19] = src0[1 + 3 * src_step];
dst7[20] = src0[2 + 3 * src_step];
dst7[21] = src1[0 + 3 * src_step];
dst7[22] = src1[1 + 3 * src_step];
dst7[23] = src1[2 + 3 * src_step];
src0 += srcwgap + 7 * srcstride;
for (; y < srch; y++)
unsigned char* dst0 = dstend + y * 3;
for (; x < srcw; x++)
static void kanna_rotate_8_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
const int srcwgap = srcstride - srcw * 4;
// point to the last dst pixel row
unsigned char* dstend = dst + stride * (h - 1);
const unsigned char* src0 = src;
for (; y + 7 < srch; y += 8)
const unsigned char* src1 = src0 + srcstride;
unsigned char* dst7 = dstend + y * 4;
unsigned char* dst6 = dstend + y * 4 - stride;
int src_step = 2 * srcstride;
int dst_step = -2 * stride;
int remain = srcw - (nn << 3);
#if !NCNN_GNU_INLINE_ASM || __aarch64__
for (; nn > 0; nn--)
uint8x8x4_t _src0 = vld4_u8(src0);
uint8x8x4_t _src1 = vld4_u8(src1);
uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
_dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
_dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
_dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
_dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
_dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
_dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
_dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
_dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
vst4_u8(dst7, _dst0);
vst4_u8(dst6, _dst1);
vst4_u8(dst7 + dst_step, _dst2);
vst4_u8(dst6 + dst_step, _dst3);
vst4_u8(dst7 + 2 * dst_step, _dst4);
vst4_u8(dst6 + 2 * dst_step, _dst5);
vst4_u8(dst7 + 3 * dst_step, _dst6);
vst4_u8(dst6 + 3 * dst_step, _dst7);
dst7 += 4 * dst_step;
dst6 += 4 * dst_step;
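// Four channels map cleanly onto the register file: each vld4_u8 yields
// one uint8x8x4_t per row, all four planes go through the same
// u8 -> u16 -> u32 vtrn ladder, and vst4_u8 re-interleaves them.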
"vld4.u8 {d0-d3}, [%1], %10 \n"
"vld4.u8 {d4-d7}, [%2], %10 \n"
"vld4.u8 {d8-d11}, [%1], %10 \n"
"vtrn.u8 q0, q2 \n" // _src01t_r
"vld4.u8 {d12-d15}, [%2], %10\n"
"vld4.u8 {d16-d19}, [%1], %10\n"
"vtrn.u8 q4, q6 \n" // _src23t_r
"vld4.u8 {d20-d23}, [%2], %10\n"
"vld4.u8 {d24-d27}, [%1], %10\n"
"vtrn.u8 q8, q10 \n" // _src45t_r
"vtrn.u8 q9, q11 \n"
"vld4.u8 {d28-d31}, [%2], %10\n"
"vtrn.u8 q12, q14 \n" // _src67t_r
"vtrn.u8 q13, q15 \n"
"sub %1, %1, %10, lsl #2 \n" // restore src0
"vtrn.u16 q0, q4 \n" // _src02tt_r
"vtrn.u16 q1, q5 \n"
"sub %2, %2, %10, lsl #2 \n" // restore src1
"vtrn.u16 q2, q6 \n" // _src13tt_r
"vtrn.u16 q3, q7 \n"
"add %1, #32 \n" // src0 += 32
"vtrn.u16 q8, q12 \n" // _src46tt_r
"vtrn.u16 q9, q13 \n"
"add %2, #32 \n" // src1 += 32
"vtrn.u16 q10, q14 \n" // _src57tt_r
"vtrn.u16 q11, q15 \n"
"vtrn.u32 q0, q8 \n" // _src04ttt_r
"vtrn.u32 q1, q9 \n"
"vtrn.u32 q2, q10 \n" // _src15ttt_r
"vst4.u8 {d0-d3}, [%3], %11 \n"
"vtrn.u32 q3, q11 \n"
"vtrn.u32 q4, q12 \n" // _src26ttt_r
"vst4.u8 {d4-d7}, [%4], %11 \n"
"vtrn.u32 q5, q13 \n"
"vtrn.u32 q6, q14 \n" // _src37ttt_r
"vst4.u8 {d8-d11}, [%3], %11 \n"
"vtrn.u32 q7, q15 \n"
"vst4.u8 {d16-d19}, [%3], %11\n"
"vst4.u8 {d12-d15}, [%4], %11\n"
"vst4.u8 {d20-d23}, [%4], %11\n"
"vst4.u8 {d24-d27}, [%3], %11\n"
"vst4.u8 {d28-d31}, [%4], %11\n"
"r"(src_step), // %10
"r"(dst_step) // %11
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
for (; remain > 0; remain--)
dst7[8] = src0[0 + src_step];
dst7[9] = src0[1 + src_step];
dst7[10] = src0[2 + src_step];
dst7[11] = src0[3 + src_step];
dst7[12] = src1[0 + src_step];
dst7[13] = src1[1 + src_step];
dst7[14] = src1[2 + src_step];
dst7[15] = src1[3 + src_step];
dst7[16] = src0[0 + 2 * src_step];
dst7[17] = src0[1 + 2 * src_step];
dst7[18] = src0[2 + 2 * src_step];
dst7[19] = src0[3 + 2 * src_step];
dst7[20] = src1[0 + 2 * src_step];
dst7[21] = src1[1 + 2 * src_step];
dst7[22] = src1[2 + 2 * src_step];
dst7[23] = src1[3 + 2 * src_step];
dst7[24] = src0[0 + 3 * src_step];
dst7[25] = src0[1 + 3 * src_step];
dst7[26] = src0[2 + 3 * src_step];
dst7[27] = src0[3 + 3 * src_step];
dst7[28] = src1[0 + 3 * src_step];
dst7[29] = src1[1 + 3 * src_step];
dst7[30] = src1[2 + 3 * src_step];
dst7[31] = src1[3 + 3 * src_step];
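// One remainder iteration emits a full 32-byte output row: eight RGBA
// pixels gathered column-wise from src0/src1 and their stepped rows.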
src0 += srcwgap + 7 * srcstride;
for (; y < srch; y++)
unsigned char* dst0 = dstend + y * 4;
for (; x < srcw; x++)
void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
return kanna_rotate_c1(src, srcw, srch, srcw, dst, w, h, w, type);
void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
return kanna_rotate_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, type);
void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
return kanna_rotate_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, type);
void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
return kanna_rotate_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, type);
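// The convenience overloads above assume tightly packed rows
// (stride = width * channels). Illustrative use, rotating a packed RGB
// image 90 degrees clockwise -- assuming type follows the EXIF
// orientation convention (6 = 90 degrees clockwise); the buffer names
// below are hypothetical:
//
//   std::vector<unsigned char> rotated(srch * srcw * 3);
//   // for types 5-8 the output dimensions are swapped: w = srch, h = srcw
//   kanna_rotate_c3(rgb, srcw, srch, rotated.data(), srch, srcw, 6);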
void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
// assert srcw == w && srch == h for type 1234
// assert srcw == h && srch == w for type 5678
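// types 1-4 map pixels without transposing, so output dimensions match
// the source; types 5-8 involve a transpose, so the caller-provided dst
// dimensions must be swapped relative to the source.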
kanna_rotate_1_c1(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_2_c1(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_3_c1(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_4_c1(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_5_c1(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_6_c1(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_7_c1(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_8_c1(src, srcw, srch, srcstride, dst, w, h, stride);
// unsupported rotate type
void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
// assert srcw == w && srch == h for type 1234
// assert srcw == h && srch == w for type 5678
kanna_rotate_1_c2(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_2_c2(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_3_c2(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_4_c2(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_5_c2(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_6_c2(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_7_c2(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_8_c2(src, srcw, srch, srcstride, dst, w, h, stride);
// unsupported rotate type
void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
// assert srcw == w && srch == h for type 1234
// assert srcw == h && srch == w for type 5678
kanna_rotate_1_c3(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_2_c3(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_3_c3(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_4_c3(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_5_c3(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_6_c3(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_7_c3(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_8_c3(src, srcw, srch, srcstride, dst, w, h, stride);
// unsupported rotate type
void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
// assert srcw == w && srch == h for type 1234
// assert srcw == h && srch == w for type 5678
kanna_rotate_1_c4(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_2_c4(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_3_c4(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_4_c4(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_5_c4(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_6_c4(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_7_c4(src, srcw, srch, srcstride, dst, w, h, stride);
kanna_rotate_8_c4(src, srcw, srch, srcstride, dst, w, h, stride);
// unsupported rotate type
void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
// assert srcw % 2 == 0
// assert srch % 2 == 0
// assert w % 2 == 0
// assert h % 2 == 0
const unsigned char* srcY = src;
unsigned char* dstY = dst;
kanna_rotate_c1(srcY, srcw, srch, dstY, w, h, type);
const unsigned char* srcUV = src + srcw * srch;
unsigned char* dstUV = dst + w * h;
kanna_rotate_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, type);
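// YUV420sp (NV12/NV21) stores a full-resolution Y plane followed by a
// half-resolution plane of interleaved UV pairs, so the Y plane rotates
// as 1-channel data and the UV plane as 2-channel data, which keeps each
// U/V pair together. Illustrative use (buffer names are hypothetical):
//
//   // a yuv420sp image occupies w * h * 3 / 2 bytes in total
//   std::vector<unsigned char> dst_nv21(srcw * srch * 3 / 2);
//   kanna_rotate_yuv420sp(src_nv21, srcw, srch, dst_nv21.data(), srch, srcw, 6);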
#endif // NCNN_PIXEL_ROTATE