// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
26
// Build the 2x3 affine matrix for a rotation of `angle` degrees around the
// center (dx, dy), uniformly scaled by `scale`.  The result is written
// row-major into tm[6]:
//   [ tm[0] tm[1] tm[2] ]
//   [ tm[3] tm[4] tm[5] ]
// Matches OpenCV's getRotationMatrix2D convention.
void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm)
{
    angle *= (float)(3.14159265358979323846 / 180); // degrees -> radians

    float alpha = cosf(angle) * scale;
    float beta = sinf(angle) * scale;

    tm[0] = alpha;
    tm[1] = beta;
    tm[2] = (1.f - alpha) * dx - beta * dy;
    tm[3] = -beta;
    tm[4] = alpha;
    tm[5] = beta * dx + (1.f - alpha) * dy;
}
40
// Solve, in the least-squares sense, for the 2x3 similarity transform that
// maps the `num_point` (x, y) pairs in `points_from` onto `points_to`,
// writing the result into tm[6].  A 4x4 normal-equation system is built and
// inverted explicitly via cofactors.
//
// NOTE(review): this chunk is a garbled extraction — the bare numeric lines
// below are line-number artifacts, and several statements are missing from
// this view (braces, the declarations of mb/det/mai/mm, the per-point
// pointer advances, the det reciprocal, the remaining tm[] writes).
// Code is kept byte-identical; comments only.
void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm)
42
float ma[4][4] = {{0.f}};
46
// accumulate the normal equations over all point pairs
for (int i = 0; i < num_point; i++)
48
ma[0][0] += points_from[0] * points_from[0] + points_from[1] * points_from[1];
49
ma[0][2] += points_from[0];
50
ma[0][3] += points_from[1];
52
// mb is presumably a float[4] right-hand side declared in a missing line — TODO confirm
mb[0] += points_from[0] * points_to[0] + points_from[1] * points_to[1];
53
mb[1] += points_from[0] * points_to[1] - points_from[1] * points_to[0];
54
mb[2] += points_to[0];
55
mb[3] += points_to[1];
62
// mirror the accumulated sums into the (mostly symmetric) 4x4 system matrix
ma[2][1] = ma[1][2] = -ma[0][3];
63
ma[3][1] = ma[1][3] = ma[2][0] = ma[0][2];
64
ma[2][2] = ma[3][3] = (float)num_point;
68
// matrix 4x4 invert by https://github.com/willnode/N-Matrix-Programmer
69
// suppose the user provide valid points combination
70
// I have not taken det == zero into account here :> --- nihui
76
// 2x2 minors (A{rc}{rc} naming from the generator) used by the cofactors below
float A2323 = ma[2][2] * ma[3][3] - ma[2][3] * ma[3][2];
77
float A1323 = ma[2][1] * ma[3][3] - ma[2][3] * ma[3][1];
78
float A1223 = ma[2][1] * ma[3][2] - ma[2][2] * ma[3][1];
79
float A0323 = ma[2][0] * ma[3][3] - ma[2][3] * ma[3][0];
80
float A0223 = ma[2][0] * ma[3][2] - ma[2][2] * ma[3][0];
81
float A0123 = ma[2][0] * ma[3][1] - ma[2][1] * ma[3][0];
82
float A2313 = ma[1][2] * ma[3][3] - ma[1][3] * ma[3][2];
83
float A1313 = ma[1][1] * ma[3][3] - ma[1][3] * ma[3][1];
84
float A1213 = ma[1][1] * ma[3][2] - ma[1][2] * ma[3][1];
85
float A2312 = ma[1][2] * ma[2][3] - ma[1][3] * ma[2][2];
86
float A1312 = ma[1][1] * ma[2][3] - ma[1][3] * ma[2][1];
87
float A1212 = ma[1][1] * ma[2][2] - ma[1][2] * ma[2][1];
88
float A0313 = ma[1][0] * ma[3][3] - ma[1][3] * ma[3][0];
89
float A0213 = ma[1][0] * ma[3][2] - ma[1][2] * ma[3][0];
90
float A0312 = ma[1][0] * ma[2][3] - ma[1][3] * ma[2][0];
91
float A0212 = ma[1][0] * ma[2][2] - ma[1][2] * ma[2][0];
92
float A0113 = ma[1][0] * ma[3][1] - ma[1][1] * ma[3][0];
93
float A0112 = ma[1][0] * ma[2][1] - ma[1][1] * ma[2][0];
95
// determinant by cofactor expansion along row 0
// (det is presumably declared, and later replaced by its reciprocal, in missing lines)
det = ma[0][0] * (ma[1][1] * A2323 - ma[1][2] * A1323 + ma[1][3] * A1223)
96
- ma[0][1] * (ma[1][0] * A2323 - ma[1][2] * A0323 + ma[1][3] * A0223)
97
+ ma[0][2] * (ma[1][0] * A1323 - ma[1][1] * A0323 + ma[1][3] * A0123)
98
- ma[0][3] * (ma[1][0] * A1223 - ma[1][1] * A0223 + ma[1][2] * A0123);
102
// adjugate entries; mai is presumably a float[4][4] declared in a missing line
mai[0][0] = (ma[1][1] * A2323 - ma[1][2] * A1323 + ma[1][3] * A1223);
103
mai[0][1] = - (ma[0][1] * A2323 - ma[0][2] * A1323 + ma[0][3] * A1223);
104
mai[0][2] = (ma[0][1] * A2313 - ma[0][2] * A1313 + ma[0][3] * A1213);
105
mai[0][3] = - (ma[0][1] * A2312 - ma[0][2] * A1312 + ma[0][3] * A1212);
106
mai[1][0] = - (ma[1][0] * A2323 - ma[1][2] * A0323 + ma[1][3] * A0223);
107
mai[1][1] = (ma[0][0] * A2323 - ma[0][2] * A0323 + ma[0][3] * A0223);
108
mai[1][2] = - (ma[0][0] * A2313 - ma[0][2] * A0313 + ma[0][3] * A0213);
109
mai[1][3] = (ma[0][0] * A2312 - ma[0][2] * A0312 + ma[0][3] * A0212);
110
mai[2][0] = (ma[1][0] * A1323 - ma[1][1] * A0323 + ma[1][3] * A0123);
111
mai[2][1] = - (ma[0][0] * A1323 - ma[0][1] * A0323 + ma[0][3] * A0123);
112
mai[2][2] = (ma[0][0] * A1313 - ma[0][1] * A0313 + ma[0][3] * A0113);
113
mai[2][3] = - (ma[0][0] * A1312 - ma[0][1] * A0312 + ma[0][3] * A0112);
114
mai[3][0] = - (ma[1][0] * A1223 - ma[1][1] * A0223 + ma[1][2] * A0123);
115
mai[3][1] = (ma[0][0] * A1223 - ma[0][1] * A0223 + ma[0][2] * A0123);
116
mai[3][2] = - (ma[0][0] * A1213 - ma[0][1] * A0213 + ma[0][2] * A0113);
117
mai[3][3] = (ma[0][0] * A1212 - ma[0][1] * A0212 + ma[0][2] * A0112);
122
// mm = inverse(ma) * mb; here det appears to already hold 1/determinant — TODO confirm
mm[0] = det * (mai[0][0] * mb[0] + mai[0][1] * mb[1] + mai[0][2] * mb[2] + mai[0][3] * mb[3]);
123
mm[1] = det * (mai[1][0] * mb[0] + mai[1][1] * mb[1] + mai[1][2] * mb[2] + mai[1][3] * mb[3]);
124
mm[2] = det * (mai[2][0] * mb[0] + mai[2][1] * mb[1] + mai[2][2] * mb[2] + mai[2][3] * mb[3]);
125
mm[3] = det * (mai[3][0] * mb[0] + mai[3][1] * mb[1] + mai[3][2] * mb[2] + mai[3][3] * mb[3]);
127
// scatter the solved coefficients into the 2x3 output
// (the remaining tm[1]/tm[2]/tm[3]/tm[5] writes are missing from this view)
tm[0] = tm[4] = mm[0];
134
// Compute the inverse of the 2x3 affine transform `tm` into `tm_inv`
// (both row-major [a b tx; c d ty]).  A singular input yields an all-zero
// inverse rather than inf/nan, mirroring OpenCV's invertAffineTransform.
void invert_affine_transform(const float* tm, float* tm_inv)
{
    float D = tm[0] * tm[4] - tm[1] * tm[3];
    D = D != 0.f ? 1.f / D : 0.f; // guard against a singular linear part

    // inverse of the 2x2 linear part
    float A11 = tm[4] * D;
    float A22 = tm[0] * D;
    float A12 = -tm[1] * D;
    float A21 = -tm[3] * D;
    // inverse translation = -A_inv * t
    float b1 = -A11 * tm[2] - A12 * tm[5];
    float b2 = -A21 * tm[2] - A22 * tm[5];

    tm_inv[0] = A11;
    tm_inv[1] = A12;
    tm_inv[2] = b1;
    tm_inv[3] = A21;
    tm_inv[4] = A22;
    tm_inv[5] = b2;
}
154
void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
156
return warpaffine_bilinear_c1(src, srcw, srch, srcw, dst, w, h, w, tm, type, v);
159
void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
161
return warpaffine_bilinear_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, tm, type, v);
164
void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
166
return warpaffine_bilinear_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, tm, type, v);
169
void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
171
return warpaffine_bilinear_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, tm, type, v);
174
// Bilinear-interpolation affine warp for single-channel images with explicit
// source/destination strides.  Sample coordinates use fixed point with 10
// fractional bits.  type == -233 selects "transparent" border handling
// (out-of-range pixels keep the existing dst value); otherwise out-of-range
// pixels are filled with the packed border color `v`.
//
// NOTE(review): this chunk is a garbled extraction — the bare numeric lines
// are line-number artifacts, and many structural lines are missing from
// this view (braces, the y loop, #if __ARM_NEON guards, the sxy_inout
// classification, the sx1/sy1 and alpha1/beta1 declarations, pointer
// advances).  Code is kept byte-identical; comments only.
void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type, unsigned int v)
176
const unsigned char* border_color = (const unsigned char*)&v;
177
const int wgap = stride - w;
179
const unsigned char* src0 = src;
180
unsigned char* dst0 = dst;
182
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
183
#define SATURATE_CAST_INT(X) (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)
185
// per-column fixed-point (10-bit) increments along the transform's x column
std::vector<int> adelta(w);
186
std::vector<int> bdelta(w);
187
for (int x = 0; x < w; x++)
189
adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
190
bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
196
// per-row fixed-point base coordinates (the enclosing y loop is missing from this view)
int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
197
int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));
200
for (; x + 7 < w; x += 8)
204
// classify this 8-pixel run by its first and last sample positions
int X_0 = X0 + adelta[x];
205
int Y_0 = Y0 + bdelta[x];
206
int X_7 = X0 + adelta[x + 7];
207
int Y_7 = Y0 + bdelta[x + 7];
209
short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
210
short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
211
short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
212
short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));
214
// all 8 samples fully inside the source -> interpolation fast path
if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
219
// all 8 samples fully outside -> border fill path
else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
230
int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
231
int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
232
int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
233
int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));
235
int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
236
int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
237
int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
238
int16x4_t _syh = vqshrn_n_s32(_Yh, 10);
240
uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
241
uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
242
uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));
244
uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
245
uint16x8_t _alpha1 = _fx;
246
uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
247
uint16x8_t _beta1 = _fy;
249
int16x4_t _srcstride = vdup_n_s16(srcstride);
251
// byte offsets of the top-left (a) and bottom-left (b) taps for each lane
int32x4_t _a0l = vaddw_s16(vmull_s16(_srcstride, _syl), _sxl);
252
int32x4_t _a0h = vaddw_s16(vmull_s16(_srcstride, _syh), _sxh);
253
int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
254
int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);
256
uint8x8x2_t _a0a1 = uint8x8x2_t();
257
uint8x8x2_t _b0b1 = uint8x8x2_t();
259
// gather the 8 bilinear neighborhoods lane by lane (adjacent-pixel pairs)
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 0), _a0a1, 0);
260
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 0), _b0b1, 0);
262
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 1), _a0a1, 1);
263
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 1), _b0b1, 1);
265
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 2), _a0a1, 2);
266
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 2), _b0b1, 2);
268
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 3), _a0a1, 3);
269
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 3), _b0b1, 3);
271
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 0), _a0a1, 4);
272
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 0), _b0b1, 4);
274
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 1), _a0a1, 5);
275
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 1), _b0b1, 5);
277
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 2), _a0a1, 6);
278
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 2), _b0b1, 6);
280
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 3), _a0a1, 7);
281
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 3), _b0b1, 7);
284
uint16x8_t _a0_0 = vmovl_u8(_a0a1.val[0]);
285
uint16x8_t _a1_0 = vmovl_u8(_a0a1.val[1]);
286
uint16x8_t _b0_0 = vmovl_u8(_b0b1.val[0]);
287
uint16x8_t _b1_0 = vmovl_u8(_b0b1.val[1]);
289
// horizontal lerp; >>5 keeps the intermediates within 16 bits
uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
290
uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
291
uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
292
uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
294
// vertical lerp and renormalize (total right shift 5 + 15 = 2 x 10 bits)
uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
295
uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
297
uint8x8_t _dst = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
303
// scalar fallback of the same all-inside fast path
for (int xi = 0; xi < 8; xi++)
305
int X = X0 + adelta[x + xi];
306
int Y = Y0 + bdelta[x + xi];
308
short sx = SATURATE_CAST_SHORT((X >> 10));
309
short sy = SATURATE_CAST_SHORT((Y >> 10));
311
short fx = X & ((1 << 10) - 1);
312
short fy = Y & ((1 << 10) - 1);
314
short alpha0 = (1 << 10) - fx;
317
short beta0 = (1 << 10) - fy;
320
const unsigned char* a0 = src0 + srcstride * sy + sx;
321
const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
322
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
323
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
325
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
331
// whole run outside the source: flood with the border color
else if (sxy_inout == 2)
337
uint8x8_t _border_color = vdup_n_u8(border_color[0]);
338
vst1_u8(dst0, _border_color);
340
for (int xi = 0; xi < 8; xi++)
342
dst0[xi] = border_color[0];
353
else // if (sxy_inout == 0)
355
// mixed run: per-pixel border tests
for (int xi = 0; xi < 8; xi++)
357
int X = X0 + adelta[x + xi];
358
int Y = Y0 + bdelta[x + xi];
360
short sx = SATURATE_CAST_SHORT((X >> 10));
361
short sy = SATURATE_CAST_SHORT((Y >> 10));
363
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
365
dst0[0] = border_color[0];
367
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
373
short fx = X & ((1 << 10) - 1);
374
short fy = Y & ((1 << 10) - 1);
376
short alpha0 = (1 << 10) - fx;
379
short beta0 = (1 << 10) - fy;
385
const unsigned char* a0 = src0 + srcstride * sy + sx;
386
const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
387
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
388
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
390
// redirect any of the 4 taps falling outside the source
// (sx1/sy1 are presumably sx + 1 / sy + 1, declared in missing lines — TODO confirm)
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
392
a0 = type != -233 ? border_color : dst0;
394
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
396
a1 = type != -233 ? border_color : dst0;
398
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
400
b0 = type != -233 ? border_color : dst0;
402
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
404
b1 = type != -233 ? border_color : dst0;
407
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
416
// remainder loop: one pixel at a time
int X = X0 + adelta[x];
417
int Y = Y0 + bdelta[x];
419
short sx = SATURATE_CAST_SHORT((X >> 10));
420
short sy = SATURATE_CAST_SHORT((Y >> 10));
422
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
424
dst0[0] = border_color[0];
426
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
432
short fx = X & ((1 << 10) - 1);
433
short fy = Y & ((1 << 10) - 1);
435
short alpha0 = (1 << 10) - fx;
438
short beta0 = (1 << 10) - fy;
444
const unsigned char* a0 = src0 + srcstride * sy + sx;
445
const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
446
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
447
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
449
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
451
a0 = type != -233 ? border_color : dst0;
453
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
455
a1 = type != -233 ? border_color : dst0;
457
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
459
b0 = type != -233 ? border_color : dst0;
461
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
463
b1 = type != -233 ? border_color : dst0;
466
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
475
#undef SATURATE_CAST_SHORT
476
#undef SATURATE_CAST_INT
479
// Bilinear-interpolation affine warp for 2-channel images with explicit
// strides.  Same fixed-point scheme and border semantics as the c1 version
// above; pixels are 2 bytes wide, so x offsets are scaled by 2 and the NEON
// path gathers 2x2-pixel (4-byte) groups with vld4_lane_u8.
//
// NOTE(review): this chunk is a garbled extraction — the bare numeric lines
// are line-number artifacts, and many structural lines are missing from
// this view (braces, the y loop, #if __ARM_NEON guards, the sxy_inout
// classification, the sx1/sy1 and alpha1/beta1 declarations, the _dst
// declaration, pointer advances).  Code is kept byte-identical; comments only.
void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type, unsigned int v)
481
const unsigned char* border_color = (const unsigned char*)&v;
482
const int wgap = stride - w * 2;
484
const unsigned char* src0 = src;
485
unsigned char* dst0 = dst;
487
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
488
#define SATURATE_CAST_INT(X) (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)
490
std::vector<int> adelta(w);
491
std::vector<int> bdelta(w);
492
for (int x = 0; x < w; x++)
494
adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
495
bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
501
int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
502
int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));
505
for (; x + 7 < w; x += 8)
509
int X_0 = X0 + adelta[x];
510
int Y_0 = Y0 + bdelta[x];
511
int X_7 = X0 + adelta[x + 7];
512
int Y_7 = Y0 + bdelta[x + 7];
514
short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
515
short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
516
short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
517
short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));
519
// all 8 samples fully inside the source -> interpolation fast path
if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
524
else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
535
int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
536
int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
537
int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
538
int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));
540
int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
541
int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
542
int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
543
int16x4_t _syh = vqshrn_n_s32(_Yh, 10);
545
uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
546
uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
547
uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));
549
uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
550
uint16x8_t _alpha1 = _fx;
551
uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
552
uint16x8_t _beta1 = _fy;
554
int16x4_t _srcstride = vdup_n_s16(srcstride);
555
int16x4_t _v2 = vdup_n_s16(2);
557
// byte offsets: x scaled by 2 bytes/pixel for the 2-channel layout
int32x4_t _a0l = vmlal_s16(vmull_s16(_srcstride, _syl), _sxl, _v2);
558
int32x4_t _a0h = vmlal_s16(vmull_s16(_srcstride, _syh), _sxh, _v2);
559
int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
560
int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);
562
uint8x8x4_t _a0a1 = uint8x8x4_t();
563
uint8x8x4_t _b0b1 = uint8x8x4_t();
565
// gather 4 bytes (two 2-channel pixels) per lane for top and bottom rows
_a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0l, 0), _a0a1, 0);
566
_b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0l, 0), _b0b1, 0);
568
_a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0l, 1), _a0a1, 1);
569
_b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0l, 1), _b0b1, 1);
571
_a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0l, 2), _a0a1, 2);
572
_b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0l, 2), _b0b1, 2);
574
_a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0l, 3), _a0a1, 3);
575
_b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0l, 3), _b0b1, 3);
577
_a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0h, 0), _a0a1, 4);
578
_b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0h, 0), _b0b1, 4);
580
_a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0h, 1), _a0a1, 5);
581
_b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0h, 1), _b0b1, 5);
583
_a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0h, 2), _a0a1, 6);
584
_b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0h, 2), _b0b1, 6);
586
_a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0h, 3), _a0a1, 7);
587
_b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0h, 3), _b0b1, 7);
590
// deinterleave: val[0]/val[1] = left pixel's 2 channels, val[2]/val[3] = right pixel's
uint16x8_t _a0_0 = vmovl_u8(_a0a1.val[0]);
591
uint16x8_t _a0_1 = vmovl_u8(_a0a1.val[1]);
592
uint16x8_t _a1_0 = vmovl_u8(_a0a1.val[2]);
593
uint16x8_t _a1_1 = vmovl_u8(_a0a1.val[3]);
594
uint16x8_t _b0_0 = vmovl_u8(_b0b1.val[0]);
595
uint16x8_t _b0_1 = vmovl_u8(_b0b1.val[1]);
596
uint16x8_t _b1_0 = vmovl_u8(_b0b1.val[2]);
597
uint16x8_t _b1_1 = vmovl_u8(_b0b1.val[3]);
599
// horizontal lerp per channel; >>5 keeps intermediates within 16 bits
uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
600
uint16x4_t _a00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_1), vget_low_u16(_alpha0)), vget_low_u16(_a1_1), vget_low_u16(_alpha1)), 5);
601
uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
602
uint16x4_t _a00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_1), vget_high_u16(_alpha0)), vget_high_u16(_a1_1), vget_high_u16(_alpha1)), 5);
603
uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
604
uint16x4_t _b00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_1), vget_low_u16(_alpha0)), vget_low_u16(_b1_1), vget_low_u16(_alpha1)), 5);
605
uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
606
uint16x4_t _b00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_1), vget_high_u16(_alpha0)), vget_high_u16(_b1_1), vget_high_u16(_alpha1)), 5);
608
// vertical lerp and renormalize (total right shift 5 + 15 = 2 x 10 bits)
uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
609
uint16x4_t _dst_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1l, vget_low_u16(_beta0)), _b00_1l, vget_low_u16(_beta1)), 15);
610
uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
611
uint16x4_t _dst_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1h, vget_high_u16(_beta0)), _b00_1h, vget_high_u16(_beta1)), 15);
614
// _dst is presumably a uint8x8x2_t declared in a missing line — TODO confirm
_dst.val[0] = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
615
_dst.val[1] = vqmovn_u16(vcombine_u16(_dst_1l, _dst_1h));
621
// scalar fallback of the same all-inside fast path
for (int xi = 0; xi < 8; xi++)
623
int X = X0 + adelta[x + xi];
624
int Y = Y0 + bdelta[x + xi];
626
short sx = SATURATE_CAST_SHORT((X >> 10));
627
short sy = SATURATE_CAST_SHORT((Y >> 10));
629
short fx = X & ((1 << 10) - 1);
630
short fy = Y & ((1 << 10) - 1);
632
short alpha0 = (1 << 10) - fx;
635
short beta0 = (1 << 10) - fy;
638
const unsigned char* a0 = src0 + srcstride * sy + sx * 2;
639
const unsigned char* a1 = src0 + srcstride * sy + sx * 2 + 2;
640
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 2;
641
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 2 + 2;
643
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
644
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
650
// whole run outside the source: flood with the 2-channel border color
else if (sxy_inout == 2)
656
uint8x8x2_t _border_color;
657
_border_color.val[0] = vdup_n_u8(border_color[0]);
658
_border_color.val[1] = vdup_n_u8(border_color[1]);
660
vst2_u8(dst0, _border_color);
662
for (int xi = 0; xi < 8; xi++)
664
dst0[xi * 2] = border_color[0];
665
dst0[xi * 2 + 1] = border_color[1];
676
else // if (sxy_inout == 0)
678
// mixed run: per-pixel border tests
for (int xi = 0; xi < 8; xi++)
680
int X = X0 + adelta[x + xi];
681
int Y = Y0 + bdelta[x + xi];
683
short sx = SATURATE_CAST_SHORT((X >> 10));
684
short sy = SATURATE_CAST_SHORT((Y >> 10));
686
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
688
dst0[0] = border_color[0];
689
dst0[1] = border_color[1];
691
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
697
short fx = X & ((1 << 10) - 1);
698
short fy = Y & ((1 << 10) - 1);
700
short alpha0 = (1 << 10) - fx;
703
short beta0 = (1 << 10) - fy;
709
const unsigned char* a0 = src0 + srcstride * sy + sx * 2;
710
const unsigned char* a1 = src0 + srcstride * sy + sx * 2 + 2;
711
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 2;
712
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 2 + 2;
714
// redirect any of the 4 taps falling outside the source
// (sx1/sy1 are presumably sx + 1 / sy + 1, declared in missing lines — TODO confirm)
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
716
a0 = type != -233 ? border_color : dst0;
718
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
720
a1 = type != -233 ? border_color : dst0;
722
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
724
b0 = type != -233 ? border_color : dst0;
726
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
728
b1 = type != -233 ? border_color : dst0;
731
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
732
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
741
// remainder loop: one pixel at a time
int X = X0 + adelta[x];
742
int Y = Y0 + bdelta[x];
744
short sx = SATURATE_CAST_SHORT((X >> 10));
745
short sy = SATURATE_CAST_SHORT((Y >> 10));
747
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
749
dst0[0] = border_color[0];
750
dst0[1] = border_color[1];
752
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
758
short fx = X & ((1 << 10) - 1);
759
short fy = Y & ((1 << 10) - 1);
761
short alpha0 = (1 << 10) - fx;
764
short beta0 = (1 << 10) - fy;
770
const unsigned char* a0 = src0 + srcstride * sy + sx * 2;
771
const unsigned char* a1 = src0 + srcstride * sy + sx * 2 + 2;
772
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 2;
773
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 2 + 2;
775
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
777
a0 = type != -233 ? border_color : dst0;
779
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
781
a1 = type != -233 ? border_color : dst0;
783
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
785
b0 = type != -233 ? border_color : dst0;
787
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
789
b1 = type != -233 ? border_color : dst0;
792
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
793
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
802
#undef SATURATE_CAST_SHORT
803
#undef SATURATE_CAST_INT
806
void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type, unsigned int v)
808
const unsigned char* border_color = (const unsigned char*)&v;
809
const int wgap = stride - w * 3;
811
const unsigned char* src0 = src;
812
unsigned char* dst0 = dst;
814
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
815
#define SATURATE_CAST_INT(X) (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)
817
std::vector<int> adelta(w);
818
std::vector<int> bdelta(w);
819
for (int x = 0; x < w; x++)
821
adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
822
bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
828
int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
829
int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));
832
for (; x + 7 < w; x += 8)
836
int X_0 = X0 + adelta[x];
837
int Y_0 = Y0 + bdelta[x];
838
int X_7 = X0 + adelta[x + 7];
839
int Y_7 = Y0 + bdelta[x + 7];
841
short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
842
short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
843
short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
844
short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));
846
if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
851
else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
862
int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
863
int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
864
int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
865
int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));
867
int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
868
int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
869
int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
870
int16x4_t _syh = vqshrn_n_s32(_Yh, 10);
872
uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
873
uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
874
uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));
876
uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
877
uint16x8_t _alpha1 = _fx;
878
uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
879
uint16x8_t _beta1 = _fy;
881
int16x4_t _srcstride = vdup_n_s16(srcstride);
882
int16x4_t _v3 = vdup_n_s16(3);
884
int32x4_t _a0l = vmlal_s16(vmull_s16(_srcstride, _syl), _sxl, _v3);
885
int32x4_t _a0h = vmlal_s16(vmull_s16(_srcstride, _syh), _sxh, _v3);
886
int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
887
int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);
888
int32x4_t _a1l = vaddw_s16(_a0l, _v3);
889
int32x4_t _a1h = vaddw_s16(_a0h, _v3);
890
int32x4_t _b1l = vaddw_s16(_b0l, _v3);
891
int32x4_t _b1h = vaddw_s16(_b0h, _v3);
893
uint8x8x3_t _a0 = uint8x8x3_t();
894
uint8x8x3_t _a1 = uint8x8x3_t();
895
uint8x8x3_t _b0 = uint8x8x3_t();
896
uint8x8x3_t _b1 = uint8x8x3_t();
898
_a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0l, 0), _a0, 0);
899
_a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1l, 0), _a1, 0);
900
_b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0l, 0), _b0, 0);
901
_b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1l, 0), _b1, 0);
903
_a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0l, 1), _a0, 1);
904
_a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1l, 1), _a1, 1);
905
_b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0l, 1), _b0, 1);
906
_b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1l, 1), _b1, 1);
908
_a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0l, 2), _a0, 2);
909
_a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1l, 2), _a1, 2);
910
_b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0l, 2), _b0, 2);
911
_b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1l, 2), _b1, 2);
913
_a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0l, 3), _a0, 3);
914
_a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1l, 3), _a1, 3);
915
_b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0l, 3), _b0, 3);
916
_b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1l, 3), _b1, 3);
918
_a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0h, 0), _a0, 4);
919
_a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1h, 0), _a1, 4);
920
_b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0h, 0), _b0, 4);
921
_b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1h, 0), _b1, 4);
923
_a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0h, 1), _a0, 5);
924
_a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1h, 1), _a1, 5);
925
_b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0h, 1), _b0, 5);
926
_b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1h, 1), _b1, 5);
928
_a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0h, 2), _a0, 6);
929
_a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1h, 2), _a1, 6);
930
_b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0h, 2), _b0, 6);
931
_b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1h, 2), _b1, 6);
933
_a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0h, 3), _a0, 7);
934
_a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1h, 3), _a1, 7);
935
_b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0h, 3), _b0, 7);
936
_b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1h, 3), _b1, 7);
939
uint16x8_t _a0_0 = vmovl_u8(_a0.val[0]);
940
uint16x8_t _a0_1 = vmovl_u8(_a0.val[1]);
941
uint16x8_t _a0_2 = vmovl_u8(_a0.val[2]);
942
uint16x8_t _a1_0 = vmovl_u8(_a1.val[0]);
943
uint16x8_t _a1_1 = vmovl_u8(_a1.val[1]);
944
uint16x8_t _a1_2 = vmovl_u8(_a1.val[2]);
945
uint16x8_t _b0_0 = vmovl_u8(_b0.val[0]);
946
uint16x8_t _b0_1 = vmovl_u8(_b0.val[1]);
947
uint16x8_t _b0_2 = vmovl_u8(_b0.val[2]);
948
uint16x8_t _b1_0 = vmovl_u8(_b1.val[0]);
949
uint16x8_t _b1_1 = vmovl_u8(_b1.val[1]);
950
uint16x8_t _b1_2 = vmovl_u8(_b1.val[2]);
952
uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
953
uint16x4_t _a00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_1), vget_low_u16(_alpha0)), vget_low_u16(_a1_1), vget_low_u16(_alpha1)), 5);
954
uint16x4_t _a00_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_2), vget_low_u16(_alpha0)), vget_low_u16(_a1_2), vget_low_u16(_alpha1)), 5);
955
uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
956
uint16x4_t _a00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_1), vget_high_u16(_alpha0)), vget_high_u16(_a1_1), vget_high_u16(_alpha1)), 5);
957
uint16x4_t _a00_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_2), vget_high_u16(_alpha0)), vget_high_u16(_a1_2), vget_high_u16(_alpha1)), 5);
958
uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
959
uint16x4_t _b00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_1), vget_low_u16(_alpha0)), vget_low_u16(_b1_1), vget_low_u16(_alpha1)), 5);
960
uint16x4_t _b00_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_2), vget_low_u16(_alpha0)), vget_low_u16(_b1_2), vget_low_u16(_alpha1)), 5);
961
uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
962
uint16x4_t _b00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_1), vget_high_u16(_alpha0)), vget_high_u16(_b1_1), vget_high_u16(_alpha1)), 5);
963
uint16x4_t _b00_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_2), vget_high_u16(_alpha0)), vget_high_u16(_b1_2), vget_high_u16(_alpha1)), 5);
965
uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
966
uint16x4_t _dst_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1l, vget_low_u16(_beta0)), _b00_1l, vget_low_u16(_beta1)), 15);
967
uint16x4_t _dst_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_2l, vget_low_u16(_beta0)), _b00_2l, vget_low_u16(_beta1)), 15);
968
uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
969
uint16x4_t _dst_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1h, vget_high_u16(_beta0)), _b00_1h, vget_high_u16(_beta1)), 15);
970
uint16x4_t _dst_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_2h, vget_high_u16(_beta0)), _b00_2h, vget_high_u16(_beta1)), 15);
973
_dst.val[0] = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
974
_dst.val[1] = vqmovn_u16(vcombine_u16(_dst_1l, _dst_1h));
975
_dst.val[2] = vqmovn_u16(vcombine_u16(_dst_2l, _dst_2h));
981
for (int xi = 0; xi < 8; xi++)
983
int X = X0 + adelta[x + xi];
984
int Y = Y0 + bdelta[x + xi];
986
short sx = SATURATE_CAST_SHORT((X >> 10));
987
short sy = SATURATE_CAST_SHORT((Y >> 10));
989
short fx = X & ((1 << 10) - 1);
990
short fy = Y & ((1 << 10) - 1);
992
short alpha0 = (1 << 10) - fx;
995
short beta0 = (1 << 10) - fy;
998
const unsigned char* a0 = src0 + srcstride * sy + sx * 3;
999
const unsigned char* a1 = src0 + srcstride * sy + sx * 3 + 3;
1000
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 3;
1001
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 3 + 3;
1003
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
1004
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
1005
dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
1011
else if (sxy_inout == 2)
1017
uint8x8x3_t _border_color;
1018
_border_color.val[0] = vdup_n_u8(border_color[0]);
1019
_border_color.val[1] = vdup_n_u8(border_color[1]);
1020
_border_color.val[2] = vdup_n_u8(border_color[2]);
1022
vst3_u8(dst0, _border_color);
1024
for (int xi = 0; xi < 8; xi++)
1026
dst0[xi * 3] = border_color[0];
1027
dst0[xi * 3 + 1] = border_color[1];
1028
dst0[xi * 3 + 2] = border_color[2];
1039
else // if (sxy_inout == 0)
1041
for (int xi = 0; xi < 8; xi++)
1043
int X = X0 + adelta[x + xi];
1044
int Y = Y0 + bdelta[x + xi];
1046
short sx = SATURATE_CAST_SHORT((X >> 10));
1047
short sy = SATURATE_CAST_SHORT((Y >> 10));
1049
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
1051
dst0[0] = border_color[0];
1052
dst0[1] = border_color[1];
1053
dst0[2] = border_color[2];
1055
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
1061
short fx = X & ((1 << 10) - 1);
1062
short fy = Y & ((1 << 10) - 1);
1064
short alpha0 = (1 << 10) - fx;
1067
short beta0 = (1 << 10) - fy;
1073
const unsigned char* a0 = src0 + srcstride * sy + sx * 3;
1074
const unsigned char* a1 = src0 + srcstride * sy + sx * 3 + 3;
1075
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 3;
1076
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 3 + 3;
1078
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
1080
a0 = type != -233 ? border_color : dst0;
1082
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
1084
a1 = type != -233 ? border_color : dst0;
1086
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
1088
b0 = type != -233 ? border_color : dst0;
1090
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
1092
b1 = type != -233 ? border_color : dst0;
1095
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
1096
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
1097
dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
1106
int X = X0 + adelta[x];
1107
int Y = Y0 + bdelta[x];
1109
short sx = SATURATE_CAST_SHORT((X >> 10));
1110
short sy = SATURATE_CAST_SHORT((Y >> 10));
1112
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
1114
dst0[0] = border_color[0];
1115
dst0[1] = border_color[1];
1116
dst0[2] = border_color[2];
1118
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
1124
short fx = X & ((1 << 10) - 1);
1125
short fy = Y & ((1 << 10) - 1);
1127
short alpha0 = (1 << 10) - fx;
1130
short beta0 = (1 << 10) - fy;
1136
const unsigned char* a0 = src0 + srcstride * sy + sx * 3;
1137
const unsigned char* a1 = src0 + srcstride * sy + sx * 3 + 3;
1138
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 3;
1139
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 3 + 3;
1141
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
1143
a0 = type != -233 ? border_color : dst0;
1145
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
1147
a1 = type != -233 ? border_color : dst0;
1149
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
1151
b0 = type != -233 ? border_color : dst0;
1153
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
1155
b1 = type != -233 ? border_color : dst0;
1158
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
1159
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
1160
dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
1169
#undef SATURATE_CAST_SHORT
1170
#undef SATURATE_CAST_INT
1173
// NOTE(review): this region came through a lossy extraction. The bare
// integer lines interleaved below appear to be original-file line numbers,
// and several structural lines were dropped: braces, the per-row `for (y...)`
// loop header, and declarations such as alpha1/beta1/sx1/sy1/_dst/sxy_inout.
// Comments below describe only what the surviving lines demonstrate.
//
// Applies the 2x3 affine transform `tm` to a 4-channel interleaved image
// (srcw x srch, row stride `srcstride` bytes), producing a w x h destination
// with row stride `stride`, using bilinear interpolation in 10-bit fixed
// point. `v` packs the per-channel border color bytes (see the cast below);
// the exact semantics of `type` (the -233 sentinel) are not visible here —
// TODO confirm against the caller.
void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type, unsigned int v)
1175
// reinterpret the packed border value as 4 per-channel bytes
const unsigned char* border_color = (const unsigned char*)&v;
1176
// bytes to skip at the end of each destination row (4 bytes per pixel)
const int wgap = stride - w * 4;
1178
const unsigned char* src0 = src;
1179
unsigned char* dst0 = dst;
1181
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
1182
#define SATURATE_CAST_INT(X) (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)
1184
// precompute the per-column contribution of the transform in 10-bit fixed point
std::vector<int> adelta(w);
1185
std::vector<int> bdelta(w);
1186
for (int x = 0; x < w; x++)
1188
adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
1189
bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
1195
// per-row base source coordinates (the enclosing `for (y...)` header and its
// braces were lost in extraction; `y` is the destination row index)
int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
1196
int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));
1199
// vectorized main loop: 8 destination pixels per iteration
for (; x + 7 < w; x += 8)
1203
// sample positions of the first and last pixel of this 8-pixel run;
// since the mapping is linear along the row, the endpoints bound the run
int X_0 = X0 + adelta[x];
1204
int Y_0 = Y0 + bdelta[x];
1205
int X_7 = X0 + adelta[x + 7];
1206
int Y_7 = Y0 + bdelta[x + 7];
1208
short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
1209
short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
1210
short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
1211
short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));
1213
// both endpoints strictly interior -> the whole run can be sampled with no border checks
if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
1218
// both endpoints off the same side -> the whole run is border fill
else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
1229
// NEON path: compute the 8 source coordinates and the 10-bit bilinear weights
int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
1230
int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
1231
int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
1232
int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));
1234
// integer source coordinates (drop the 10 fractional bits)
int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
1235
int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
1236
int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
1237
int16x4_t _syh = vqshrn_n_s32(_Yh, 10);
1239
// fractional parts fx, fy in [0, 1024)
uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
1240
uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
1241
uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));
1243
// bilinear weights: alpha for horizontal, beta for vertical (sum to 1024 each)
uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
1244
uint16x8_t _alpha1 = _fx;
1245
uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
1246
uint16x8_t _beta1 = _fy;
1248
int16x4_t _srcstride = vdup_n_s16(srcstride);
1249
int16x4_t _v4 = vdup_n_s16(4);
1251
// byte offset of the top-left sample of each pixel: srcstride * sy + sx * 4
int32x4_t _a0l = vmlal_s16(vmull_s16(_srcstride, _syl), _sxl, _v4);
1252
int32x4_t _a0h = vmlal_s16(vmull_s16(_srcstride, _syh), _sxh, _v4);
1253
// the "b" samples sit one source row below the "a" samples
int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
1254
int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);
1256
uint8x8x4_t _a0 = uint8x8x4_t();
1257
uint8x8x4_t _a1 = uint8x8x4_t();
1258
uint8x8x4_t _b0 = uint8x8x4_t();
1259
uint8x8x4_t _b1 = uint8x8x4_t();
1261
// gather 8 bytes per lane: one 4-channel pixel (a0) plus its right neighbor (a1)
uint8x8_t _a0a1_0 = vld1_u8(src0 + vgetq_lane_s32(_a0l, 0));
1262
uint8x8_t _a0a1_1 = vld1_u8(src0 + vgetq_lane_s32(_a0l, 1));
1263
uint8x8_t _a0a1_2 = vld1_u8(src0 + vgetq_lane_s32(_a0l, 2));
1264
uint8x8_t _a0a1_3 = vld1_u8(src0 + vgetq_lane_s32(_a0l, 3));
1265
uint8x8_t _a0a1_4 = vld1_u8(src0 + vgetq_lane_s32(_a0h, 0));
1266
uint8x8_t _a0a1_5 = vld1_u8(src0 + vgetq_lane_s32(_a0h, 1));
1267
uint8x8_t _a0a1_6 = vld1_u8(src0 + vgetq_lane_s32(_a0h, 2));
1268
uint8x8_t _a0a1_7 = vld1_u8(src0 + vgetq_lane_s32(_a0h, 3));
1271
// 8x8 byte transpose (vtrn at u8/u16/u32 widths) to deinterleave the
// gathered rows into per-channel vectors _a0 / _a1
uint8x8x2_t _a0a101t_r = vtrn_u8(_a0a1_0, _a0a1_1);
1272
uint8x8x2_t _a0a123t_r = vtrn_u8(_a0a1_2, _a0a1_3);
1273
uint8x8x2_t _a0a145t_r = vtrn_u8(_a0a1_4, _a0a1_5);
1274
uint8x8x2_t _a0a167t_r = vtrn_u8(_a0a1_6, _a0a1_7);
1276
uint16x4x2_t _a0a102tt_r = vtrn_u16(vreinterpret_u16_u8(_a0a101t_r.val[0]), vreinterpret_u16_u8(_a0a123t_r.val[0]));
1277
uint16x4x2_t _a0a113tt_r = vtrn_u16(vreinterpret_u16_u8(_a0a101t_r.val[1]), vreinterpret_u16_u8(_a0a123t_r.val[1]));
1278
uint16x4x2_t _a0a146tt_r = vtrn_u16(vreinterpret_u16_u8(_a0a145t_r.val[0]), vreinterpret_u16_u8(_a0a167t_r.val[0]));
1279
uint16x4x2_t _a0a157tt_r = vtrn_u16(vreinterpret_u16_u8(_a0a145t_r.val[1]), vreinterpret_u16_u8(_a0a167t_r.val[1]));
1281
uint32x2x2_t _a0a104ttt_r = vtrn_u32(vreinterpret_u32_u16(_a0a102tt_r.val[0]), vreinterpret_u32_u16(_a0a146tt_r.val[0]));
1282
uint32x2x2_t _a0a115ttt_r = vtrn_u32(vreinterpret_u32_u16(_a0a113tt_r.val[0]), vreinterpret_u32_u16(_a0a157tt_r.val[0]));
1283
uint32x2x2_t _a0a126ttt_r = vtrn_u32(vreinterpret_u32_u16(_a0a102tt_r.val[1]), vreinterpret_u32_u16(_a0a146tt_r.val[1]));
1284
uint32x2x2_t _a0a137ttt_r = vtrn_u32(vreinterpret_u32_u16(_a0a113tt_r.val[1]), vreinterpret_u32_u16(_a0a157tt_r.val[1]));
1286
_a0.val[0] = vreinterpret_u8_u32(_a0a104ttt_r.val[0]);
1287
_a0.val[1] = vreinterpret_u8_u32(_a0a115ttt_r.val[0]);
1288
_a0.val[2] = vreinterpret_u8_u32(_a0a126ttt_r.val[0]);
1289
_a0.val[3] = vreinterpret_u8_u32(_a0a137ttt_r.val[0]);
1290
_a1.val[0] = vreinterpret_u8_u32(_a0a104ttt_r.val[1]);
1291
_a1.val[1] = vreinterpret_u8_u32(_a0a115ttt_r.val[1]);
1292
_a1.val[2] = vreinterpret_u8_u32(_a0a126ttt_r.val[1]);
1293
_a1.val[3] = vreinterpret_u8_u32(_a0a137ttt_r.val[1]);
1295
// same gather + transpose for the row below (b0 / b1)
uint8x8_t _b0b1_0 = vld1_u8(src0 + vgetq_lane_s32(_b0l, 0));
1296
uint8x8_t _b0b1_1 = vld1_u8(src0 + vgetq_lane_s32(_b0l, 1));
1297
uint8x8_t _b0b1_2 = vld1_u8(src0 + vgetq_lane_s32(_b0l, 2));
1298
uint8x8_t _b0b1_3 = vld1_u8(src0 + vgetq_lane_s32(_b0l, 3));
1299
uint8x8_t _b0b1_4 = vld1_u8(src0 + vgetq_lane_s32(_b0h, 0));
1300
uint8x8_t _b0b1_5 = vld1_u8(src0 + vgetq_lane_s32(_b0h, 1));
1301
uint8x8_t _b0b1_6 = vld1_u8(src0 + vgetq_lane_s32(_b0h, 2));
1302
uint8x8_t _b0b1_7 = vld1_u8(src0 + vgetq_lane_s32(_b0h, 3));
1305
uint8x8x2_t _b0b101t_r = vtrn_u8(_b0b1_0, _b0b1_1);
1306
uint8x8x2_t _b0b123t_r = vtrn_u8(_b0b1_2, _b0b1_3);
1307
uint8x8x2_t _b0b145t_r = vtrn_u8(_b0b1_4, _b0b1_5);
1308
uint8x8x2_t _b0b167t_r = vtrn_u8(_b0b1_6, _b0b1_7);
1310
uint16x4x2_t _b0b102tt_r = vtrn_u16(vreinterpret_u16_u8(_b0b101t_r.val[0]), vreinterpret_u16_u8(_b0b123t_r.val[0]));
1311
uint16x4x2_t _b0b113tt_r = vtrn_u16(vreinterpret_u16_u8(_b0b101t_r.val[1]), vreinterpret_u16_u8(_b0b123t_r.val[1]));
1312
uint16x4x2_t _b0b146tt_r = vtrn_u16(vreinterpret_u16_u8(_b0b145t_r.val[0]), vreinterpret_u16_u8(_b0b167t_r.val[0]));
1313
uint16x4x2_t _b0b157tt_r = vtrn_u16(vreinterpret_u16_u8(_b0b145t_r.val[1]), vreinterpret_u16_u8(_b0b167t_r.val[1]));
1315
uint32x2x2_t _b0b104ttt_r = vtrn_u32(vreinterpret_u32_u16(_b0b102tt_r.val[0]), vreinterpret_u32_u16(_b0b146tt_r.val[0]));
1316
uint32x2x2_t _b0b115ttt_r = vtrn_u32(vreinterpret_u32_u16(_b0b113tt_r.val[0]), vreinterpret_u32_u16(_b0b157tt_r.val[0]));
1317
uint32x2x2_t _b0b126ttt_r = vtrn_u32(vreinterpret_u32_u16(_b0b102tt_r.val[1]), vreinterpret_u32_u16(_b0b146tt_r.val[1]));
1318
uint32x2x2_t _b0b137ttt_r = vtrn_u32(vreinterpret_u32_u16(_b0b113tt_r.val[1]), vreinterpret_u32_u16(_b0b157tt_r.val[1]));
1320
_b0.val[0] = vreinterpret_u8_u32(_b0b104ttt_r.val[0]);
1321
_b0.val[1] = vreinterpret_u8_u32(_b0b115ttt_r.val[0]);
1322
_b0.val[2] = vreinterpret_u8_u32(_b0b126ttt_r.val[0]);
1323
_b0.val[3] = vreinterpret_u8_u32(_b0b137ttt_r.val[0]);
1324
_b1.val[0] = vreinterpret_u8_u32(_b0b104ttt_r.val[1]);
1325
_b1.val[1] = vreinterpret_u8_u32(_b0b115ttt_r.val[1]);
1326
_b1.val[2] = vreinterpret_u8_u32(_b0b126ttt_r.val[1]);
1327
_b1.val[3] = vreinterpret_u8_u32(_b0b137ttt_r.val[1]);
1330
// widen the four taps to u16 for the weighted sums
uint16x8_t _a0_0 = vmovl_u8(_a0.val[0]);
1331
uint16x8_t _a0_1 = vmovl_u8(_a0.val[1]);
1332
uint16x8_t _a0_2 = vmovl_u8(_a0.val[2]);
1333
uint16x8_t _a0_3 = vmovl_u8(_a0.val[3]);
1334
uint16x8_t _a1_0 = vmovl_u8(_a1.val[0]);
1335
uint16x8_t _a1_1 = vmovl_u8(_a1.val[1]);
1336
uint16x8_t _a1_2 = vmovl_u8(_a1.val[2]);
1337
uint16x8_t _a1_3 = vmovl_u8(_a1.val[3]);
1338
uint16x8_t _b0_0 = vmovl_u8(_b0.val[0]);
1339
uint16x8_t _b0_1 = vmovl_u8(_b0.val[1]);
1340
uint16x8_t _b0_2 = vmovl_u8(_b0.val[2]);
1341
uint16x8_t _b0_3 = vmovl_u8(_b0.val[3]);
1342
uint16x8_t _b1_0 = vmovl_u8(_b1.val[0]);
1343
uint16x8_t _b1_1 = vmovl_u8(_b1.val[1]);
1344
uint16x8_t _b1_2 = vmovl_u8(_b1.val[2]);
1345
uint16x8_t _b1_3 = vmovl_u8(_b1.val[3]);
1347
// horizontal lerp per channel: (a0*alpha0 + a1*alpha1) >> 5 keeps 5 fractional bits
uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
1348
uint16x4_t _a00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_1), vget_low_u16(_alpha0)), vget_low_u16(_a1_1), vget_low_u16(_alpha1)), 5);
1349
uint16x4_t _a00_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_2), vget_low_u16(_alpha0)), vget_low_u16(_a1_2), vget_low_u16(_alpha1)), 5);
1350
uint16x4_t _a00_3l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_3), vget_low_u16(_alpha0)), vget_low_u16(_a1_3), vget_low_u16(_alpha1)), 5);
1351
uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
1352
uint16x4_t _a00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_1), vget_high_u16(_alpha0)), vget_high_u16(_a1_1), vget_high_u16(_alpha1)), 5);
1353
uint16x4_t _a00_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_2), vget_high_u16(_alpha0)), vget_high_u16(_a1_2), vget_high_u16(_alpha1)), 5);
1354
uint16x4_t _a00_3h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_3), vget_high_u16(_alpha0)), vget_high_u16(_a1_3), vget_high_u16(_alpha1)), 5);
1355
uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
1356
uint16x4_t _b00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_1), vget_low_u16(_alpha0)), vget_low_u16(_b1_1), vget_low_u16(_alpha1)), 5);
1357
uint16x4_t _b00_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_2), vget_low_u16(_alpha0)), vget_low_u16(_b1_2), vget_low_u16(_alpha1)), 5);
1358
uint16x4_t _b00_3l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_3), vget_low_u16(_alpha0)), vget_low_u16(_b1_3), vget_low_u16(_alpha1)), 5);
1359
uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
1360
uint16x4_t _b00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_1), vget_high_u16(_alpha0)), vget_high_u16(_b1_1), vget_high_u16(_alpha1)), 5);
1361
uint16x4_t _b00_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_2), vget_high_u16(_alpha0)), vget_high_u16(_b1_2), vget_high_u16(_alpha1)), 5);
1362
uint16x4_t _b00_3h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_3), vget_high_u16(_alpha0)), vget_high_u16(_b1_3), vget_high_u16(_alpha1)), 5);
1364
// vertical lerp with the beta weights; >> 15 removes the remaining 5+10 fractional bits
uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
1365
uint16x4_t _dst_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1l, vget_low_u16(_beta0)), _b00_1l, vget_low_u16(_beta1)), 15);
1366
uint16x4_t _dst_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_2l, vget_low_u16(_beta0)), _b00_2l, vget_low_u16(_beta1)), 15);
1367
uint16x4_t _dst_3l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_3l, vget_low_u16(_beta0)), _b00_3l, vget_low_u16(_beta1)), 15);
1368
uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
1369
uint16x4_t _dst_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1h, vget_high_u16(_beta0)), _b00_1h, vget_high_u16(_beta1)), 15);
1370
uint16x4_t _dst_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_2h, vget_high_u16(_beta0)), _b00_2h, vget_high_u16(_beta1)), 15);
1371
uint16x4_t _dst_3h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_3h, vget_high_u16(_beta0)), _b00_3h, vget_high_u16(_beta1)), 15);
1374
// narrow to u8 and store 8 interleaved RGBA pixels
// (the declaration of _dst, presumably uint8x8x4_t, was lost in extraction)
_dst.val[0] = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
1375
_dst.val[1] = vqmovn_u16(vcombine_u16(_dst_1l, _dst_1h));
1376
_dst.val[2] = vqmovn_u16(vcombine_u16(_dst_2l, _dst_2h));
1377
_dst.val[3] = vqmovn_u16(vcombine_u16(_dst_3l, _dst_3h));
1379
vst4_u8(dst0, _dst);
1383
// scalar fallback for this interior 8-pixel run (branch structure lost in
// extraction; the alpha1/beta1 declarations are not visible here)
for (int xi = 0; xi < 8; xi++)
1385
int X = X0 + adelta[x + xi];
1386
int Y = Y0 + bdelta[x + xi];
1388
short sx = SATURATE_CAST_SHORT((X >> 10));
1389
short sy = SATURATE_CAST_SHORT((Y >> 10));
1391
short fx = X & ((1 << 10) - 1);
1392
short fy = Y & ((1 << 10) - 1);
1394
short alpha0 = (1 << 10) - fx;
1397
short beta0 = (1 << 10) - fy;
1400
// the four bilinear taps: a = top row, b = row below, 0/1 = left/right pixel
const unsigned char* a0 = src0 + srcstride * sy + sx * 4;
1401
const unsigned char* a1 = src0 + srcstride * sy + sx * 4 + 4;
1402
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 4;
1403
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 4 + 4;
1405
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
1406
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
1407
dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
1408
dst0[3] = (unsigned char)(((((unsigned short)((a0[3] * alpha0 + a1[3] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[3] * alpha0 + b1[3] * alpha1) >> 5) * beta1))) >> 15);
1414
// whole run outside the source: fill with the border color
// (the computation of sxy_inout is not visible in this extraction)
else if (sxy_inout == 2)
1420
uint8x8x4_t _border_color;
1421
_border_color.val[0] = vdup_n_u8(border_color[0]);
1422
_border_color.val[1] = vdup_n_u8(border_color[1]);
1423
_border_color.val[2] = vdup_n_u8(border_color[2]);
1424
_border_color.val[3] = vdup_n_u8(border_color[3]);
1426
vst4_u8(dst0, _border_color);
1428
// scalar equivalent of the border fill above
for (int xi = 0; xi < 8; xi++)
1430
dst0[xi * 4] = border_color[0];
1431
dst0[xi * 4 + 1] = border_color[1];
1432
dst0[xi * 4 + 2] = border_color[2];
1433
dst0[xi * 4 + 3] = border_color[3];
1444
// mixed run: some of the 8 pixels are inside, some outside -> per-pixel checks
else // if (sxy_inout == 0)
1446
for (int xi = 0; xi < 8; xi++)
1448
int X = X0 + adelta[x + xi];
1449
int Y = Y0 + bdelta[x + xi];
1451
short sx = SATURATE_CAST_SHORT((X >> 10));
1452
short sy = SATURATE_CAST_SHORT((Y >> 10));
1454
// fully outside (constant-border mode): write the border color
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
1456
dst0[0] = border_color[0];
1457
dst0[1] = border_color[1];
1458
dst0[2] = border_color[2];
1459
dst0[3] = border_color[3];
1461
// type == -233: out-of-range pixels are left as-is (body lost in extraction)
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
1467
short fx = X & ((1 << 10) - 1);
1468
short fy = Y & ((1 << 10) - 1);
1470
short alpha0 = (1 << 10) - fx;
1473
short beta0 = (1 << 10) - fy;
1479
const unsigned char* a0 = src0 + srcstride * sy + sx * 4;
1480
const unsigned char* a1 = src0 + srcstride * sy + sx * 4 + 4;
1481
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 4;
1482
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 4 + 4;
1484
// substitute the border color (or the current dst pixel for type == -233)
// for any tap that falls outside the source
// (sx1/sy1, presumably sx+1/sy+1, are declared on lines lost in extraction)
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
1486
a0 = type != -233 ? border_color : dst0;
1488
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
1490
a1 = type != -233 ? border_color : dst0;
1492
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
1494
b0 = type != -233 ? border_color : dst0;
1496
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
1498
b1 = type != -233 ? border_color : dst0;
1501
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
1502
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
1503
dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
1504
dst0[3] = (unsigned char)(((((unsigned short)((a0[3] * alpha0 + a1[3] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[3] * alpha0 + b1[3] * alpha1) >> 5) * beta1))) >> 15);
1513
// tail loop: remaining (< 8) pixels of the row, same per-pixel logic as above
int X = X0 + adelta[x];
1514
int Y = Y0 + bdelta[x];
1516
short sx = SATURATE_CAST_SHORT((X >> 10));
1517
short sy = SATURATE_CAST_SHORT((Y >> 10));
1519
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
1521
dst0[0] = border_color[0];
1522
dst0[1] = border_color[1];
1523
dst0[2] = border_color[2];
1524
dst0[3] = border_color[3];
1526
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
1532
short fx = X & ((1 << 10) - 1);
1533
short fy = Y & ((1 << 10) - 1);
1535
short alpha0 = (1 << 10) - fx;
1538
short beta0 = (1 << 10) - fy;
1544
const unsigned char* a0 = src0 + srcstride * sy + sx * 4;
1545
const unsigned char* a1 = src0 + srcstride * sy + sx * 4 + 4;
1546
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 4;
1547
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 4 + 4;
1549
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
1551
a0 = type != -233 ? border_color : dst0;
1553
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
1555
a1 = type != -233 ? border_color : dst0;
1557
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
1559
b0 = type != -233 ? border_color : dst0;
1561
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
1563
b1 = type != -233 ? border_color : dst0;
1566
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
1567
dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
1568
dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
1569
dst0[3] = (unsigned char)(((((unsigned short)((a0[3] * alpha0 + a1[3] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[3] * alpha0 + b1[3] * alpha1) >> 5) * beta1))) >> 15);
1578
#undef SATURATE_CAST_SHORT
1579
#undef SATURATE_CAST_INT
1582
// NOTE(review): lossy extraction here too — the bare integer lines appear to
// be original-file line numbers, and the declarations of v_y/v_uv, the
// initializer contents of tm_uv, and the closing brace were dropped.
//
// Warps a YUV420 semi-planar image (full-resolution Y plane followed by a
// half-resolution interleaved UV plane) by applying the same affine transform
// to each plane: the Y plane at full size via the 1-channel kernel and the UV
// plane at half size via the 2-channel kernel. `v` packs the border color as
// one Y byte and two UV bytes.
void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
1584
// even dimensions are required so the UV plane is exactly half-sized
// assert srcw % 2 == 0
1585
// assert srch % 2 == 0
1586
// assert w % 2 == 0
1587
// assert h % 2 == 0
1589
const unsigned char* border_color = (const unsigned char*)&v;
1593
// split the packed border color into a Y value and a UV pair
// (the declarations of v_y and v_uv were lost in extraction)
unsigned char* border_color_y = (unsigned char*)&v_y;
1594
unsigned char* border_color_uv = (unsigned char*)&v_uv;
1595
border_color_y[0] = border_color[0];
1596
border_color_uv[0] = border_color[1];
1597
border_color_uv[1] = border_color[2];
1599
// warp the full-resolution Y plane
const unsigned char* srcY = src;
1600
unsigned char* dstY = dst;
1601
warpaffine_bilinear_c1(srcY, srcw, srch, dstY, w, h, tm, type, v_y);
1603
// transform for the half-resolution UV plane
// (initializer contents were lost in extraction; presumably tm rescaled for
// half-size coordinates — TODO confirm against the original source)
const float tm_uv[6] = {
1612
// the interleaved UV plane starts right after the w*h (srcw*srch) Y plane
const unsigned char* srcUV = src + srcw * srch;
1613
unsigned char* dstUV = dst + w * h;
1614
warpaffine_bilinear_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, tm_uv, type, v_uv);
1616
#endif // NCNN_PIXEL_AFFINE