// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "mat.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <limits.h>

#include "platform.h"

namespace ncnn {

#if NCNN_PIXEL_AFFINE
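// build a 2x3 affine matrix that rotates by 'angle' degrees about the point (dx, dy)
// and applies a uniform 'scale'; the six coefficients are stored row-major:
//   [tm[0] tm[1] tm[2]]
//   [tm[3] tm[4] tm[5]]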
void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm)
{
    angle *= (float)(3.14159265358979323846 / 180);
    float alpha = cosf(angle) * scale;
    float beta = sinf(angle) * scale;

    tm[0] = alpha;
    tm[1] = beta;
    tm[2] = (1.f - alpha) * dx - beta * dy;
    tm[3] = -beta;
    tm[4] = alpha;
    tm[5] = beta * dx + (1.f - alpha) * dy;
}
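// least-squares fit of a similarity transform (uniform scale + rotation + translation)
// that maps num_point correspondences points_from[i] -> points_to[i];
// the normal equations collapse to a 4x4 system in (a, b, tx, ty), giving tm = [a -b tx; b a ty]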

void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm)
{
    float ma[4][4] = {{0.f}};
    float mb[4] = {0.f};
    float mm[4];

    for (int i = 0; i < num_point; i++)
    {
        ma[0][0] += points_from[0] * points_from[0] + points_from[1] * points_from[1];
        ma[0][2] += points_from[0];
        ma[0][3] += points_from[1];

        mb[0] += points_from[0] * points_to[0] + points_from[1] * points_to[1];
        mb[1] += points_from[0] * points_to[1] - points_from[1] * points_to[0];
        mb[2] += points_to[0];
        mb[3] += points_to[1];

        points_from += 2;
        points_to += 2;
    }

    ma[1][1] = ma[0][0];
    ma[2][1] = ma[1][2] = -ma[0][3];
    ma[3][1] = ma[1][3] = ma[2][0] = ma[0][2];
    ma[2][2] = ma[3][3] = (float)num_point;
    ma[3][0] = ma[0][3];

    // MM = inv(A) * B
    // 4x4 matrix inverse from https://github.com/willnode/N-Matrix-Programmer
    // assume the user provides a valid point combination
    // I have not taken det == zero into account here   :>  --- nihui
    float mai[4][4];
    float det;
    // clang-format off
    // *INDENT-OFF*
    {
        float A2323 = ma[2][2] * ma[3][3] - ma[2][3] * ma[3][2];
        float A1323 = ma[2][1] * ma[3][3] - ma[2][3] * ma[3][1];
        float A1223 = ma[2][1] * ma[3][2] - ma[2][2] * ma[3][1];
        float A0323 = ma[2][0] * ma[3][3] - ma[2][3] * ma[3][0];
        float A0223 = ma[2][0] * ma[3][2] - ma[2][2] * ma[3][0];
        float A0123 = ma[2][0] * ma[3][1] - ma[2][1] * ma[3][0];
        float A2313 = ma[1][2] * ma[3][3] - ma[1][3] * ma[3][2];
        float A1313 = ma[1][1] * ma[3][3] - ma[1][3] * ma[3][1];
        float A1213 = ma[1][1] * ma[3][2] - ma[1][2] * ma[3][1];
        float A2312 = ma[1][2] * ma[2][3] - ma[1][3] * ma[2][2];
        float A1312 = ma[1][1] * ma[2][3] - ma[1][3] * ma[2][1];
        float A1212 = ma[1][1] * ma[2][2] - ma[1][2] * ma[2][1];
        float A0313 = ma[1][0] * ma[3][3] - ma[1][3] * ma[3][0];
        float A0213 = ma[1][0] * ma[3][2] - ma[1][2] * ma[3][0];
        float A0312 = ma[1][0] * ma[2][3] - ma[1][3] * ma[2][0];
        float A0212 = ma[1][0] * ma[2][2] - ma[1][2] * ma[2][0];
        float A0113 = ma[1][0] * ma[3][1] - ma[1][1] * ma[3][0];
        float A0112 = ma[1][0] * ma[2][1] - ma[1][1] * ma[2][0];

        det = ma[0][0] * (ma[1][1] * A2323 - ma[1][2] * A1323 + ma[1][3] * A1223)
            - ma[0][1] * (ma[1][0] * A2323 - ma[1][2] * A0323 + ma[1][3] * A0223)
            + ma[0][2] * (ma[1][0] * A1323 - ma[1][1] * A0323 + ma[1][3] * A0123)
            - ma[0][3] * (ma[1][0] * A1223 - ma[1][1] * A0223 + ma[1][2] * A0123);

        det = 1.f / det;

        mai[0][0] =   (ma[1][1] * A2323 - ma[1][2] * A1323 + ma[1][3] * A1223);
        mai[0][1] = - (ma[0][1] * A2323 - ma[0][2] * A1323 + ma[0][3] * A1223);
        mai[0][2] =   (ma[0][1] * A2313 - ma[0][2] * A1313 + ma[0][3] * A1213);
        mai[0][3] = - (ma[0][1] * A2312 - ma[0][2] * A1312 + ma[0][3] * A1212);
        mai[1][0] = - (ma[1][0] * A2323 - ma[1][2] * A0323 + ma[1][3] * A0223);
        mai[1][1] =   (ma[0][0] * A2323 - ma[0][2] * A0323 + ma[0][3] * A0223);
        mai[1][2] = - (ma[0][0] * A2313 - ma[0][2] * A0313 + ma[0][3] * A0213);
        mai[1][3] =   (ma[0][0] * A2312 - ma[0][2] * A0312 + ma[0][3] * A0212);
        mai[2][0] =   (ma[1][0] * A1323 - ma[1][1] * A0323 + ma[1][3] * A0123);
        mai[2][1] = - (ma[0][0] * A1323 - ma[0][1] * A0323 + ma[0][3] * A0123);
        mai[2][2] =   (ma[0][0] * A1313 - ma[0][1] * A0313 + ma[0][3] * A0113);
        mai[2][3] = - (ma[0][0] * A1312 - ma[0][1] * A0312 + ma[0][3] * A0112);
        mai[3][0] = - (ma[1][0] * A1223 - ma[1][1] * A0223 + ma[1][2] * A0123);
        mai[3][1] =   (ma[0][0] * A1223 - ma[0][1] * A0223 + ma[0][2] * A0123);
        mai[3][2] = - (ma[0][0] * A1213 - ma[0][1] * A0213 + ma[0][2] * A0113);
        mai[3][3] =   (ma[0][0] * A1212 - ma[0][1] * A0212 + ma[0][2] * A0112);
    }
    // *INDENT-ON*
    // clang-format on

    mm[0] = det * (mai[0][0] * mb[0] + mai[0][1] * mb[1] + mai[0][2] * mb[2] + mai[0][3] * mb[3]);
    mm[1] = det * (mai[1][0] * mb[0] + mai[1][1] * mb[1] + mai[1][2] * mb[2] + mai[1][3] * mb[3]);
    mm[2] = det * (mai[2][0] * mb[0] + mai[2][1] * mb[1] + mai[2][2] * mb[2] + mai[2][3] * mb[3]);
    mm[3] = det * (mai[3][0] * mb[0] + mai[3][1] * mb[1] + mai[3][2] * mb[2] + mai[3][3] * mb[3]);

    tm[0] = tm[4] = mm[0];
    tm[1] = -mm[1];
    tm[3] = mm[1];
    tm[2] = mm[2];
    tm[5] = mm[3];
}
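// invert a 2x3 affine transform by inverting its 2x2 linear part and recomputing the offset;
// a singular input (det == 0) yields the all-zero transform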

void invert_affine_transform(const float* tm, float* tm_inv)
{
    float D = tm[0] * tm[4] - tm[1] * tm[3];
    D = D != 0.f ? 1.f / D : 0.f;

    float A11 = tm[4] * D;
    float A22 = tm[0] * D;
    float A12 = -tm[1] * D;
    float A21 = -tm[3] * D;
    float b1 = -A11 * tm[2] - A12 * tm[5];
    float b2 = -A21 * tm[2] - A22 * tm[5];

    tm_inv[0] = A11;
    tm_inv[1] = A12;
    tm_inv[2] = b1;
    tm_inv[3] = A21;
    tm_inv[4] = A22;
    tm_inv[5] = b2;
}
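// convenience overloads assuming tightly packed rows, i.e. stride == width * channels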

void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
{
    return warpaffine_bilinear_c1(src, srcw, srch, srcw, dst, w, h, w, tm, type, v);
}

void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
{
    return warpaffine_bilinear_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, tm, type, v);
}

void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
{
    return warpaffine_bilinear_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, tm, type, v);
}

void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
{
    return warpaffine_bilinear_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, tm, type, v);
}
173

174
void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type, unsigned int v)
175
{
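    // v packs the border color, one byte per channel; when type == -233 the destination is left
    // untouched for samples that fall outside the source image instead of being filled with v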
176
    const unsigned char* border_color = (const unsigned char*)&v;
177
    const int wgap = stride - w;
178

179
    const unsigned char* src0 = src;
180
    unsigned char* dst0 = dst;
181

182
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
183
#define SATURATE_CAST_INT(X)   (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)
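// SATURATE_CAST_SHORT clamps to the short range; SATURATE_CAST_INT rounds to nearest and clamps,
// and is used to build the 10-bit fixed-point source coordinates below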
184

185
    std::vector<int> adelta(w);
186
    std::vector<int> bdelta(w);
187
    for (int x = 0; x < w; x++)
188
    {
189
        adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
190
        bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
191
    }
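    // per-column increments of the source coordinates in 22.10 fixed point:
    // X = X0 + adelta[x] == (tm[0] * x + tm[1] * y + tm[2]) * 1024, so X >> 10 is the integer
    // source column and the low 10 bits are the horizontal interpolation fraction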
192

193
    int y = 0;
194
    for (; y < h; y++)
195
    {
196
        int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
197
        int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));
198

199
        int x = 0;
200
        for (; x + 7 < w; x += 8)
201
        {
202
            int sxy_inout = 0;
203
            {
204
                int X_0 = X0 + adelta[x];
205
                int Y_0 = Y0 + bdelta[x];
206
                int X_7 = X0 + adelta[x + 7];
207
                int Y_7 = Y0 + bdelta[x + 7];
208

209
                short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
210
                short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
211
                short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
212
                short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));
213

214
                if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
215
                {
216
                    // all inside
217
                    sxy_inout = 1;
218
                }
219
                else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
220
                {
221
                    // all outside
222
                    sxy_inout = 2;
223
                }
224
            }
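            // the mapping is affine, so the 8-pixel run is classified by its two endpoints:
            // sxy_inout == 1 -> every sample (and its right/bottom neighbour) lies inside the source,
            // sxy_inout == 2 -> every sample is outside, sxy_inout == 0 -> mixed, handled per pixel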
225

226
            if (sxy_inout == 1)
227
            {
228
                // all inside
229
#if __ARM_NEON
230
                int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
231
                int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
232
                int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
233
                int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));
234

235
                int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
236
                int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
237
                int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
238
                int16x4_t _syh = vqshrn_n_s32(_Yh, 10);
239

240
                uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
241
                uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
242
                uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));
243

244
                uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
245
                uint16x8_t _alpha1 = _fx;
246
                uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
247
                uint16x8_t _beta1 = _fy;
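                // alpha/beta are 10-bit weights summing to 1024; the horizontal blend is narrowed
                // by 5 bits so the intermediate still fits in 16 bits, and the vertical blend then
                // drops the remaining 15 bits (5 + 15 == 2 * 10), yielding an 8-bit result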
248

249
                int16x4_t _srcstride = vdup_n_s16(srcstride);
250

251
                int32x4_t _a0l = vaddw_s16(vmull_s16(_srcstride, _syl), _sxl);
252
                int32x4_t _a0h = vaddw_s16(vmull_s16(_srcstride, _syh), _sxh);
253
                int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
254
                int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);
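                // _a0l/_a0h are byte offsets of the top-left neighbours and _b0l/_b0h of the
                // bottom-left ones; vld2_lane_u8 below fetches each pixel together with its right
                // neighbour (two consecutive bytes) one lane at a time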
255

256
                uint8x8x2_t _a0a1 = uint8x8x2_t();
257
                uint8x8x2_t _b0b1 = uint8x8x2_t();
258
                {
259
                    _a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 0), _a0a1, 0);
260
                    _b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 0), _b0b1, 0);
261

262
                    _a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 1), _a0a1, 1);
263
                    _b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 1), _b0b1, 1);
264

265
                    _a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 2), _a0a1, 2);
266
                    _b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 2), _b0b1, 2);
267

268
                    _a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 3), _a0a1, 3);
269
                    _b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 3), _b0b1, 3);
270

271
                    _a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 0), _a0a1, 4);
272
                    _b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 0), _b0b1, 4);
273

274
                    _a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 1), _a0a1, 5);
275
                    _b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 1), _b0b1, 5);
276

277
                    _a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 2), _a0a1, 6);
278
                    _b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 2), _b0b1, 6);
279

280
                    _a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 3), _a0a1, 7);
281
                    _b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 3), _b0b1, 7);
282
                }
283

284
                uint16x8_t _a0_0 = vmovl_u8(_a0a1.val[0]);
285
                uint16x8_t _a1_0 = vmovl_u8(_a0a1.val[1]);
286
                uint16x8_t _b0_0 = vmovl_u8(_b0b1.val[0]);
287
                uint16x8_t _b1_0 = vmovl_u8(_b0b1.val[1]);
288

289
                uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
290
                uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
291
                uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
292
                uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
293

294
                uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
295
                uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
296

297
                uint8x8_t _dst = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
298

299
                vst1_u8(dst0, _dst);
300

301
                dst0 += 8;
302
#else
303
                for (int xi = 0; xi < 8; xi++)
304
                {
305
                    int X = X0 + adelta[x + xi];
306
                    int Y = Y0 + bdelta[x + xi];
307

308
                    short sx = SATURATE_CAST_SHORT((X >> 10));
309
                    short sy = SATURATE_CAST_SHORT((Y >> 10));
310

311
                    short fx = X & ((1 << 10) - 1);
312
                    short fy = Y & ((1 << 10) - 1);
313

314
                    short alpha0 = (1 << 10) - fx;
315
                    short alpha1 = fx;
316

317
                    short beta0 = (1 << 10) - fy;
318
                    short beta1 = fy;
319

320
                    const unsigned char* a0 = src0 + srcstride * sy + sx;
321
                    const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
322
                    const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
323
                    const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
324

325
                    dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
326

327
                    dst0 += 1;
328
                }
329
#endif // __ARM_NEON
330
            }
331
            else if (sxy_inout == 2)
332
            {
333
                // all outside
334
                if (type != -233)
335
                {
336
#if __ARM_NEON
337
                    uint8x8_t _border_color = vdup_n_u8(border_color[0]);
338
                    vst1_u8(dst0, _border_color);
339
#else
340
                    for (int xi = 0; xi < 8; xi++)
341
                    {
342
                        dst0[xi] = border_color[0];
343
                    }
344
#endif // __ARM_NEON
345
                }
346
                else
347
                {
348
                    // skip
349
                }
350

351
                dst0 += 8;
352
            }
353
            else // if (sxy_inout == 0)
354
            {
355
                for (int xi = 0; xi < 8; xi++)
356
                {
357
                    int X = X0 + adelta[x + xi];
358
                    int Y = Y0 + bdelta[x + xi];
359

360
                    short sx = SATURATE_CAST_SHORT((X >> 10));
361
                    short sy = SATURATE_CAST_SHORT((Y >> 10));
362

363
                    if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
364
                    {
365
                        dst0[0] = border_color[0];
366
                    }
367
                    else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
368
                    {
369
                        // skip
370
                    }
371
                    else
372
                    {
373
                        short fx = X & ((1 << 10) - 1);
374
                        short fy = Y & ((1 << 10) - 1);
375

376
                        short alpha0 = (1 << 10) - fx;
377
                        short alpha1 = fx;
378

379
                        short beta0 = (1 << 10) - fy;
380
                        short beta1 = fy;
381

382
                        short sx1 = sx + 1;
383
                        short sy1 = sy + 1;
384

385
                        const unsigned char* a0 = src0 + srcstride * sy + sx;
386
                        const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
387
                        const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
388
                        const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
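                        // out-of-range neighbours are redirected: to the border color in
                        // constant-fill mode, or to the current dst pixel when type == -233,
                        // so the blend stays well defined at the image edge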
389

390
                        if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
391
                        {
392
                            a0 = type != -233 ? border_color : dst0;
393
                        }
394
                        if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
395
                        {
396
                            a1 = type != -233 ? border_color : dst0;
397
                        }
398
                        if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
399
                        {
400
                            b0 = type != -233 ? border_color : dst0;
401
                        }
402
                        if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
403
                        {
404
                            b1 = type != -233 ? border_color : dst0;
405
                        }
406

407
                        dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
408
                    }
409

410
                    dst0 += 1;
411
                }
412
            }
413
        }
414
        for (; x < w; x++)
415
        {
416
            int X = X0 + adelta[x];
417
            int Y = Y0 + bdelta[x];
418

419
            short sx = SATURATE_CAST_SHORT((X >> 10));
420
            short sy = SATURATE_CAST_SHORT((Y >> 10));
421

422
            if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
423
            {
424
                dst0[0] = border_color[0];
425
            }
426
            else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
427
            {
428
                // skip
429
            }
430
            else
431
            {
432
                short fx = X & ((1 << 10) - 1);
433
                short fy = Y & ((1 << 10) - 1);
434

435
                short alpha0 = (1 << 10) - fx;
436
                short alpha1 = fx;
437

438
                short beta0 = (1 << 10) - fy;
439
                short beta1 = fy;
440

441
                short sx1 = sx + 1;
442
                short sy1 = sy + 1;
443

444
                const unsigned char* a0 = src0 + srcstride * sy + sx;
445
                const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
446
                const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
447
                const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
448

449
                if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
450
                {
451
                    a0 = type != -233 ? border_color : dst0;
452
                }
453
                if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
454
                {
455
                    a1 = type != -233 ? border_color : dst0;
456
                }
457
                if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
458
                {
459
                    b0 = type != -233 ? border_color : dst0;
460
                }
461
                if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
462
                {
463
                    b1 = type != -233 ? border_color : dst0;
464
                }
465

466
                dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
467
            }
468

469
            dst0 += 1;
470
        }
471

472
        dst0 += wgap;
473
    }
474

475
#undef SATURATE_CAST_SHORT
476
#undef SATURATE_CAST_INT
477
}
478

479
void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type, unsigned int v)
480
{
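    // two-channel variant of warpaffine_bilinear_c1: same fixed-point scheme, interpolating
    // 2 bytes per sample; the NEON path fetches a pixel and its right neighbour with a single
    // vld4_lane_u8 (4 consecutive bytes)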
481
    const unsigned char* border_color = (const unsigned char*)&v;
482
    const int wgap = stride - w * 2;
483

484
    const unsigned char* src0 = src;
485
    unsigned char* dst0 = dst;
486

487
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
488
#define SATURATE_CAST_INT(X)   (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)
489

490
    std::vector<int> adelta(w);
491
    std::vector<int> bdelta(w);
492
    for (int x = 0; x < w; x++)
493
    {
494
        adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
495
        bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
496
    }
497

498
    int y = 0;
499
    for (; y < h; y++)
500
    {
501
        int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
502
        int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));
503

504
        int x = 0;
505
        for (; x + 7 < w; x += 8)
506
        {
507
            int sxy_inout = 0;
508
            {
509
                int X_0 = X0 + adelta[x];
510
                int Y_0 = Y0 + bdelta[x];
511
                int X_7 = X0 + adelta[x + 7];
512
                int Y_7 = Y0 + bdelta[x + 7];
513

514
                short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
515
                short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
516
                short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
517
                short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));
518

519
                if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
520
                {
521
                    // all inside
522
                    sxy_inout = 1;
523
                }
524
                else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
525
                {
526
                    // all outside
527
                    sxy_inout = 2;
528
                }
529
            }
530

531
            if (sxy_inout == 1)
532
            {
533
                // all inside
534
#if __ARM_NEON
535
                int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
536
                int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
537
                int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
538
                int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));
539

540
                int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
541
                int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
542
                int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
543
                int16x4_t _syh = vqshrn_n_s32(_Yh, 10);
544

545
                uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
546
                uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
547
                uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));
548

549
                uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
550
                uint16x8_t _alpha1 = _fx;
551
                uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
552
                uint16x8_t _beta1 = _fy;
553

554
                int16x4_t _srcstride = vdup_n_s16(srcstride);
555
                int16x4_t _v2 = vdup_n_s16(2);
556

557
                int32x4_t _a0l = vmlal_s16(vmull_s16(_srcstride, _syl), _sxl, _v2);
558
                int32x4_t _a0h = vmlal_s16(vmull_s16(_srcstride, _syh), _sxh, _v2);
559
                int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
560
                int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);
561

562
                uint8x8x4_t _a0a1 = uint8x8x4_t();
563
                uint8x8x4_t _b0b1 = uint8x8x4_t();
564
                {
565
                    _a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0l, 0), _a0a1, 0);
566
                    _b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0l, 0), _b0b1, 0);
567

568
                    _a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0l, 1), _a0a1, 1);
569
                    _b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0l, 1), _b0b1, 1);
570

571
                    _a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0l, 2), _a0a1, 2);
572
                    _b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0l, 2), _b0b1, 2);
573

574
                    _a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0l, 3), _a0a1, 3);
575
                    _b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0l, 3), _b0b1, 3);
576

577
                    _a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0h, 0), _a0a1, 4);
578
                    _b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0h, 0), _b0b1, 4);
579

580
                    _a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0h, 1), _a0a1, 5);
581
                    _b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0h, 1), _b0b1, 5);
582

583
                    _a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0h, 2), _a0a1, 6);
584
                    _b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0h, 2), _b0b1, 6);
585

586
                    _a0a1 = vld4_lane_u8(src0 + vgetq_lane_s32(_a0h, 3), _a0a1, 7);
587
                    _b0b1 = vld4_lane_u8(src0 + vgetq_lane_s32(_b0h, 3), _b0b1, 7);
588
                }
589

590
                uint16x8_t _a0_0 = vmovl_u8(_a0a1.val[0]);
591
                uint16x8_t _a0_1 = vmovl_u8(_a0a1.val[1]);
592
                uint16x8_t _a1_0 = vmovl_u8(_a0a1.val[2]);
593
                uint16x8_t _a1_1 = vmovl_u8(_a0a1.val[3]);
594
                uint16x8_t _b0_0 = vmovl_u8(_b0b1.val[0]);
595
                uint16x8_t _b0_1 = vmovl_u8(_b0b1.val[1]);
596
                uint16x8_t _b1_0 = vmovl_u8(_b0b1.val[2]);
597
                uint16x8_t _b1_1 = vmovl_u8(_b0b1.val[3]);
598

599
                uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
600
                uint16x4_t _a00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_1), vget_low_u16(_alpha0)), vget_low_u16(_a1_1), vget_low_u16(_alpha1)), 5);
601
                uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
602
                uint16x4_t _a00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_1), vget_high_u16(_alpha0)), vget_high_u16(_a1_1), vget_high_u16(_alpha1)), 5);
603
                uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
604
                uint16x4_t _b00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_1), vget_low_u16(_alpha0)), vget_low_u16(_b1_1), vget_low_u16(_alpha1)), 5);
605
                uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
606
                uint16x4_t _b00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_1), vget_high_u16(_alpha0)), vget_high_u16(_b1_1), vget_high_u16(_alpha1)), 5);
607

608
                uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
609
                uint16x4_t _dst_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1l, vget_low_u16(_beta0)), _b00_1l, vget_low_u16(_beta1)), 15);
610
                uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
611
                uint16x4_t _dst_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1h, vget_high_u16(_beta0)), _b00_1h, vget_high_u16(_beta1)), 15);
612

613
                uint8x8x2_t _dst;
614
                _dst.val[0] = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
615
                _dst.val[1] = vqmovn_u16(vcombine_u16(_dst_1l, _dst_1h));
616

617
                vst2_u8(dst0, _dst);
618

619
                dst0 += 2 * 8;
620
#else
621
                for (int xi = 0; xi < 8; xi++)
622
                {
623
                    int X = X0 + adelta[x + xi];
624
                    int Y = Y0 + bdelta[x + xi];
625

626
                    short sx = SATURATE_CAST_SHORT((X >> 10));
627
                    short sy = SATURATE_CAST_SHORT((Y >> 10));
628

629
                    short fx = X & ((1 << 10) - 1);
630
                    short fy = Y & ((1 << 10) - 1);
631

632
                    short alpha0 = (1 << 10) - fx;
633
                    short alpha1 = fx;
634

635
                    short beta0 = (1 << 10) - fy;
636
                    short beta1 = fy;
637

638
                    const unsigned char* a0 = src0 + srcstride * sy + sx * 2;
639
                    const unsigned char* a1 = src0 + srcstride * sy + sx * 2 + 2;
640
                    const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 2;
641
                    const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 2 + 2;
642

643
                    dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
644
                    dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
645

646
                    dst0 += 2;
647
                }
648
#endif // __ARM_NEON
649
            }
650
            else if (sxy_inout == 2)
651
            {
652
                // all outside
653
                if (type != -233)
654
                {
655
#if __ARM_NEON
656
                    uint8x8x2_t _border_color;
657
                    _border_color.val[0] = vdup_n_u8(border_color[0]);
658
                    _border_color.val[1] = vdup_n_u8(border_color[1]);
659

660
                    vst2_u8(dst0, _border_color);
661
#else
662
                    for (int xi = 0; xi < 8; xi++)
663
                    {
664
                        dst0[xi * 2] = border_color[0];
665
                        dst0[xi * 2 + 1] = border_color[1];
666
                    }
667
#endif
668
                }
669
                else
670
                {
671
                    // skip
672
                }
673

674
                dst0 += 16;
675
            }
676
            else // if (sxy_inout == 0)
677
            {
678
                for (int xi = 0; xi < 8; xi++)
679
                {
680
                    int X = X0 + adelta[x + xi];
681
                    int Y = Y0 + bdelta[x + xi];
682

683
                    short sx = SATURATE_CAST_SHORT((X >> 10));
684
                    short sy = SATURATE_CAST_SHORT((Y >> 10));
685

686
                    if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
687
                    {
688
                        dst0[0] = border_color[0];
689
                        dst0[1] = border_color[1];
690
                    }
691
                    else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
692
                    {
693
                        // skip
694
                    }
695
                    else
696
                    {
697
                        short fx = X & ((1 << 10) - 1);
698
                        short fy = Y & ((1 << 10) - 1);
699

700
                        short alpha0 = (1 << 10) - fx;
701
                        short alpha1 = fx;
702

703
                        short beta0 = (1 << 10) - fy;
704
                        short beta1 = fy;
705

706
                        short sx1 = sx + 1;
707
                        short sy1 = sy + 1;
708

709
                        const unsigned char* a0 = src0 + srcstride * sy + sx * 2;
710
                        const unsigned char* a1 = src0 + srcstride * sy + sx * 2 + 2;
711
                        const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 2;
712
                        const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 2 + 2;
713

714
                        if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
715
                        {
716
                            a0 = type != -233 ? border_color : dst0;
717
                        }
718
                        if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
719
                        {
720
                            a1 = type != -233 ? border_color : dst0;
721
                        }
722
                        if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
723
                        {
724
                            b0 = type != -233 ? border_color : dst0;
725
                        }
726
                        if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
727
                        {
728
                            b1 = type != -233 ? border_color : dst0;
729
                        }
730

731
                        dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
732
                        dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
733
                    }
734

735
                    dst0 += 2;
736
                }
737
            }
738
        }
739
        for (; x < w; x++)
740
        {
741
            int X = X0 + adelta[x];
742
            int Y = Y0 + bdelta[x];
743

744
            short sx = SATURATE_CAST_SHORT((X >> 10));
745
            short sy = SATURATE_CAST_SHORT((Y >> 10));
746

747
            if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
748
            {
749
                dst0[0] = border_color[0];
750
                dst0[1] = border_color[1];
751
            }
752
            else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
753
            {
754
                // skip
755
            }
756
            else
757
            {
758
                short fx = X & ((1 << 10) - 1);
759
                short fy = Y & ((1 << 10) - 1);
760

761
                short alpha0 = (1 << 10) - fx;
762
                short alpha1 = fx;
763

764
                short beta0 = (1 << 10) - fy;
765
                short beta1 = fy;
766

767
                short sx1 = sx + 1;
768
                short sy1 = sy + 1;
769

770
                const unsigned char* a0 = src0 + srcstride * sy + sx * 2;
771
                const unsigned char* a1 = src0 + srcstride * sy + sx * 2 + 2;
772
                const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 2;
773
                const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 2 + 2;
774

775
                if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
776
                {
777
                    a0 = type != -233 ? border_color : dst0;
778
                }
779
                if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
780
                {
781
                    a1 = type != -233 ? border_color : dst0;
782
                }
783
                if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
784
                {
785
                    b0 = type != -233 ? border_color : dst0;
786
                }
787
                if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
788
                {
789
                    b1 = type != -233 ? border_color : dst0;
790
                }
791

792
                dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
793
                dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
794
            }
795

796
            dst0 += 2;
797
        }
798

799
        dst0 += wgap;
800
    }
801

802
#undef SATURATE_CAST_SHORT
803
#undef SATURATE_CAST_INT
804
}
805

806
void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type, unsigned int v)
807
{
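    // three-channel variant: same scheme as warpaffine_bilinear_c1; the right neighbour starts
    // 3 bytes further on, so the NEON path gathers a0/a1/b0/b1 with separate vld3_lane_u8 loads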
808
    const unsigned char* border_color = (const unsigned char*)&v;
809
    const int wgap = stride - w * 3;
810

811
    const unsigned char* src0 = src;
812
    unsigned char* dst0 = dst;
813

814
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
815
#define SATURATE_CAST_INT(X)   (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)
816

817
    std::vector<int> adelta(w);
818
    std::vector<int> bdelta(w);
819
    for (int x = 0; x < w; x++)
820
    {
821
        adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
822
        bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
823
    }
824

825
    int y = 0;
826
    for (; y < h; y++)
827
    {
828
        int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
829
        int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));
830

831
        int x = 0;
832
        for (; x + 7 < w; x += 8)
833
        {
834
            int sxy_inout = 0;
835
            {
836
                int X_0 = X0 + adelta[x];
837
                int Y_0 = Y0 + bdelta[x];
838
                int X_7 = X0 + adelta[x + 7];
839
                int Y_7 = Y0 + bdelta[x + 7];
840

841
                short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
842
                short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
843
                short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
844
                short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));
845

846
                if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
847
                {
848
                    // all inside
849
                    sxy_inout = 1;
850
                }
851
                else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
852
                {
853
                    // all outside
854
                    sxy_inout = 2;
855
                }
856
            }
857

858
            if (sxy_inout == 1)
859
            {
860
                // all inside
861
#if __ARM_NEON
862
                int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
863
                int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
864
                int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
865
                int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));
866

867
                int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
868
                int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
869
                int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
870
                int16x4_t _syh = vqshrn_n_s32(_Yh, 10);
871

872
                uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
873
                uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
874
                uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));
875

876
                uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
877
                uint16x8_t _alpha1 = _fx;
878
                uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
879
                uint16x8_t _beta1 = _fy;
880

881
                int16x4_t _srcstride = vdup_n_s16(srcstride);
882
                int16x4_t _v3 = vdup_n_s16(3);
883

884
                int32x4_t _a0l = vmlal_s16(vmull_s16(_srcstride, _syl), _sxl, _v3);
885
                int32x4_t _a0h = vmlal_s16(vmull_s16(_srcstride, _syh), _sxh, _v3);
886
                int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
887
                int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);
888
                int32x4_t _a1l = vaddw_s16(_a0l, _v3);
889
                int32x4_t _a1h = vaddw_s16(_a0h, _v3);
890
                int32x4_t _b1l = vaddw_s16(_b0l, _v3);
891
                int32x4_t _b1h = vaddw_s16(_b0h, _v3);
892

893
                uint8x8x3_t _a0 = uint8x8x3_t();
894
                uint8x8x3_t _a1 = uint8x8x3_t();
895
                uint8x8x3_t _b0 = uint8x8x3_t();
896
                uint8x8x3_t _b1 = uint8x8x3_t();
897
                {
898
                    _a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0l, 0), _a0, 0);
899
                    _a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1l, 0), _a1, 0);
900
                    _b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0l, 0), _b0, 0);
901
                    _b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1l, 0), _b1, 0);
902

903
                    _a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0l, 1), _a0, 1);
904
                    _a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1l, 1), _a1, 1);
905
                    _b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0l, 1), _b0, 1);
906
                    _b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1l, 1), _b1, 1);
907

908
                    _a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0l, 2), _a0, 2);
909
                    _a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1l, 2), _a1, 2);
910
                    _b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0l, 2), _b0, 2);
911
                    _b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1l, 2), _b1, 2);
912

913
                    _a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0l, 3), _a0, 3);
914
                    _a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1l, 3), _a1, 3);
915
                    _b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0l, 3), _b0, 3);
916
                    _b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1l, 3), _b1, 3);
917

918
                    _a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0h, 0), _a0, 4);
919
                    _a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1h, 0), _a1, 4);
920
                    _b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0h, 0), _b0, 4);
921
                    _b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1h, 0), _b1, 4);
922

923
                    _a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0h, 1), _a0, 5);
924
                    _a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1h, 1), _a1, 5);
925
                    _b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0h, 1), _b0, 5);
926
                    _b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1h, 1), _b1, 5);
927

928
                    _a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0h, 2), _a0, 6);
929
                    _a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1h, 2), _a1, 6);
930
                    _b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0h, 2), _b0, 6);
931
                    _b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1h, 2), _b1, 6);
932

933
                    _a0 = vld3_lane_u8(src0 + vgetq_lane_s32(_a0h, 3), _a0, 7);
934
                    _a1 = vld3_lane_u8(src0 + vgetq_lane_s32(_a1h, 3), _a1, 7);
935
                    _b0 = vld3_lane_u8(src0 + vgetq_lane_s32(_b0h, 3), _b0, 7);
936
                    _b1 = vld3_lane_u8(src0 + vgetq_lane_s32(_b1h, 3), _b1, 7);
937
                }
938

939
                uint16x8_t _a0_0 = vmovl_u8(_a0.val[0]);
940
                uint16x8_t _a0_1 = vmovl_u8(_a0.val[1]);
941
                uint16x8_t _a0_2 = vmovl_u8(_a0.val[2]);
942
                uint16x8_t _a1_0 = vmovl_u8(_a1.val[0]);
943
                uint16x8_t _a1_1 = vmovl_u8(_a1.val[1]);
944
                uint16x8_t _a1_2 = vmovl_u8(_a1.val[2]);
945
                uint16x8_t _b0_0 = vmovl_u8(_b0.val[0]);
946
                uint16x8_t _b0_1 = vmovl_u8(_b0.val[1]);
947
                uint16x8_t _b0_2 = vmovl_u8(_b0.val[2]);
948
                uint16x8_t _b1_0 = vmovl_u8(_b1.val[0]);
949
                uint16x8_t _b1_1 = vmovl_u8(_b1.val[1]);
950
                uint16x8_t _b1_2 = vmovl_u8(_b1.val[2]);
951

952
                uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
953
                uint16x4_t _a00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_1), vget_low_u16(_alpha0)), vget_low_u16(_a1_1), vget_low_u16(_alpha1)), 5);
954
                uint16x4_t _a00_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_2), vget_low_u16(_alpha0)), vget_low_u16(_a1_2), vget_low_u16(_alpha1)), 5);
955
                uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
956
                uint16x4_t _a00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_1), vget_high_u16(_alpha0)), vget_high_u16(_a1_1), vget_high_u16(_alpha1)), 5);
957
                uint16x4_t _a00_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_2), vget_high_u16(_alpha0)), vget_high_u16(_a1_2), vget_high_u16(_alpha1)), 5);
958
                uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
959
                uint16x4_t _b00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_1), vget_low_u16(_alpha0)), vget_low_u16(_b1_1), vget_low_u16(_alpha1)), 5);
960
                uint16x4_t _b00_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_2), vget_low_u16(_alpha0)), vget_low_u16(_b1_2), vget_low_u16(_alpha1)), 5);
961
                uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
962
                uint16x4_t _b00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_1), vget_high_u16(_alpha0)), vget_high_u16(_b1_1), vget_high_u16(_alpha1)), 5);
963
                uint16x4_t _b00_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_2), vget_high_u16(_alpha0)), vget_high_u16(_b1_2), vget_high_u16(_alpha1)), 5);
964

965
                uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
966
                uint16x4_t _dst_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1l, vget_low_u16(_beta0)), _b00_1l, vget_low_u16(_beta1)), 15);
967
                uint16x4_t _dst_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_2l, vget_low_u16(_beta0)), _b00_2l, vget_low_u16(_beta1)), 15);
968
                uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
969
                uint16x4_t _dst_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1h, vget_high_u16(_beta0)), _b00_1h, vget_high_u16(_beta1)), 15);
970
                uint16x4_t _dst_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_2h, vget_high_u16(_beta0)), _b00_2h, vget_high_u16(_beta1)), 15);
971

972
                uint8x8x3_t _dst;
973
                _dst.val[0] = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
974
                _dst.val[1] = vqmovn_u16(vcombine_u16(_dst_1l, _dst_1h));
975
                _dst.val[2] = vqmovn_u16(vcombine_u16(_dst_2l, _dst_2h));
976

977
                vst3_u8(dst0, _dst);
978

979
                dst0 += 3 * 8;
980
#else
981
                for (int xi = 0; xi < 8; xi++)
982
                {
983
                    int X = X0 + adelta[x + xi];
984
                    int Y = Y0 + bdelta[x + xi];
985

986
                    short sx = SATURATE_CAST_SHORT((X >> 10));
987
                    short sy = SATURATE_CAST_SHORT((Y >> 10));
988

989
                    short fx = X & ((1 << 10) - 1);
990
                    short fy = Y & ((1 << 10) - 1);
991

992
                    short alpha0 = (1 << 10) - fx;
993
                    short alpha1 = fx;
994

995
                    short beta0 = (1 << 10) - fy;
996
                    short beta1 = fy;
997

998
                    const unsigned char* a0 = src0 + srcstride * sy + sx * 3;
999
                    const unsigned char* a1 = src0 + srcstride * sy + sx * 3 + 3;
1000
                    const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 3;
1001
                    const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 3 + 3;
1002

1003
                    dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
1004
                    dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
1005
                    dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
1006

1007
                    dst0 += 3;
1008
                }
1009
#endif // __ARM_NEON
1010
            }
1011
            else if (sxy_inout == 2)
1012
            {
1013
                // all outside
1014
                if (type != -233)
1015
                {
1016
#if __ARM_NEON
1017
                    uint8x8x3_t _border_color;
1018
                    _border_color.val[0] = vdup_n_u8(border_color[0]);
1019
                    _border_color.val[1] = vdup_n_u8(border_color[1]);
1020
                    _border_color.val[2] = vdup_n_u8(border_color[2]);
1021

1022
                    vst3_u8(dst0, _border_color);
1023
#else
1024
                    for (int xi = 0; xi < 8; xi++)
1025
                    {
1026
                        dst0[xi * 3] = border_color[0];
1027
                        dst0[xi * 3 + 1] = border_color[1];
1028
                        dst0[xi * 3 + 2] = border_color[2];
1029
                    }
1030
#endif // __ARM_NEON
1031
                }
1032
                else
1033
                {
1034
                    // skip
1035
                }
1036

1037
                dst0 += 24;
1038
            }
1039
            else // if (sxy_inout == 0)
1040
            {
1041
                for (int xi = 0; xi < 8; xi++)
1042
                {
1043
                    int X = X0 + adelta[x + xi];
1044
                    int Y = Y0 + bdelta[x + xi];
1045

1046
                    short sx = SATURATE_CAST_SHORT((X >> 10));
1047
                    short sy = SATURATE_CAST_SHORT((Y >> 10));
1048

1049
                    if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
1050
                    {
1051
                        dst0[0] = border_color[0];
1052
                        dst0[1] = border_color[1];
1053
                        dst0[2] = border_color[2];
1054
                    }
1055
                    else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
1056
                    {
1057
                        // skip
1058
                    }
1059
                    else
1060
                    {
1061
                        short fx = X & ((1 << 10) - 1);
1062
                        short fy = Y & ((1 << 10) - 1);
1063

1064
                        short alpha0 = (1 << 10) - fx;
1065
                        short alpha1 = fx;
1066

1067
                        short beta0 = (1 << 10) - fy;
1068
                        short beta1 = fy;
1069

1070
                        short sx1 = sx + 1;
1071
                        short sy1 = sy + 1;
1072

1073
                        const unsigned char* a0 = src0 + srcstride * sy + sx * 3;
1074
                        const unsigned char* a1 = src0 + srcstride * sy + sx * 3 + 3;
1075
                        const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 3;
1076
                        const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 3 + 3;
1077

1078
                        if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
1079
                        {
1080
                            a0 = type != -233 ? border_color : dst0;
1081
                        }
1082
                        if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
1083
                        {
1084
                            a1 = type != -233 ? border_color : dst0;
1085
                        }
1086
                        if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
1087
                        {
1088
                            b0 = type != -233 ? border_color : dst0;
1089
                        }
1090
                        if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
1091
                        {
1092
                            b1 = type != -233 ? border_color : dst0;
1093
                        }
1094

1095
                        dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
1096
                        dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
1097
                        dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
1098
                    }
1099

1100
                    dst0 += 3;
1101
                }
1102
            }
1103
        }
1104
        for (; x < w; x++)
        {
            int X = X0 + adelta[x];
            int Y = Y0 + bdelta[x];

            short sx = SATURATE_CAST_SHORT((X >> 10));
            short sy = SATURATE_CAST_SHORT((Y >> 10));

            if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
            {
                dst0[0] = border_color[0];
                dst0[1] = border_color[1];
                dst0[2] = border_color[2];
            }
            else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
            {
                // skip
            }
            else
            {
                short fx = X & ((1 << 10) - 1);
                short fy = Y & ((1 << 10) - 1);

                short alpha0 = (1 << 10) - fx;
                short alpha1 = fx;

                short beta0 = (1 << 10) - fy;
                short beta1 = fy;

                short sx1 = sx + 1;
                short sy1 = sy + 1;

                const unsigned char* a0 = src0 + srcstride * sy + sx * 3;
                const unsigned char* a1 = src0 + srcstride * sy + sx * 3 + 3;
                const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 3;
                const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 3 + 3;

                if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
                {
                    a0 = type != -233 ? border_color : dst0;
                }
                if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
                {
                    a1 = type != -233 ? border_color : dst0;
                }
                if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
                {
                    b0 = type != -233 ? border_color : dst0;
                }
                if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
                {
                    b1 = type != -233 ? border_color : dst0;
                }

                dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
                dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
                dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
            }

            dst0 += 3;
        }

        dst0 += wgap;
    }

#undef SATURATE_CAST_SHORT
#undef SATURATE_CAST_INT
}

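// 4-channel (e.g. RGBA) affine warp with bilinear sampling. tm maps destination
// (x, y) to source coordinates; type == -233 leaves pixels that map outside the
// source untouched (transparent border), any other value fills them with the
// packed border color v. Stride arguments are in bytes.
//
// A rough usage sketch (hypothetical buffers; tm must already be the dst->src
// matrix, e.g. obtained with invert_affine_transform from a forward transform):
//   ncnn::warpaffine_bilinear_c4(src_rgba, srcw, srch, srcw * 4, dst_rgba, w, h, w * 4, tm, 0, 0);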
void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type, unsigned int v)
{
    const unsigned char* border_color = (const unsigned char*)&v;
    const int wgap = stride - w * 4;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst;

#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
#define SATURATE_CAST_INT(X)   (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)

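    // precompute the x-dependent part of the affine mapping in 22.10 fixed point;
    // the y-dependent part (X0/Y0 below) is added per row, so each sample costs
    // only one integer add per axis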
    std::vector<int> adelta(w);
    std::vector<int> bdelta(w);
    for (int x = 0; x < w; x++)
    {
        adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
        bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
    }

    int y = 0;
    for (; y < h; y++)
    {
        int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
        int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));

        int x = 0;
        for (; x + 7 < w; x += 8)
        {
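            // classify this 8-pixel span by its two endpoints: the mapping is linear
            // along a row, so if both endpoints land strictly inside the source the whole
            // span does (sxy_inout = 1), and if both land on the same outside side the
            // whole span is outside (sxy_inout = 2); anything else is handled per pixel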
            int sxy_inout = 0;
            {
                int X_0 = X0 + adelta[x];
                int Y_0 = Y0 + bdelta[x];
                int X_7 = X0 + adelta[x + 7];
                int Y_7 = Y0 + bdelta[x + 7];

                short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
                short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
                short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
                short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));

                if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
                {
                    // all inside
                    sxy_inout = 1;
                }
                else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
                {
                    // all outside
                    sxy_inout = 2;
                }
            }

            if (sxy_inout == 1)
            {
                // all inside
#if __ARM_NEON
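                // NEON path: compute the eight source offsets, gather each pixel together
                // with its right neighbour, transpose to per-channel vectors, and do the
                // bilinear blend in 16/32-bit lanes before storing 8 interleaved RGBA pixels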
                int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
                int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
                int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
                int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));

                int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
                int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
                int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
                int16x4_t _syh = vqshrn_n_s32(_Yh, 10);

                uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
                uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
                uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));

                uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
                uint16x8_t _alpha1 = _fx;
                uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
                uint16x8_t _beta1 = _fy;

                int16x4_t _srcstride = vdup_n_s16(srcstride);
                int16x4_t _v4 = vdup_n_s16(4);

                int32x4_t _a0l = vmlal_s16(vmull_s16(_srcstride, _syl), _sxl, _v4);
                int32x4_t _a0h = vmlal_s16(vmull_s16(_srcstride, _syh), _sxh, _v4);
                int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
                int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);

                uint8x8x4_t _a0 = uint8x8x4_t();
                uint8x8x4_t _a1 = uint8x8x4_t();
                uint8x8x4_t _b0 = uint8x8x4_t();
                uint8x8x4_t _b1 = uint8x8x4_t();
                {
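                    // each vld1_u8 pulls 8 bytes: the RGBA pixel at (sx, sy) and its right
                    // neighbour; the 8x8 byte transpose below separates the lanes into
                    // per-channel a0 (left) and a1 (right) vectors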
                    uint8x8_t _a0a1_0 = vld1_u8(src0 + vgetq_lane_s32(_a0l, 0));
                    uint8x8_t _a0a1_1 = vld1_u8(src0 + vgetq_lane_s32(_a0l, 1));
                    uint8x8_t _a0a1_2 = vld1_u8(src0 + vgetq_lane_s32(_a0l, 2));
                    uint8x8_t _a0a1_3 = vld1_u8(src0 + vgetq_lane_s32(_a0l, 3));
                    uint8x8_t _a0a1_4 = vld1_u8(src0 + vgetq_lane_s32(_a0h, 0));
                    uint8x8_t _a0a1_5 = vld1_u8(src0 + vgetq_lane_s32(_a0h, 1));
                    uint8x8_t _a0a1_6 = vld1_u8(src0 + vgetq_lane_s32(_a0h, 2));
                    uint8x8_t _a0a1_7 = vld1_u8(src0 + vgetq_lane_s32(_a0h, 3));

                    // transpose 8x8
                    uint8x8x2_t _a0a101t_r = vtrn_u8(_a0a1_0, _a0a1_1);
                    uint8x8x2_t _a0a123t_r = vtrn_u8(_a0a1_2, _a0a1_3);
                    uint8x8x2_t _a0a145t_r = vtrn_u8(_a0a1_4, _a0a1_5);
                    uint8x8x2_t _a0a167t_r = vtrn_u8(_a0a1_6, _a0a1_7);

                    uint16x4x2_t _a0a102tt_r = vtrn_u16(vreinterpret_u16_u8(_a0a101t_r.val[0]), vreinterpret_u16_u8(_a0a123t_r.val[0]));
                    uint16x4x2_t _a0a113tt_r = vtrn_u16(vreinterpret_u16_u8(_a0a101t_r.val[1]), vreinterpret_u16_u8(_a0a123t_r.val[1]));
                    uint16x4x2_t _a0a146tt_r = vtrn_u16(vreinterpret_u16_u8(_a0a145t_r.val[0]), vreinterpret_u16_u8(_a0a167t_r.val[0]));
                    uint16x4x2_t _a0a157tt_r = vtrn_u16(vreinterpret_u16_u8(_a0a145t_r.val[1]), vreinterpret_u16_u8(_a0a167t_r.val[1]));

                    uint32x2x2_t _a0a104ttt_r = vtrn_u32(vreinterpret_u32_u16(_a0a102tt_r.val[0]), vreinterpret_u32_u16(_a0a146tt_r.val[0]));
                    uint32x2x2_t _a0a115ttt_r = vtrn_u32(vreinterpret_u32_u16(_a0a113tt_r.val[0]), vreinterpret_u32_u16(_a0a157tt_r.val[0]));
                    uint32x2x2_t _a0a126ttt_r = vtrn_u32(vreinterpret_u32_u16(_a0a102tt_r.val[1]), vreinterpret_u32_u16(_a0a146tt_r.val[1]));
                    uint32x2x2_t _a0a137ttt_r = vtrn_u32(vreinterpret_u32_u16(_a0a113tt_r.val[1]), vreinterpret_u32_u16(_a0a157tt_r.val[1]));

                    _a0.val[0] = vreinterpret_u8_u32(_a0a104ttt_r.val[0]);
                    _a0.val[1] = vreinterpret_u8_u32(_a0a115ttt_r.val[0]);
                    _a0.val[2] = vreinterpret_u8_u32(_a0a126ttt_r.val[0]);
                    _a0.val[3] = vreinterpret_u8_u32(_a0a137ttt_r.val[0]);
                    _a1.val[0] = vreinterpret_u8_u32(_a0a104ttt_r.val[1]);
                    _a1.val[1] = vreinterpret_u8_u32(_a0a115ttt_r.val[1]);
                    _a1.val[2] = vreinterpret_u8_u32(_a0a126ttt_r.val[1]);
                    _a1.val[3] = vreinterpret_u8_u32(_a0a137ttt_r.val[1]);

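                    // same gather + 8x8 transpose for the row below, giving b0 (left) and b1 (right)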
                    uint8x8_t _b0b1_0 = vld1_u8(src0 + vgetq_lane_s32(_b0l, 0));
                    uint8x8_t _b0b1_1 = vld1_u8(src0 + vgetq_lane_s32(_b0l, 1));
                    uint8x8_t _b0b1_2 = vld1_u8(src0 + vgetq_lane_s32(_b0l, 2));
                    uint8x8_t _b0b1_3 = vld1_u8(src0 + vgetq_lane_s32(_b0l, 3));
                    uint8x8_t _b0b1_4 = vld1_u8(src0 + vgetq_lane_s32(_b0h, 0));
                    uint8x8_t _b0b1_5 = vld1_u8(src0 + vgetq_lane_s32(_b0h, 1));
                    uint8x8_t _b0b1_6 = vld1_u8(src0 + vgetq_lane_s32(_b0h, 2));
                    uint8x8_t _b0b1_7 = vld1_u8(src0 + vgetq_lane_s32(_b0h, 3));

                    // transpose 8x8
                    uint8x8x2_t _b0b101t_r = vtrn_u8(_b0b1_0, _b0b1_1);
                    uint8x8x2_t _b0b123t_r = vtrn_u8(_b0b1_2, _b0b1_3);
                    uint8x8x2_t _b0b145t_r = vtrn_u8(_b0b1_4, _b0b1_5);
                    uint8x8x2_t _b0b167t_r = vtrn_u8(_b0b1_6, _b0b1_7);

                    uint16x4x2_t _b0b102tt_r = vtrn_u16(vreinterpret_u16_u8(_b0b101t_r.val[0]), vreinterpret_u16_u8(_b0b123t_r.val[0]));
                    uint16x4x2_t _b0b113tt_r = vtrn_u16(vreinterpret_u16_u8(_b0b101t_r.val[1]), vreinterpret_u16_u8(_b0b123t_r.val[1]));
                    uint16x4x2_t _b0b146tt_r = vtrn_u16(vreinterpret_u16_u8(_b0b145t_r.val[0]), vreinterpret_u16_u8(_b0b167t_r.val[0]));
                    uint16x4x2_t _b0b157tt_r = vtrn_u16(vreinterpret_u16_u8(_b0b145t_r.val[1]), vreinterpret_u16_u8(_b0b167t_r.val[1]));

                    uint32x2x2_t _b0b104ttt_r = vtrn_u32(vreinterpret_u32_u16(_b0b102tt_r.val[0]), vreinterpret_u32_u16(_b0b146tt_r.val[0]));
                    uint32x2x2_t _b0b115ttt_r = vtrn_u32(vreinterpret_u32_u16(_b0b113tt_r.val[0]), vreinterpret_u32_u16(_b0b157tt_r.val[0]));
                    uint32x2x2_t _b0b126ttt_r = vtrn_u32(vreinterpret_u32_u16(_b0b102tt_r.val[1]), vreinterpret_u32_u16(_b0b146tt_r.val[1]));
                    uint32x2x2_t _b0b137ttt_r = vtrn_u32(vreinterpret_u32_u16(_b0b113tt_r.val[1]), vreinterpret_u32_u16(_b0b157tt_r.val[1]));

                    _b0.val[0] = vreinterpret_u8_u32(_b0b104ttt_r.val[0]);
                    _b0.val[1] = vreinterpret_u8_u32(_b0b115ttt_r.val[0]);
                    _b0.val[2] = vreinterpret_u8_u32(_b0b126ttt_r.val[0]);
                    _b0.val[3] = vreinterpret_u8_u32(_b0b137ttt_r.val[0]);
                    _b1.val[0] = vreinterpret_u8_u32(_b0b104ttt_r.val[1]);
                    _b1.val[1] = vreinterpret_u8_u32(_b0b115ttt_r.val[1]);
                    _b1.val[2] = vreinterpret_u8_u32(_b0b126ttt_r.val[1]);
                    _b1.val[3] = vreinterpret_u8_u32(_b0b137ttt_r.val[1]);
                }

                uint16x8_t _a0_0 = vmovl_u8(_a0.val[0]);
                uint16x8_t _a0_1 = vmovl_u8(_a0.val[1]);
                uint16x8_t _a0_2 = vmovl_u8(_a0.val[2]);
                uint16x8_t _a0_3 = vmovl_u8(_a0.val[3]);
                uint16x8_t _a1_0 = vmovl_u8(_a1.val[0]);
                uint16x8_t _a1_1 = vmovl_u8(_a1.val[1]);
                uint16x8_t _a1_2 = vmovl_u8(_a1.val[2]);
                uint16x8_t _a1_3 = vmovl_u8(_a1.val[3]);
                uint16x8_t _b0_0 = vmovl_u8(_b0.val[0]);
                uint16x8_t _b0_1 = vmovl_u8(_b0.val[1]);
                uint16x8_t _b0_2 = vmovl_u8(_b0.val[2]);
                uint16x8_t _b0_3 = vmovl_u8(_b0.val[3]);
                uint16x8_t _b1_0 = vmovl_u8(_b1.val[0]);
                uint16x8_t _b1_1 = vmovl_u8(_b1.val[1]);
                uint16x8_t _b1_2 = vmovl_u8(_b1.val[2]);
                uint16x8_t _b1_3 = vmovl_u8(_b1.val[3]);

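                // horizontal blend: (a0 * alpha0 + a1 * alpha1) >> 5 keeps the intermediate
                // within 16 bits; the vertical blend with beta then shifts by 15 to remove
                // the remaining weight scale (5 + 15 = the 20 bits carried by alpha * beta)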
                uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
                uint16x4_t _a00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_1), vget_low_u16(_alpha0)), vget_low_u16(_a1_1), vget_low_u16(_alpha1)), 5);
                uint16x4_t _a00_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_2), vget_low_u16(_alpha0)), vget_low_u16(_a1_2), vget_low_u16(_alpha1)), 5);
                uint16x4_t _a00_3l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_3), vget_low_u16(_alpha0)), vget_low_u16(_a1_3), vget_low_u16(_alpha1)), 5);
                uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
                uint16x4_t _a00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_1), vget_high_u16(_alpha0)), vget_high_u16(_a1_1), vget_high_u16(_alpha1)), 5);
                uint16x4_t _a00_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_2), vget_high_u16(_alpha0)), vget_high_u16(_a1_2), vget_high_u16(_alpha1)), 5);
                uint16x4_t _a00_3h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_3), vget_high_u16(_alpha0)), vget_high_u16(_a1_3), vget_high_u16(_alpha1)), 5);
                uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
                uint16x4_t _b00_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_1), vget_low_u16(_alpha0)), vget_low_u16(_b1_1), vget_low_u16(_alpha1)), 5);
                uint16x4_t _b00_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_2), vget_low_u16(_alpha0)), vget_low_u16(_b1_2), vget_low_u16(_alpha1)), 5);
                uint16x4_t _b00_3l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_3), vget_low_u16(_alpha0)), vget_low_u16(_b1_3), vget_low_u16(_alpha1)), 5);
                uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
                uint16x4_t _b00_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_1), vget_high_u16(_alpha0)), vget_high_u16(_b1_1), vget_high_u16(_alpha1)), 5);
                uint16x4_t _b00_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_2), vget_high_u16(_alpha0)), vget_high_u16(_b1_2), vget_high_u16(_alpha1)), 5);
                uint16x4_t _b00_3h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_3), vget_high_u16(_alpha0)), vget_high_u16(_b1_3), vget_high_u16(_alpha1)), 5);

                uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
                uint16x4_t _dst_1l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1l, vget_low_u16(_beta0)), _b00_1l, vget_low_u16(_beta1)), 15);
                uint16x4_t _dst_2l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_2l, vget_low_u16(_beta0)), _b00_2l, vget_low_u16(_beta1)), 15);
                uint16x4_t _dst_3l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_3l, vget_low_u16(_beta0)), _b00_3l, vget_low_u16(_beta1)), 15);
                uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
                uint16x4_t _dst_1h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_1h, vget_high_u16(_beta0)), _b00_1h, vget_high_u16(_beta1)), 15);
                uint16x4_t _dst_2h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_2h, vget_high_u16(_beta0)), _b00_2h, vget_high_u16(_beta1)), 15);
                uint16x4_t _dst_3h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_3h, vget_high_u16(_beta0)), _b00_3h, vget_high_u16(_beta1)), 15);

                uint8x8x4_t _dst;
                _dst.val[0] = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
                _dst.val[1] = vqmovn_u16(vcombine_u16(_dst_1l, _dst_1h));
                _dst.val[2] = vqmovn_u16(vcombine_u16(_dst_2l, _dst_2h));
                _dst.val[3] = vqmovn_u16(vcombine_u16(_dst_3l, _dst_3h));

                vst4_u8(dst0, _dst);

                dst0 += 4 * 8;
#else
                for (int xi = 0; xi < 8; xi++)
                {
                    int X = X0 + adelta[x + xi];
                    int Y = Y0 + bdelta[x + xi];

                    short sx = SATURATE_CAST_SHORT((X >> 10));
                    short sy = SATURATE_CAST_SHORT((Y >> 10));

                    short fx = X & ((1 << 10) - 1);
                    short fy = Y & ((1 << 10) - 1);

                    short alpha0 = (1 << 10) - fx;
                    short alpha1 = fx;

                    short beta0 = (1 << 10) - fy;
                    short beta1 = fy;

                    const unsigned char* a0 = src0 + srcstride * sy + sx * 4;
                    const unsigned char* a1 = src0 + srcstride * sy + sx * 4 + 4;
                    const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 4;
                    const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 4 + 4;

                    dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
                    dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
                    dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
                    dst0[3] = (unsigned char)(((((unsigned short)((a0[3] * alpha0 + a1[3] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[3] * alpha0 + b1[3] * alpha1) >> 5) * beta1))) >> 15);

                    dst0 += 4;
                }
#endif // __ARM_NEON
            }
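            // the whole span maps outside the source: fill it with the border color,
            // or leave the destination untouched for the transparent border (type == -233)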
            else if (sxy_inout == 2)
            {
                // all outside
                if (type != -233)
                {
#if __ARM_NEON
                    uint8x8x4_t _border_color;
                    _border_color.val[0] = vdup_n_u8(border_color[0]);
                    _border_color.val[1] = vdup_n_u8(border_color[1]);
                    _border_color.val[2] = vdup_n_u8(border_color[2]);
                    _border_color.val[3] = vdup_n_u8(border_color[3]);

                    vst4_u8(dst0, _border_color);
#else
                    for (int xi = 0; xi < 8; xi++)
                    {
                        dst0[xi * 4] = border_color[0];
                        dst0[xi * 4 + 1] = border_color[1];
                        dst0[xi * 4 + 2] = border_color[2];
                        dst0[xi * 4 + 3] = border_color[3];
                    }
#endif // __ARM_NEON
                }
                else
                {
                    // skip
                }

                dst0 += 32;
            }
            else // if (sxy_inout == 0)
            {
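                // mixed span: some pixels map inside the source and some do not, so run
                // the fully checked per-pixel path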
                for (int xi = 0; xi < 8; xi++)
                {
                    int X = X0 + adelta[x + xi];
                    int Y = Y0 + bdelta[x + xi];

                    short sx = SATURATE_CAST_SHORT((X >> 10));
                    short sy = SATURATE_CAST_SHORT((Y >> 10));

                    if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
                    {
                        dst0[0] = border_color[0];
                        dst0[1] = border_color[1];
                        dst0[2] = border_color[2];
                        dst0[3] = border_color[3];
                    }
                    else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
                    {
                        // skip
                    }
                    else
                    {
                        short fx = X & ((1 << 10) - 1);
                        short fy = Y & ((1 << 10) - 1);

                        short alpha0 = (1 << 10) - fx;
                        short alpha1 = fx;

                        short beta0 = (1 << 10) - fy;
                        short beta1 = fy;

                        short sx1 = sx + 1;
                        short sy1 = sy + 1;

                        const unsigned char* a0 = src0 + srcstride * sy + sx * 4;
                        const unsigned char* a1 = src0 + srcstride * sy + sx * 4 + 4;
                        const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 4;
                        const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 4 + 4;

                        if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
                        {
                            a0 = type != -233 ? border_color : dst0;
                        }
                        if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
                        {
                            a1 = type != -233 ? border_color : dst0;
                        }
                        if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
                        {
                            b0 = type != -233 ? border_color : dst0;
                        }
                        if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
                        {
                            b1 = type != -233 ? border_color : dst0;
                        }

                        dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
                        dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
                        dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
                        dst0[3] = (unsigned char)(((((unsigned short)((a0[3] * alpha0 + a1[3] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[3] * alpha0 + b1[3] * alpha1) >> 5) * beta1))) >> 15);
                    }

                    dst0 += 4;
                }
            }
        }
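        // handle the remaining columns (w not a multiple of 8) one pixel at a time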
        for (; x < w; x++)
        {
            int X = X0 + adelta[x];
            int Y = Y0 + bdelta[x];

            short sx = SATURATE_CAST_SHORT((X >> 10));
            short sy = SATURATE_CAST_SHORT((Y >> 10));

            if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
            {
                dst0[0] = border_color[0];
                dst0[1] = border_color[1];
                dst0[2] = border_color[2];
                dst0[3] = border_color[3];
            }
            else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
            {
                // skip
            }
            else
            {
                short fx = X & ((1 << 10) - 1);
                short fy = Y & ((1 << 10) - 1);

                short alpha0 = (1 << 10) - fx;
                short alpha1 = fx;

                short beta0 = (1 << 10) - fy;
                short beta1 = fy;

                short sx1 = sx + 1;
                short sy1 = sy + 1;

                const unsigned char* a0 = src0 + srcstride * sy + sx * 4;
                const unsigned char* a1 = src0 + srcstride * sy + sx * 4 + 4;
                const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx * 4;
                const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx * 4 + 4;

                if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
                {
                    a0 = type != -233 ? border_color : dst0;
                }
                if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
                {
                    a1 = type != -233 ? border_color : dst0;
                }
                if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
                {
                    b0 = type != -233 ? border_color : dst0;
                }
                if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
                {
                    b1 = type != -233 ? border_color : dst0;
                }

                dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
                dst0[1] = (unsigned char)(((((unsigned short)((a0[1] * alpha0 + a1[1] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[1] * alpha0 + b1[1] * alpha1) >> 5) * beta1))) >> 15);
                dst0[2] = (unsigned char)(((((unsigned short)((a0[2] * alpha0 + a1[2] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[2] * alpha0 + b1[2] * alpha1) >> 5) * beta1))) >> 15);
                dst0[3] = (unsigned char)(((((unsigned short)((a0[3] * alpha0 + a1[3] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[3] * alpha0 + b1[3] * alpha1) >> 5) * beta1))) >> 15);
            }

            dst0 += 4;
        }

        dst0 += wgap;
    }

#undef SATURATE_CAST_SHORT
#undef SATURATE_CAST_INT
}

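// semi-planar YUV 4:2:0 (NV12/NV21): the full-resolution Y plane is warped as a
// single-channel image and the interleaved half-resolution UV plane as a
// two-channel image, so each chroma pair stays together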
void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type, unsigned int v)
{
    // assert srcw % 2 == 0
    // assert srch % 2 == 0
    // assert w % 2 == 0
    // assert h % 2 == 0

    const unsigned char* border_color = (const unsigned char*)&v;

    unsigned int v_y;
    unsigned int v_uv;
    unsigned char* border_color_y = (unsigned char*)&v_y;
    unsigned char* border_color_uv = (unsigned char*)&v_uv;
    border_color_y[0] = border_color[0];
    border_color_uv[0] = border_color[1];
    border_color_uv[1] = border_color[2];

    const unsigned char* srcY = src;
    unsigned char* dstY = dst;
    warpaffine_bilinear_c1(srcY, srcw, srch, dstY, w, h, tm, type, v_y);

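    // the UV plane has half the resolution in both axes, so only the translation
    // terms of the affine matrix are halved; the linear terms are resolution
    // independent and stay as they are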
    const float tm_uv[6] = {
        tm[0],
        tm[1],
        tm[2] / 2.0f,
        tm[3],
        tm[4],
        tm[5] / 2.0f,
    };

    const unsigned char* srcUV = src + srcw * srch;
    unsigned char* dstUV = dst + w * h;
    warpaffine_bilinear_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, tm_uv, type, v_uv);
}
#endif // NCNN_PIXEL_AFFINE

} // namespace ncnn
