// ncnn / padding_pack4to8.comp (658 lines, 22.0 KB)
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Optional half-precision support, switched on by the build through the
// NCNN_fp16_storage / NCNN_fp16_arithmetic preprocessor flags.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
// Storage layout of one output element: eight half floats kept as two f16vec4 halves.
// NOTE(review): when NCNN_fp16_storage is off, sfpvec8 is not declared in this file;
// presumably the build supplies it - confirm.
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Border rule used by main(): 0 = constant fill, 1 = replicate edge, 2 = reflect.
layout (constant_id = 0) const int type = 1;
// Fill value for type 0 when no per-channel value applies.
layout (constant_id = 1) const float value = 0;
// When 1, the dims == 3 / type 0 path reads its fill value from per_channel_pad_blob.
layout (constant_id = 2) const int per_channel_pad = 0;

// Input and output shapes as specialization constants; they are read through psc().
// NOTE(review): psc() is defined outside this file; presumably it falls back to the
// push-constant copy when the specialization constant is 0 - confirm.
#define shape_constant_id_offset 3
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

// Resource bindings, in both back ends:
//   0 - input blob (four-component elements)
//   1 - output blob (eight-component elements, write only)
//   2 - per-channel fill values, read by main() only when per_channel_pad == 1
// unfp, imfmtc4, sfpvec4 and (without fp16 storage) sfpvec8 are not declared in this file.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D per_channel_pad_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec4 per_channel_pad_blob_data[]; };
#endif

// Push constants: run-time copies of the input/output shapes plus the pad offsets.
layout (push_constant) uniform parameter
{
    // input shape
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // output shape
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    // Padding inserted before the data along x, y and z. main() divides the offset of
    // the packed axis by 4 (left for dims 1, top for dims 2, front for dims 3).
    int left;
    int top;
    int front;
} p;

// Pads a blob of four-component (pack4) elements and writes it as eight-component
// (pack8) elements. Each invocation produces one output element (gx, gy, gz) from two
// consecutive input elements along the packed axis: x for dims 1, y for dims 2,
// z otherwise. The pad offset on the packed axis is p.left / 4, p.top / 4 or
// p.front / 4 input elements (integer division).
// NOTE(review): presumably the host only selects this shader when that offset is a
// multiple of 4 - confirm.
//
// type selects the border rule:
//   0 - constant: an out-of-range half is filled with `value`, or, for dims 3 with
//       per_channel_pad == 1, with the matching per_channel_pad_blob element
//   1 - replicate: unpacked coordinates are clamped; on the packed axis the first /
//       last scalar of the edge element is broadcast
//   2 - reflect (edge not repeated): unpacked coordinates are mirrored; on the packed
//       axis the four scalars are gathered in reverse order from two neighbouring
//       input elements
// Any other type value stores nothing.
//
// Reflect gather on the packed axis, for an input element index i and extent n:
//   i <  0 : scalars -4i, -4i-1, -4i-2, -4i-3
//            = lane 0 of element -i, then lanes 3,2,1 of element -i-1
//   i >= n : scalars mirrored around the last scalar
//            = lanes 2,1,0 of element 2n-i-1, then lane 3 of element 2n-i-2
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    afpvec8 v;

    if (psc(dims) == 1)
    {
        // input element indices feeding the low and high half of the output element
        ivec2 x2 = gx * 2 - p.left / 4 + ivec2(0, 1);

        if (type == 0)
        {
#if NCNN_image_shader
            if (x2.x >= 0 && x2.x < psc(w))
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x2.x, 0, 0));
            }
            else
            {
                v[0] = afpvec4(value);
            }
            if (x2.y >= 0 && x2.y < psc(w))
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x2.y, 0, 0));
            }
            else
            {
                v[1] = afpvec4(value);
            }

            image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
            if (x2.x >= 0 && x2.x < psc(w))
            {
                v[0] = buffer_ld4(bottom_blob_data, x2.x);
            }
            else
            {
                v[0] = afpvec4(value);
            }
            if (x2.y >= 0 && x2.y < psc(w))
            {
                v[1] = buffer_ld4(bottom_blob_data, x2.y);
            }
            else
            {
                v[1] = afpvec4(value);
            }

            buffer_st8(top_blob_data, gx, v);
#endif
        }
        if (type == 1)
        {
#if NCNN_image_shader
            if (x2.x < 0)
            {
                v[0] = afpvec4(image3d_ld4(bottom_blob, ivec3(0, 0, 0)).r);
            }
            else if (x2.x >= psc(w))
            {
                v[0] = afpvec4(image3d_ld4(bottom_blob, ivec3(psc(w) - 1, 0, 0)).a);
            }
            else
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x2.x, 0, 0));
            }
            if (x2.y < 0)
            {
                v[1] = afpvec4(image3d_ld4(bottom_blob, ivec3(0, 0, 0)).r);
            }
            else if (x2.y >= psc(w))
            {
                v[1] = afpvec4(image3d_ld4(bottom_blob, ivec3(psc(w) - 1, 0, 0)).a);
            }
            else
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x2.y, 0, 0));
            }

            image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
            if (x2.x < 0)
            {
                v[0] = afpvec4(buffer_ld4(bottom_blob_data, 0).r);
            }
            else if (x2.x >= psc(w))
            {
                v[0] = afpvec4(buffer_ld4(bottom_blob_data, psc(w) - 1).a);
            }
            else
            {
                v[0] = buffer_ld4(bottom_blob_data, x2.x);
            }
            if (x2.y < 0)
            {
                v[1] = afpvec4(buffer_ld4(bottom_blob_data, 0).r);
            }
            else if (x2.y >= psc(w))
            {
                v[1] = afpvec4(buffer_ld4(bottom_blob_data, psc(w) - 1).a);
            }
            else
            {
                v[1] = buffer_ld4(bottom_blob_data, x2.y);
            }

            buffer_st8(top_blob_data, gx, v);
#endif
        }
        if (type == 2)
        {
#if NCNN_image_shader
            if (x2.x < 0)
            {
                // v0 is the element just below v1 (see the reflect rule above main)
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(-x2.x - 1, 0, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(-x2.x, 0, 0));
                v[0] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (x2.x >= psc(w))
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(psc(w) - x2.x + psc(w) - 2, 0, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(psc(w) - x2.x + psc(w) - 1, 0, 0));
                v[0] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x2.x, 0, 0));
            }
            if (x2.y < 0)
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(-x2.y - 1, 0, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(-x2.y, 0, 0));
                v[1] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (x2.y >= psc(w))
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(psc(w) - x2.y + psc(w) - 2, 0, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(psc(w) - x2.y + psc(w) - 1, 0, 0));
                v[1] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x2.y, 0, 0));
            }

            image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
            if (x2.x < 0)
            {
                // v0 is the element just below v1 (see the reflect rule above main)
                afpvec4 v0 = buffer_ld4(bottom_blob_data, -x2.x - 1);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, -x2.x);
                v[0] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (x2.x >= psc(w))
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, psc(w) - x2.x + psc(w) - 2);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, psc(w) - x2.x + psc(w) - 1);
                v[0] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[0] = buffer_ld4(bottom_blob_data, x2.x);
            }
            if (x2.y < 0)
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, -x2.y - 1);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, -x2.y);
                v[1] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (x2.y >= psc(w))
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, psc(w) - x2.y + psc(w) - 2);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, psc(w) - x2.y + psc(w) - 1);
                v[1] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[1] = buffer_ld4(bottom_blob_data, x2.y);
            }

            buffer_st8(top_blob_data, gx, v);
#endif
        }
    }
    else if (psc(dims) == 2)
    {
        int x = gx - p.left;
        // input rows feeding the low and high half of the output element
        ivec2 y2 = gy * 2 - p.top / 4 + ivec2(0, 1);

        if (type == 0)
        {
#if NCNN_image_shader
            if (x >= 0 && x < psc(w) && y2.x >= 0 && y2.x < psc(h))
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x, y2.x, 0));
            }
            else
            {
                v[0] = afpvec4(value);
            }
            if (x >= 0 && x < psc(w) && y2.y >= 0 && y2.y < psc(h))
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x, y2.y, 0));
            }
            else
            {
                v[1] = afpvec4(value);
            }

            image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
            ivec2 v_offset = y2 * psc(w) + x;

            if (x >= 0 && x < psc(w) && y2.x >= 0 && y2.x < psc(h))
            {
                v[0] = buffer_ld4(bottom_blob_data, v_offset.x);
            }
            else
            {
                v[0] = afpvec4(value);
            }
            if (x >= 0 && x < psc(w) && y2.y >= 0 && y2.y < psc(h))
            {
                v[1] = buffer_ld4(bottom_blob_data, v_offset.y);
            }
            else
            {
                v[1] = afpvec4(value);
            }

            const int gi = gy * psc(outw) + gx;

            buffer_st8(top_blob_data, gi, v);
#endif
        }
        if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);

#if NCNN_image_shader
            if (y2.x < 0)
            {
                v[0] = afpvec4(image3d_ld4(bottom_blob, ivec3(x, 0, 0)).r);
            }
            else if (y2.x >= psc(h))
            {
                v[0] = afpvec4(image3d_ld4(bottom_blob, ivec3(x, psc(h) - 1, 0)).a);
            }
            else
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x, y2.x, 0));
            }
            if (y2.y < 0)
            {
                v[1] = afpvec4(image3d_ld4(bottom_blob, ivec3(x, 0, 0)).r);
            }
            else if (y2.y >= psc(h))
            {
                v[1] = afpvec4(image3d_ld4(bottom_blob, ivec3(x, psc(h) - 1, 0)).a);
            }
            else
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x, y2.y, 0));
            }

            image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
            if (y2.x < 0)
            {
                v[0] = afpvec4(buffer_ld4(bottom_blob_data, x).r);
            }
            else if (y2.x >= psc(h))
            {
                v[0] = afpvec4(buffer_ld4(bottom_blob_data, (psc(h) - 1) * psc(w) + x).a);
            }
            else
            {
                v[0] = buffer_ld4(bottom_blob_data, y2.x * psc(w) + x);
            }
            if (y2.y < 0)
            {
                v[1] = afpvec4(buffer_ld4(bottom_blob_data, x).r);
            }
            else if (y2.y >= psc(h))
            {
                v[1] = afpvec4(buffer_ld4(bottom_blob_data, (psc(h) - 1) * psc(w) + x).a);
            }
            else
            {
                v[1] = buffer_ld4(bottom_blob_data, y2.y * psc(w) + x);
            }

            const int gi = gy * psc(outw) + gx;

            buffer_st8(top_blob_data, gi, v);
#endif
        }
        if (type == 2)
        {
            x = abs(x);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            x = (p.w - 1) - abs(x - (p.w - 1));
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));

#if NCNN_image_shader
            if (y2.x < 0)
            {
                // v0 is the row just below v1 (see the reflect rule above main)
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, -y2.x - 1, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, -y2.x, 0));
                v[0] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (y2.x >= psc(h))
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, psc(h) - y2.x + psc(h) - 2, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, psc(h) - y2.x + psc(h) - 1, 0));
                v[0] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x, y2.x, 0));
            }
            if (y2.y < 0)
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, -y2.y - 1, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, -y2.y, 0));
                v[1] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (y2.y >= psc(h))
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, psc(h) - y2.y + psc(h) - 2, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, psc(h) - y2.y + psc(h) - 1, 0));
                v[1] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x, y2.y, 0));
            }

            image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
            if (y2.x < 0)
            {
                // v0 is the row just below v1 (see the reflect rule above main)
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (-y2.x - 1) * psc(w) + x);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (-y2.x) * psc(w) + x);
                v[0] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (y2.x >= psc(h))
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (psc(h) - y2.x + psc(h) - 2) * psc(w) + x);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (psc(h) - y2.x + psc(h) - 1) * psc(w) + x);
                v[0] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[0] = buffer_ld4(bottom_blob_data, y2.x * psc(w) + x);
            }
            if (y2.y < 0)
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (-y2.y - 1) * psc(w) + x);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (-y2.y) * psc(w) + x);
                v[1] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (y2.y >= psc(h))
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (psc(h) - y2.y + psc(h) - 2) * psc(w) + x);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (psc(h) - y2.y + psc(h) - 1) * psc(w) + x);
                v[1] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[1] = buffer_ld4(bottom_blob_data, y2.y * psc(w) + x);
            }

            const int gi = gy * psc(outw) + gx;

            buffer_st8(top_blob_data, gi, v);
#endif
        }
    }
    else // if (psc(dims) == 3)
    {
        int x = gx - p.left;
        int y = gy - p.top;
        // input channel elements feeding the low and high half of the output element
        ivec2 z2 = gz * 2 - p.front / 4 + ivec2(0, 1);

        if (type == 0)
        {
#if NCNN_image_shader
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z2.x >= 0 && z2.x < psc(c))
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x, y, z2.x));
            }
            else
            {
                v[0] = per_channel_pad == 1 ? image3d_ld4(per_channel_pad_blob, ivec3(gz * 2, 0, 0)) : afpvec4(value);
            }
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z2.y >= 0 && z2.y < psc(c))
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x, y, z2.y));
            }
            else
            {
                v[1] = per_channel_pad == 1 ? image3d_ld4(per_channel_pad_blob, ivec3(gz * 2 + 1, 0, 0)) : afpvec4(value);
            }

            image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
            ivec2 v_offset = z2 * psc(cstep) + y * psc(w) + x;

            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z2.x >= 0 && z2.x < psc(c))
            {
                v[0] = buffer_ld4(bottom_blob_data, v_offset.x);
            }
            else
            {
                v[0] = per_channel_pad == 1 ? buffer_ld4(per_channel_pad_blob_data, gz * 2) : afpvec4(value);
            }
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z2.y >= 0 && z2.y < psc(c))
            {
                v[1] = buffer_ld4(bottom_blob_data, v_offset.y);
            }
            else
            {
                v[1] = per_channel_pad == 1 ? buffer_ld4(per_channel_pad_blob_data, gz * 2 + 1) : afpvec4(value);
            }

            const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

            buffer_st8(top_blob_data, gi, v);
#endif
        }
        if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);
            y = clamp(y, 0, psc(h) - 1);

#if NCNN_image_shader
            if (z2.x < 0)
            {
                v[0] = afpvec4(image3d_ld4(bottom_blob, ivec3(x, y, 0)).r);
            }
            else if (z2.x >= psc(c))
            {
                v[0] = afpvec4(image3d_ld4(bottom_blob, ivec3(x, y, psc(c) - 1)).a);
            }
            else
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x, y, z2.x));
            }
            if (z2.y < 0)
            {
                v[1] = afpvec4(image3d_ld4(bottom_blob, ivec3(x, y, 0)).r);
            }
            else if (z2.y >= psc(c))
            {
                v[1] = afpvec4(image3d_ld4(bottom_blob, ivec3(x, y, psc(c) - 1)).a);
            }
            else
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x, y, z2.y));
            }

            image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
            if (z2.x < 0)
            {
                v[0] = afpvec4(buffer_ld4(bottom_blob_data, y * psc(w) + x).r);
            }
            else if (z2.x >= psc(c))
            {
                v[0] = afpvec4(buffer_ld4(bottom_blob_data, (psc(c) - 1) * psc(cstep) + y * psc(w) + x).a);
            }
            else
            {
                v[0] = buffer_ld4(bottom_blob_data, z2.x * psc(cstep) + y * psc(w) + x);
            }
            if (z2.y < 0)
            {
                v[1] = afpvec4(buffer_ld4(bottom_blob_data, y * psc(w) + x).r);
            }
            else if (z2.y >= psc(c))
            {
                v[1] = afpvec4(buffer_ld4(bottom_blob_data, (psc(c) - 1) * psc(cstep) + y * psc(w) + x).a);
            }
            else
            {
                v[1] = buffer_ld4(bottom_blob_data, z2.y * psc(cstep) + y * psc(w) + x);
            }

            const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

            buffer_st8(top_blob_data, gi, v);
#endif
        }
        if (type == 2)
        {
            x = abs(x);
            y = abs(y);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            x = (p.w - 1) - abs(x - (p.w - 1));
            y = (p.h - 1) - abs(y - (p.h - 1));
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
//             y = (psc(h) - 1) - abs(y - (psc(h) - 1));

#if NCNN_image_shader
            if (z2.x < 0)
            {
                // v0 is the channel element just below v1 (see the reflect rule above main)
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, -z2.x - 1));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, -z2.x));
                v[0] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (z2.x >= psc(c))
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, psc(c) - z2.x + psc(c) - 2));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, psc(c) - z2.x + psc(c) - 1));
                v[0] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[0] = image3d_ld4(bottom_blob, ivec3(x, y, z2.x));
            }
            if (z2.y < 0)
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, -z2.y - 1));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, -z2.y));
                v[1] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (z2.y >= psc(c))
            {
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, psc(c) - z2.y + psc(c) - 2));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, psc(c) - z2.y + psc(c) - 1));
                v[1] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[1] = image3d_ld4(bottom_blob, ivec3(x, y, z2.y));
            }

            image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
            if (z2.x < 0)
            {
                // v0 is the channel element just below v1 (see the reflect rule above main)
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (-z2.x - 1) * psc(cstep) + y * psc(w) + x);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (-z2.x) * psc(cstep) + y * psc(w) + x);
                v[0] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (z2.x >= psc(c))
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (psc(c) - z2.x + psc(c) - 2) * psc(cstep) + y * psc(w) + x);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (psc(c) - z2.x + psc(c) - 1) * psc(cstep) + y * psc(w) + x);
                v[0] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[0] = buffer_ld4(bottom_blob_data, z2.x * psc(cstep) + y * psc(w) + x);
            }
            if (z2.y < 0)
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (-z2.y - 1) * psc(cstep) + y * psc(w) + x);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (-z2.y) * psc(cstep) + y * psc(w) + x);
                v[1] = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (z2.y >= psc(c))
            {
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (psc(c) - z2.y + psc(c) - 2) * psc(cstep) + y * psc(w) + x);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (psc(c) - z2.y + psc(c) - 1) * psc(cstep) + y * psc(w) + x);
                v[1] = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v[1] = buffer_ld4(bottom_blob_data, z2.y * psc(cstep) + y * psc(w) + x);
            }

            const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

            buffer_st8(top_blob_data, gi, v);
#endif
        }
    }
}

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.