crop_pack4to8.comp
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450
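
// crop_pack4to8: Vulkan compute shader for ncnn's Crop layer. It copies a cropped
// region out of a blob stored with pack4 element packing and writes it with pack8
// packing, so each output element is assembled from two consecutive pack4 groups
// (eight consecutive fine-grained positions along the packing axis).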

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

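// bugihfa: device-specific workaround flag, baked in as a specialization constant
// (presumably short for a driver bug with indexed half-float access). When it is 1,
// the fp16 arithmetic paths below select vector components with explicit compares
// instead of dynamic indexing.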
layout (constant_id = 0) const int bugihfa = 0;

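// Input and output shapes as specialization constants, in packed-element units
// (c/cstep count pack4 groups, outc/outcstep count pack8 groups). A value of 0 means
// the dimension is not known at pipeline creation; the psc() macro from the ncnn
// shader preamble then falls back to the matching push-constant field.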
#define shape_constant_id_offset 1
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;

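// I/O bindings: with NCNN_image_shader the blobs are sampled/stored as 3-D images.
// Otherwise the pack4 input is a raw buffer read either as half2 pairs (NCNN_fp16_packed)
// or as individual scalars, while the output buffer is always written as whole
// pack8 (sfpvec8) elements.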
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
#else
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif

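// Push constants mirror the shape fields for the dynamic-shape case and carry the crop
// offsets. The channel offset (coffset) is applied in fine-grained channel units before
// the indices are regrouped into pack4/pack8 below.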
layout (push_constant) uniform parameter
{
    int dims;
    int w;
    int h;
    int d;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outd;
    int outc;
    int outcstep;

    int woffset;
    int hoffset;
    int doffset;
    int coffset;
} p;

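// One invocation per pack8 output element: gx runs over outw, gy over the flattened
// outh * outd plane, and gz over outc (counted in pack8 channel groups).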
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
        return;

    if (psc(dims) == 1)
    {
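        // dims == 1: crop along w. x4/xx4 are the eight consecutive source positions
        // (crop offset already applied) that make up output element gx; position i lives
        // in pack4 element i / 4, component i % 4.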
        ivec4 x4 = gx * 8 + p.woffset + ivec4(0, 1, 2, 3);
        ivec4 xx4 = x4 + 4;

#if NCNN_image_shader
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x4.r / 4, 0, 0));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x4.g / 4, 0, 0));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x4.b / 4, 0, 0));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x4.a / 4, 0, 0));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(xx4.r / 4, 0, 0));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(xx4.g / 4, 0, 0));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(xx4.b / 4, 0, 0));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(xx4.a / 4, 0, 0));

        afpvec8 v;
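        // When fp16 arithmetic is enabled and bugihfa is set, each component is picked
        // with explicit compares instead of dynamically indexing the f16 vectors
        // (presumably miscompiled on some drivers); otherwise the generic path below
        // indexes v0..v7 directly with x4 % 4 / xx4 % 4.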
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            ivec4 x4m4 = x4 % 4;
            ivec4 xx4m4 = xx4 % 4;

            if (x4m4.r == 0) v[0].r = v0.r;
            if (x4m4.r == 1) v[0].r = v0.g;
            if (x4m4.r == 2) v[0].r = v0.b;
            if (x4m4.r == 3) v[0].r = v0.a;
            if (x4m4.g == 0) v[0].g = v1.r;
            if (x4m4.g == 1) v[0].g = v1.g;
            if (x4m4.g == 2) v[0].g = v1.b;
            if (x4m4.g == 3) v[0].g = v1.a;
            if (x4m4.b == 0) v[0].b = v2.r;
            if (x4m4.b == 1) v[0].b = v2.g;
            if (x4m4.b == 2) v[0].b = v2.b;
            if (x4m4.b == 3) v[0].b = v2.a;
            if (x4m4.a == 0) v[0].a = v3.r;
            if (x4m4.a == 1) v[0].a = v3.g;
            if (x4m4.a == 2) v[0].a = v3.b;
            if (x4m4.a == 3) v[0].a = v3.a;
            if (xx4m4.r == 0) v[1].r = v4.r;
            if (xx4m4.r == 1) v[1].r = v4.g;
            if (xx4m4.r == 2) v[1].r = v4.b;
            if (xx4m4.r == 3) v[1].r = v4.a;
            if (xx4m4.g == 0) v[1].g = v5.r;
            if (xx4m4.g == 1) v[1].g = v5.g;
            if (xx4m4.g == 2) v[1].g = v5.b;
            if (xx4m4.g == 3) v[1].g = v5.a;
            if (xx4m4.b == 0) v[1].b = v6.r;
            if (xx4m4.b == 1) v[1].b = v6.g;
            if (xx4m4.b == 2) v[1].b = v6.b;
            if (xx4m4.b == 3) v[1].b = v6.a;
            if (xx4m4.a == 0) v[1].a = v7.r;
            if (xx4m4.a == 1) v[1].a = v7.g;
            if (xx4m4.a == 2) v[1].a = v7.b;
            if (xx4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[x4.r % 4];
            v[0].g = v1[x4.g % 4];
            v[0].b = v2[x4.b % 4];
            v[0].a = v3[x4.a % 4];
            v[1].r = v4[xx4.r % 4];
            v[1].g = v5[xx4.g % 4];
            v[1].b = v6[xx4.b % 4];
            v[1].a = v7[xx4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
#if NCNN_fp16_packed
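        // fp16_packed storage: the input buffer is declared as sfpvec2, so each pack4
        // element spans two vec2 slots. Slot = (index / 4) * 2 + (index % 4) / 2, and
        // index % 2 selects the lane inside that vec2.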
        ivec4 v_offset = (x4 / 4) * 2 + (x4 % 4) / 2;
        ivec4 lane2 = x4 % 2;
        ivec4 vv_offset = (xx4 / 4) * 2 + (xx4 % 4) / 2;
        ivec4 lane4 = xx4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gx, v);
#else
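        // Plain storage: the input is addressed per scalar, and buffer_cp1to8 (a helper
        // from the ncnn shader preamble) copies the eight scalars at v_offset/vv_offset
        // into one pack8 output element.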
        ivec4 v_offset = (x4 / 4) * 4 + x4 % 4;
        ivec4 vv_offset = (xx4 / 4) * 4 + xx4 % 4;

        buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
    else if (psc(dims) == 2)
    {
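        // dims == 2: crop along w and h. The blob is packed along h, so y4/yy4 are the
        // eight source rows gathered into output row gy, each at pack4 row y / 4,
        // component y % 4.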
        int x = gx + p.woffset;
        ivec4 y4 = gy * 8 + p.hoffset + ivec4(0, 1, 2, 3);
        ivec4 yy4 = y4 + 4;

#if NCNN_image_shader
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y4.r / 4, 0));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y4.g / 4, 0));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y4.b / 4, 0));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y4.a / 4, 0));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, yy4.r / 4, 0));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, yy4.g / 4, 0));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, yy4.b / 4, 0));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, yy4.a / 4, 0));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            ivec4 y4m4 = y4 % 4;
            ivec4 yy4m4 = yy4 % 4;

            if (y4m4.r == 0) v[0].r = v0.r;
            if (y4m4.r == 1) v[0].r = v0.g;
            if (y4m4.r == 2) v[0].r = v0.b;
            if (y4m4.r == 3) v[0].r = v0.a;
            if (y4m4.g == 0) v[0].g = v1.r;
            if (y4m4.g == 1) v[0].g = v1.g;
            if (y4m4.g == 2) v[0].g = v1.b;
            if (y4m4.g == 3) v[0].g = v1.a;
            if (y4m4.b == 0) v[0].b = v2.r;
            if (y4m4.b == 1) v[0].b = v2.g;
            if (y4m4.b == 2) v[0].b = v2.b;
            if (y4m4.b == 3) v[0].b = v2.a;
            if (y4m4.a == 0) v[0].a = v3.r;
            if (y4m4.a == 1) v[0].a = v3.g;
            if (y4m4.a == 2) v[0].a = v3.b;
            if (y4m4.a == 3) v[0].a = v3.a;
            if (yy4m4.r == 0) v[1].r = v4.r;
            if (yy4m4.r == 1) v[1].r = v4.g;
            if (yy4m4.r == 2) v[1].r = v4.b;
            if (yy4m4.r == 3) v[1].r = v4.a;
            if (yy4m4.g == 0) v[1].g = v5.r;
            if (yy4m4.g == 1) v[1].g = v5.g;
            if (yy4m4.g == 2) v[1].g = v5.b;
            if (yy4m4.g == 3) v[1].g = v5.a;
            if (yy4m4.b == 0) v[1].b = v6.r;
            if (yy4m4.b == 1) v[1].b = v6.g;
            if (yy4m4.b == 2) v[1].b = v6.b;
            if (yy4m4.b == 3) v[1].b = v6.a;
            if (yy4m4.a == 0) v[1].a = v7.r;
            if (yy4m4.a == 1) v[1].a = v7.g;
            if (yy4m4.a == 2) v[1].a = v7.b;
            if (yy4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[y4.r % 4];
            v[0].g = v1[y4.g % 4];
            v[0].b = v2[y4.b % 4];
            v[0].a = v3[y4.a % 4];
            v[1].r = v4[yy4.r % 4];
            v[1].g = v5[yy4.g % 4];
            v[1].b = v6[yy4.b % 4];
            v[1].a = v7[yy4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
        int gi = gy * psc(outw) + gx;

#if NCNN_fp16_packed
        ivec4 v_offset = ((y4 / 4) * psc(w) + x) * 2 + (y4 % 4) / 2;
        ivec4 lane2 = y4 % 2;
        ivec4 vv_offset = ((yy4 / 4) * psc(w) + x) * 2 + (yy4 % 4) / 2;
        ivec4 lane4 = yy4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gi, v);
#else
        ivec4 v_offset = ((y4 / 4) * psc(w) + x) * 4 + y4 % 4;
        ivec4 vv_offset = ((yy4 / 4) * psc(w) + x) * 4 + yy4 % 4;

        buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
    else if (psc(dims) == 3)
    {
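        // dims == 3: crop along w, h and the channel dimension. The eight source channels
        // for output channel group gz come from the two consecutive pack4 groups z4 / 4
        // and zz4 / 4, read at offset y * w + x within each group.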
        int x = gx + p.woffset;
        int y = gy + p.hoffset;
        ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3);
        ivec4 zz4 = z4 + 4;

#if NCNN_image_shader
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            ivec4 z4m4 = z4 % 4;
            ivec4 zz4m4 = zz4 % 4;

            if (z4m4.r == 0) v[0].r = v0.r;
            if (z4m4.r == 1) v[0].r = v0.g;
            if (z4m4.r == 2) v[0].r = v0.b;
            if (z4m4.r == 3) v[0].r = v0.a;
            if (z4m4.g == 0) v[0].g = v1.r;
            if (z4m4.g == 1) v[0].g = v1.g;
            if (z4m4.g == 2) v[0].g = v1.b;
            if (z4m4.g == 3) v[0].g = v1.a;
            if (z4m4.b == 0) v[0].b = v2.r;
            if (z4m4.b == 1) v[0].b = v2.g;
            if (z4m4.b == 2) v[0].b = v2.b;
            if (z4m4.b == 3) v[0].b = v2.a;
            if (z4m4.a == 0) v[0].a = v3.r;
            if (z4m4.a == 1) v[0].a = v3.g;
            if (z4m4.a == 2) v[0].a = v3.b;
            if (z4m4.a == 3) v[0].a = v3.a;
            if (zz4m4.r == 0) v[1].r = v4.r;
            if (zz4m4.r == 1) v[1].r = v4.g;
            if (zz4m4.r == 2) v[1].r = v4.b;
            if (zz4m4.r == 3) v[1].r = v4.a;
            if (zz4m4.g == 0) v[1].g = v5.r;
            if (zz4m4.g == 1) v[1].g = v5.g;
            if (zz4m4.g == 2) v[1].g = v5.b;
            if (zz4m4.g == 3) v[1].g = v5.a;
            if (zz4m4.b == 0) v[1].b = v6.r;
            if (zz4m4.b == 1) v[1].b = v6.g;
            if (zz4m4.b == 2) v[1].b = v6.b;
            if (zz4m4.b == 3) v[1].b = v6.a;
            if (zz4m4.a == 0) v[1].a = v7.r;
            if (zz4m4.a == 1) v[1].a = v7.g;
            if (zz4m4.a == 2) v[1].a = v7.b;
            if (zz4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[z4.r % 4];
            v[0].g = v1[z4.g % 4];
            v[0].b = v2[z4.b % 4];
            v[0].a = v3[z4.a % 4];
            v[1].r = v4[zz4.r % 4];
            v[1].g = v5[zz4.g % 4];
            v[1].b = v6[zz4.b % 4];
            v[1].a = v7[zz4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
        int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

#if NCNN_fp16_packed
        ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2;
        ivec4 lane2 = z4 % 2;
        ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (zz4 % 4) / 2;
        ivec4 lane4 = zz4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gi, v);
#else
        ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + z4 % 4;
        ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + zz4 % 4;

        buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
    else // if (psc(dims) == 4)
    {
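        // dims == 4: same channel gather as the 3-D case, but d and h are flattened into
        // one row index: gy splits into (yd, yh) and y = (yd + doffset) * h + (yh + hoffset).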
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        int x = gx + p.woffset;
        int y = (yd + p.doffset) * psc(h) + (yh + p.hoffset);
        ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3);
        ivec4 zz4 = z4 + 4;

#if NCNN_image_shader
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            ivec4 z4m4 = z4 % 4;
            ivec4 zz4m4 = zz4 % 4;

            if (z4m4.r == 0) v[0].r = v0.r;
            if (z4m4.r == 1) v[0].r = v0.g;
            if (z4m4.r == 2) v[0].r = v0.b;
            if (z4m4.r == 3) v[0].r = v0.a;
            if (z4m4.g == 0) v[0].g = v1.r;
            if (z4m4.g == 1) v[0].g = v1.g;
            if (z4m4.g == 2) v[0].g = v1.b;
            if (z4m4.g == 3) v[0].g = v1.a;
            if (z4m4.b == 0) v[0].b = v2.r;
            if (z4m4.b == 1) v[0].b = v2.g;
            if (z4m4.b == 2) v[0].b = v2.b;
            if (z4m4.b == 3) v[0].b = v2.a;
            if (z4m4.a == 0) v[0].a = v3.r;
            if (z4m4.a == 1) v[0].a = v3.g;
            if (z4m4.a == 2) v[0].a = v3.b;
            if (z4m4.a == 3) v[0].a = v3.a;
            if (zz4m4.r == 0) v[1].r = v4.r;
            if (zz4m4.r == 1) v[1].r = v4.g;
            if (zz4m4.r == 2) v[1].r = v4.b;
            if (zz4m4.r == 3) v[1].r = v4.a;
            if (zz4m4.g == 0) v[1].g = v5.r;
            if (zz4m4.g == 1) v[1].g = v5.g;
            if (zz4m4.g == 2) v[1].g = v5.b;
            if (zz4m4.g == 3) v[1].g = v5.a;
            if (zz4m4.b == 0) v[1].b = v6.r;
            if (zz4m4.b == 1) v[1].b = v6.g;
            if (zz4m4.b == 2) v[1].b = v6.b;
            if (zz4m4.b == 3) v[1].b = v6.a;
            if (zz4m4.a == 0) v[1].a = v7.r;
            if (zz4m4.a == 1) v[1].a = v7.g;
            if (zz4m4.a == 2) v[1].a = v7.b;
            if (zz4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[z4.r % 4];
            v[0].g = v1[z4.g % 4];
            v[0].b = v2[z4.b % 4];
            v[0].a = v3[z4.a % 4];
            v[1].r = v4[zz4.r % 4];
            v[1].g = v5[zz4.g % 4];
            v[1].b = v6[zz4.b % 4];
            v[1].a = v7[zz4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
        int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

#if NCNN_fp16_packed
        ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2;
        ivec4 lane2 = z4 % 2;
        ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (zz4 % 4) / 2;
        ivec4 lane4 = zz4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gi, v);
#else
        ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + z4 % 4;
        ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + zz4 % 4;

        buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
}