// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
18
#extension GL_EXT_shader_16bit_storage: require
19
// 8-wide 16-bit storage vector emulated as two f16vec4 halves, since GLSL has
// no native 8-component vector type. Member names abcd/efgh are relied on by
// the sfp2afpvec8/afp2sfpvec8 helper macros elsewhere in ncnn — do not rename.
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
21
#if NCNN_fp16_arithmetic
22
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
25
// NOTE(review): name suggests a per-device driver-bug workaround toggle
// ("bug: indexing half float array"?) selected at pipeline creation —
// confirm against ncnn's gpu/device setup code before relying on its meaning.
layout (constant_id = 0) const int bugihfa = 0;
27
// Specialization constants describing the input blob shape (dims/w/h/d/c/cstep)
// and the output blob shape (outdims/outw/outh/outd/outc/outcstep). A value of
// 0 means "not fixed at pipeline creation"; presumably the psc() macro then
// falls back to the matching push-constant field — TODO confirm against the
// shader preamble that defines psc(), which is not visible in this chunk.
// NOTE(review): the bare numeric lines interleaved below are extraction
// artifacts (the original file's line numbers) — they are not valid GLSL.
#define shape_constant_id_offset 1
28
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
29
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
30
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
31
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
32
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
33
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;
35
layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
36
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
37
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
38
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
39
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
40
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
43
// Descriptor bindings for the source and destination blobs.
// NOTE(review): three declarations below all claim binding = 0, which cannot
// coexist. In the upstream ncnn shader these alternatives are separated by
// preprocessor guards (#if NCNN_image_shader ... #else, and
// #if NCNN_fp16_packed ... #else) that appear to have been lost in extraction
// — the original line-number jumps (44 -> 47 -> 49 -> 51) mark the missing
// guard lines. Restore the guards before compiling.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
44
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
47
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
49
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
51
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
54
layout (push_constant) uniform parameter
78
int gx = int(gl_GlobalInvocationID.x);
79
int gy = int(gl_GlobalInvocationID.y);
80
int gz = int(gl_GlobalInvocationID.z);
82
if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
87
ivec4 x4 = gx * 8 + p.woffset + ivec4(0, 1, 2, 3);
91
afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x4.r / 4, 0, 0));
92
afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x4.g / 4, 0, 0));
93
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x4.b / 4, 0, 0));
94
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x4.a / 4, 0, 0));
95
afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(xx4.r / 4, 0, 0));
96
afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(xx4.g / 4, 0, 0));
97
afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(xx4.b / 4, 0, 0));
98
afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(xx4.a / 4, 0, 0));
101
#if NCNN_fp16_arithmetic
105
ivec4 xx4m4 = xx4 % 4;
107
if (x4m4.r == 0) v[0].r = v0.r;
108
if (x4m4.r == 1) v[0].r = v0.g;
109
if (x4m4.r == 2) v[0].r = v0.b;
110
if (x4m4.r == 3) v[0].r = v0.a;
111
if (x4m4.g == 0) v[0].g = v1.r;
112
if (x4m4.g == 1) v[0].g = v1.g;
113
if (x4m4.g == 2) v[0].g = v1.b;
114
if (x4m4.g == 3) v[0].g = v1.a;
115
if (x4m4.b == 0) v[0].b = v2.r;
116
if (x4m4.b == 1) v[0].b = v2.g;
117
if (x4m4.b == 2) v[0].b = v2.b;
118
if (x4m4.b == 3) v[0].b = v2.a;
119
if (x4m4.a == 0) v[0].a = v3.r;
120
if (x4m4.a == 1) v[0].a = v3.g;
121
if (x4m4.a == 2) v[0].a = v3.b;
122
if (x4m4.a == 3) v[0].a = v3.a;
123
if (xx4m4.r == 0) v[1].r = v4.r;
124
if (xx4m4.r == 1) v[1].r = v4.g;
125
if (xx4m4.r == 2) v[1].r = v4.b;
126
if (xx4m4.r == 3) v[1].r = v4.a;
127
if (xx4m4.g == 0) v[1].g = v5.r;
128
if (xx4m4.g == 1) v[1].g = v5.g;
129
if (xx4m4.g == 2) v[1].g = v5.b;
130
if (xx4m4.g == 3) v[1].g = v5.a;
131
if (xx4m4.b == 0) v[1].b = v6.r;
132
if (xx4m4.b == 1) v[1].b = v6.g;
133
if (xx4m4.b == 2) v[1].b = v6.b;
134
if (xx4m4.b == 3) v[1].b = v6.a;
135
if (xx4m4.a == 0) v[1].a = v7.r;
136
if (xx4m4.a == 1) v[1].a = v7.g;
137
if (xx4m4.a == 2) v[1].a = v7.b;
138
if (xx4m4.a == 3) v[1].a = v7.a;
143
v[0].r = v0[x4.r % 4];
144
v[0].g = v1[x4.g % 4];
145
v[0].b = v2[x4.b % 4];
146
v[0].a = v3[x4.a % 4];
147
v[1].r = v4[xx4.r % 4];
148
v[1].g = v5[xx4.g % 4];
149
v[1].b = v6[xx4.b % 4];
150
v[1].a = v7[xx4.a % 4];
153
image3d_st8(top_blob, ivec3(gx, 0, 0), v);
156
ivec4 v_offset = (x4 / 4) * 2 + (x4 % 4) / 2;
157
ivec4 lane2 = x4 % 2;
158
ivec4 vv_offset = (xx4 / 4) * 2 + (xx4 % 4) / 2;
159
ivec4 lane4 = xx4 % 2;
161
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
162
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
163
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
164
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
166
afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
167
afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
168
afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
169
afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);
171
afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);
173
buffer_st8(top_blob_data, gx, v);
175
ivec4 v_offset = (x4 / 4) * 4 + x4 % 4;
176
ivec4 vv_offset = (xx4 / 4) * 4 + xx4 % 4;
178
buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset);
182
else if (psc(dims) == 2)
184
int x = gx + p.woffset;
185
ivec4 y4 = gy * 8 + p.hoffset + ivec4(0, 1, 2, 3);
189
afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y4.r / 4, 0));
190
afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y4.g / 4, 0));
191
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y4.b / 4, 0));
192
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y4.a / 4, 0));
193
afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, yy4.r / 4, 0));
194
afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, yy4.g / 4, 0));
195
afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, yy4.b / 4, 0));
196
afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, yy4.a / 4, 0));
199
#if NCNN_fp16_arithmetic
203
ivec4 yy4m4 = yy4 % 4;
205
if (y4m4.r == 0) v[0].r = v0.r;
206
if (y4m4.r == 1) v[0].r = v0.g;
207
if (y4m4.r == 2) v[0].r = v0.b;
208
if (y4m4.r == 3) v[0].r = v0.a;
209
if (y4m4.g == 0) v[0].g = v1.r;
210
if (y4m4.g == 1) v[0].g = v1.g;
211
if (y4m4.g == 2) v[0].g = v1.b;
212
if (y4m4.g == 3) v[0].g = v1.a;
213
if (y4m4.b == 0) v[0].b = v2.r;
214
if (y4m4.b == 1) v[0].b = v2.g;
215
if (y4m4.b == 2) v[0].b = v2.b;
216
if (y4m4.b == 3) v[0].b = v2.a;
217
if (y4m4.a == 0) v[0].a = v3.r;
218
if (y4m4.a == 1) v[0].a = v3.g;
219
if (y4m4.a == 2) v[0].a = v3.b;
220
if (y4m4.a == 3) v[0].a = v3.a;
221
if (yy4m4.r == 0) v[1].r = v4.r;
222
if (yy4m4.r == 1) v[1].r = v4.g;
223
if (yy4m4.r == 2) v[1].r = v4.b;
224
if (yy4m4.r == 3) v[1].r = v4.a;
225
if (yy4m4.g == 0) v[1].g = v5.r;
226
if (yy4m4.g == 1) v[1].g = v5.g;
227
if (yy4m4.g == 2) v[1].g = v5.b;
228
if (yy4m4.g == 3) v[1].g = v5.a;
229
if (yy4m4.b == 0) v[1].b = v6.r;
230
if (yy4m4.b == 1) v[1].b = v6.g;
231
if (yy4m4.b == 2) v[1].b = v6.b;
232
if (yy4m4.b == 3) v[1].b = v6.a;
233
if (yy4m4.a == 0) v[1].a = v7.r;
234
if (yy4m4.a == 1) v[1].a = v7.g;
235
if (yy4m4.a == 2) v[1].a = v7.b;
236
if (yy4m4.a == 3) v[1].a = v7.a;
241
v[0].r = v0[y4.r % 4];
242
v[0].g = v1[y4.g % 4];
243
v[0].b = v2[y4.b % 4];
244
v[0].a = v3[y4.a % 4];
245
v[1].r = v4[yy4.r % 4];
246
v[1].g = v5[yy4.g % 4];
247
v[1].b = v6[yy4.b % 4];
248
v[1].a = v7[yy4.a % 4];
251
image3d_st8(top_blob, ivec3(gx, gy, 0), v);
253
int gi = gy * psc(outw) + gx;
256
ivec4 v_offset = ((y4 / 4) * psc(w) + x) * 2 + (y4 % 4) / 2;
257
ivec4 lane2 = y4 % 2;
258
ivec4 vv_offset = ((yy4 / 4) * psc(w) + x) * 2 + (yy4 % 4) / 2;
259
ivec4 lane4 = yy4 % 2;
261
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
262
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
263
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
264
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
266
afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
267
afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
268
afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
269
afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);
271
afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);
273
buffer_st8(top_blob_data, gi, v);
275
ivec4 v_offset = ((y4 / 4) * psc(w) + x) * 4 + y4 % 4;
276
ivec4 vv_offset = ((yy4 / 4) * psc(w) + x) * 4 + yy4 % 4;
278
buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
282
else if (psc(dims) == 3)
284
int x = gx + p.woffset;
285
int y = gy + p.hoffset;
286
ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3);
290
afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4));
291
afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4));
292
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4));
293
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4));
294
afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4));
295
afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4));
296
afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4));
297
afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4));
300
#if NCNN_fp16_arithmetic
304
ivec4 zz4m4 = zz4 % 4;
306
if (z4m4.r == 0) v[0].r = v0.r;
307
if (z4m4.r == 1) v[0].r = v0.g;
308
if (z4m4.r == 2) v[0].r = v0.b;
309
if (z4m4.r == 3) v[0].r = v0.a;
310
if (z4m4.g == 0) v[0].g = v1.r;
311
if (z4m4.g == 1) v[0].g = v1.g;
312
if (z4m4.g == 2) v[0].g = v1.b;
313
if (z4m4.g == 3) v[0].g = v1.a;
314
if (z4m4.b == 0) v[0].b = v2.r;
315
if (z4m4.b == 1) v[0].b = v2.g;
316
if (z4m4.b == 2) v[0].b = v2.b;
317
if (z4m4.b == 3) v[0].b = v2.a;
318
if (z4m4.a == 0) v[0].a = v3.r;
319
if (z4m4.a == 1) v[0].a = v3.g;
320
if (z4m4.a == 2) v[0].a = v3.b;
321
if (z4m4.a == 3) v[0].a = v3.a;
322
if (zz4m4.r == 0) v[1].r = v4.r;
323
if (zz4m4.r == 1) v[1].r = v4.g;
324
if (zz4m4.r == 2) v[1].r = v4.b;
325
if (zz4m4.r == 3) v[1].r = v4.a;
326
if (zz4m4.g == 0) v[1].g = v5.r;
327
if (zz4m4.g == 1) v[1].g = v5.g;
328
if (zz4m4.g == 2) v[1].g = v5.b;
329
if (zz4m4.g == 3) v[1].g = v5.a;
330
if (zz4m4.b == 0) v[1].b = v6.r;
331
if (zz4m4.b == 1) v[1].b = v6.g;
332
if (zz4m4.b == 2) v[1].b = v6.b;
333
if (zz4m4.b == 3) v[1].b = v6.a;
334
if (zz4m4.a == 0) v[1].a = v7.r;
335
if (zz4m4.a == 1) v[1].a = v7.g;
336
if (zz4m4.a == 2) v[1].a = v7.b;
337
if (zz4m4.a == 3) v[1].a = v7.a;
342
v[0].r = v0[z4.r % 4];
343
v[0].g = v1[z4.g % 4];
344
v[0].b = v2[z4.b % 4];
345
v[0].a = v3[z4.a % 4];
346
v[1].r = v4[zz4.r % 4];
347
v[1].g = v5[zz4.g % 4];
348
v[1].b = v6[zz4.b % 4];
349
v[1].a = v7[zz4.a % 4];
352
image3d_st8(top_blob, ivec3(gx, gy, gz), v);
354
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
357
ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2;
358
ivec4 lane2 = z4 % 2;
359
ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (zz4 % 4) / 2;
360
ivec4 lane4 = zz4 % 2;
362
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
363
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
364
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
365
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
367
afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
368
afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
369
afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
370
afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);
372
afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);
374
buffer_st8(top_blob_data, gi, v);
376
ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + z4 % 4;
377
ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + zz4 % 4;
379
buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
383
else // if (psc(dims) == 4)
385
int yd = gy / psc(outh);
386
int yh = gy % psc(outh);
388
int x = gx + p.woffset;
389
int y = (yd + p.doffset) * psc(h) + (yh + p.hoffset);
390
ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3);
394
afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4));
395
afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4));
396
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4));
397
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4));
398
afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4));
399
afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4));
400
afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4));
401
afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4));
404
#if NCNN_fp16_arithmetic
408
ivec4 zz4m4 = zz4 % 4;
410
if (z4m4.r == 0) v[0].r = v0.r;
411
if (z4m4.r == 1) v[0].r = v0.g;
412
if (z4m4.r == 2) v[0].r = v0.b;
413
if (z4m4.r == 3) v[0].r = v0.a;
414
if (z4m4.g == 0) v[0].g = v1.r;
415
if (z4m4.g == 1) v[0].g = v1.g;
416
if (z4m4.g == 2) v[0].g = v1.b;
417
if (z4m4.g == 3) v[0].g = v1.a;
418
if (z4m4.b == 0) v[0].b = v2.r;
419
if (z4m4.b == 1) v[0].b = v2.g;
420
if (z4m4.b == 2) v[0].b = v2.b;
421
if (z4m4.b == 3) v[0].b = v2.a;
422
if (z4m4.a == 0) v[0].a = v3.r;
423
if (z4m4.a == 1) v[0].a = v3.g;
424
if (z4m4.a == 2) v[0].a = v3.b;
425
if (z4m4.a == 3) v[0].a = v3.a;
426
if (zz4m4.r == 0) v[1].r = v4.r;
427
if (zz4m4.r == 1) v[1].r = v4.g;
428
if (zz4m4.r == 2) v[1].r = v4.b;
429
if (zz4m4.r == 3) v[1].r = v4.a;
430
if (zz4m4.g == 0) v[1].g = v5.r;
431
if (zz4m4.g == 1) v[1].g = v5.g;
432
if (zz4m4.g == 2) v[1].g = v5.b;
433
if (zz4m4.g == 3) v[1].g = v5.a;
434
if (zz4m4.b == 0) v[1].b = v6.r;
435
if (zz4m4.b == 1) v[1].b = v6.g;
436
if (zz4m4.b == 2) v[1].b = v6.b;
437
if (zz4m4.b == 3) v[1].b = v6.a;
438
if (zz4m4.a == 0) v[1].a = v7.r;
439
if (zz4m4.a == 1) v[1].a = v7.g;
440
if (zz4m4.a == 2) v[1].a = v7.b;
441
if (zz4m4.a == 3) v[1].a = v7.a;
446
v[0].r = v0[z4.r % 4];
447
v[0].g = v1[z4.g % 4];
448
v[0].b = v2[z4.b % 4];
449
v[0].a = v3[z4.a % 4];
450
v[1].r = v4[zz4.r % 4];
451
v[1].g = v5[zz4.g % 4];
452
v[1].b = v6[zz4.b % 4];
453
v[1].a = v7[zz4.a % 4];
456
image3d_st8(top_blob, ivec3(gx, gy, gz), v);
458
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
461
ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2;
462
ivec4 lane2 = z4 % 2;
463
ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (zz4 % 4) / 2;
464
ivec4 lane4 = zz4 % 2;
466
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
467
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
468
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
469
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
471
afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
472
afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
473
afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
474
afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);
476
afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);
478
buffer_st8(top_blob_data, gi, v);
480
ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + z4 % 4;
481
ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + zz4 % 4;
483
buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);