1
// Tencent is pleased to support the open source community by making ncnn available.
3
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
8
// https://opensource.org/licenses/BSD-3-Clause
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
18
// NOTE(review): this chunk is GLSL (ncnn-style compute shader), not C, and is
// interleaved with bare line-number residue from extraction (e.g. the "19"
// line below); residue is preserved byte-for-byte in this region.
#extension GL_EXT_shader_16bit_storage: require
19
// 8-wide half-precision storage vector emulated as two f16vec4 halves.
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
21
// NOTE(review): the matching #endif for this #if is not visible in this chunk — confirm.
#if NCNN_fp16_arithmetic
22
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
25
// Device-workaround toggle set from the host (used where dynamic component
// indexing on fp16 vectors is broken on some drivers).
layout (constant_id = 0) const int bugihfa = 0;
27
#define shape_constant_id_offset 1
28
// Input blob shape as specialization constants (0 typically means "take the
// value from push constants via psc()" in this codebase — confirm).
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
29
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
30
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
31
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
32
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
33
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;
35
// Output blob shape.
layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
36
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
37
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
38
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
39
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
40
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
43
// NOTE(review): the declarations below reuse binding = 0 and redeclare
// bottom_blob three times; in the original shader these are alternative
// storage paths selected by #if NCNN_image_shader / #if NCNN_fp16_packed
// preprocessor guards that appear to have been lost in extraction — confirm
// against the upstream file before editing.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
44
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
47
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
49
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
51
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
54
// Per-dispatch crop parameters (woffset/hoffset/doffset/coffset and shape
// fields are referenced later as p.*). NOTE(review): the struct body and
// closing brace of this push-constant block are not visible in this chunk.
layout (push_constant) uniform parameter
78
int gx = int(gl_GlobalInvocationID.x);
79
int gy = int(gl_GlobalInvocationID.y);
80
int gz = int(gl_GlobalInvocationID.z);
82
// Out-of-range guard for the dispatch grid; the `return;` body of this if is
// not visible in this chunk.
if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
87
// dims == 1: crop along x from a pack8 source into a pack4 destination.
// Each invocation gathers 4 scalars, each possibly from a different pack8
// source vector. NOTE(review): the enclosing `if (psc(dims) == 1)` braces,
// the `afpvec4 v;` / `ivec4 x4m4 = x4 % 4;` declarations and the
// #if NCNN_image_shader / bugihfa / #else structural lines are not visible
// in this chunk; stray extraction line-number residue has been removed.
ivec4 x4 = gx * 4 + p.woffset + ivec4(0, 1, 2, 3);
// Load the pack8 texel holding each of the four output lanes.
afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(x4.r / 8, 0, 0));
afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(x4.g / 8, 0, 0));
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(x4.b / 8, 0, 0));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(x4.a / 8, 0, 0));
#if NCNN_fp16_arithmetic
// Which f16vec4 half (0/1) of the pack8 vector each lane falls in.
ivec4 x4lane2 = (x4 % 8) / 4;
// bugihfa workaround: enumerate component selects instead of dynamic
// indexing. BUGFIX: lanes g/b/a must read from v1/v2/v3 (the texels loaded
// for x4.g/x4.b/x4.a), not v0 — matches the non-workaround path below.
if (x4m4.r == 0) v.r = v0[x4lane2.r].r;
if (x4m4.r == 1) v.r = v0[x4lane2.r].g;
if (x4m4.r == 2) v.r = v0[x4lane2.r].b;
if (x4m4.r == 3) v.r = v0[x4lane2.r].a;
if (x4m4.g == 0) v.g = v1[x4lane2.g].r;
if (x4m4.g == 1) v.g = v1[x4lane2.g].g;
if (x4m4.g == 2) v.g = v1[x4lane2.g].b;
if (x4m4.g == 3) v.g = v1[x4lane2.g].a;
if (x4m4.b == 0) v.b = v2[x4lane2.b].r;
if (x4m4.b == 1) v.b = v2[x4lane2.b].g;
if (x4m4.b == 2) v.b = v2[x4lane2.b].b;
if (x4m4.b == 3) v.b = v2[x4lane2.b].a;
if (x4m4.a == 0) v.a = v3[x4lane2.a].r;
if (x4m4.a == 1) v.a = v3[x4lane2.a].g;
if (x4m4.a == 2) v.a = v3[x4lane2.a].b;
if (x4m4.a == 3) v.a = v3[x4lane2.a].a;
// Plain path: dynamic component indexing into each lane's own source vector.
v.r = v0[(x4.r % 8) / 4][x4.r % 4];
v.g = v1[(x4.g % 8) / 4][x4.g % 4];
v.b = v2[(x4.b % 8) / 4][x4.b % 4];
v.a = v3[(x4.a % 8) / 4][x4.a % 4];
image3d_st4(top_blob, ivec3(gx, 0, 0), v);
// fp16-packed storage path: gather 4 scalars via 4 afpvec2 loads.
ivec4 v_offset = (x4 / 8) * 4 + (x4 % 8) / 2;
ivec4 lane2 = x4 % 2;
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);
buffer_st4(top_blob_data, gx, v);
// Scalar storage path: copy 4 scalars into one pack4 element.
ivec4 v_offset = (x4 / 8) * 8 + x4 % 8;
buffer_cp1to4(top_blob_data, gx, bottom_blob_data, v_offset);
149
// dims == 2: crop along y (fixed x offset per row); pack8 -> pack4.
// NOTE(review): enclosing braces, `afpvec4 v;` / `ivec4 y4m4 = y4 % 4;`
// declarations and the #if NCNN_image_shader / bugihfa / #else structural
// lines are not visible in this chunk; extraction residue removed.
else if (psc(dims) == 2)
int x = gx + p.woffset;
ivec4 y4 = gy * 4 + p.hoffset + ivec4(0, 1, 2, 3);
afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(x, y4.r / 8, 0));
afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(x, y4.g / 8, 0));
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(x, y4.b / 8, 0));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(x, y4.a / 8, 0));
#if NCNN_fp16_arithmetic
ivec4 y4lane2 = (y4 % 8) / 4;
// bugihfa workaround. BUGFIX: lanes g/b/a must read from v1/v2/v3 as in the
// non-workaround path below, not v0.
if (y4m4.r == 0) v.r = v0[y4lane2.r].r;
if (y4m4.r == 1) v.r = v0[y4lane2.r].g;
if (y4m4.r == 2) v.r = v0[y4lane2.r].b;
if (y4m4.r == 3) v.r = v0[y4lane2.r].a;
if (y4m4.g == 0) v.g = v1[y4lane2.g].r;
if (y4m4.g == 1) v.g = v1[y4lane2.g].g;
if (y4m4.g == 2) v.g = v1[y4lane2.g].b;
if (y4m4.g == 3) v.g = v1[y4lane2.g].a;
if (y4m4.b == 0) v.b = v2[y4lane2.b].r;
if (y4m4.b == 1) v.b = v2[y4lane2.b].g;
if (y4m4.b == 2) v.b = v2[y4lane2.b].b;
if (y4m4.b == 3) v.b = v2[y4lane2.b].a;
if (y4m4.a == 0) v.a = v3[y4lane2.a].r;
if (y4m4.a == 1) v.a = v3[y4lane2.a].g;
if (y4m4.a == 2) v.a = v3[y4lane2.a].b;
if (y4m4.a == 3) v.a = v3[y4lane2.a].a;
// Plain path: each lane indexes its own source vector.
v.r = v0[(y4.r % 8) / 4][y4.r % 4];
v.g = v1[(y4.g % 8) / 4][y4.g % 4];
v.b = v2[(y4.b % 8) / 4][y4.b % 4];
v.a = v3[(y4.a % 8) / 4][y4.a % 4];
image3d_st4(top_blob, ivec3(gx, gy, 0), v);
int gi = gy * psc(outw) + gx;
// fp16-packed storage path.
ivec4 v_offset = ((y4 / 8) * psc(w) + x) * 4 + (y4 % 8) / 2;
ivec4 lane2 = y4 % 2;
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);
buffer_st4(top_blob_data, gi, v);
// Scalar storage path.
ivec4 v_offset = ((y4 / 8) * psc(w) + x) * 8 + y4 % 8;
buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
216
// dims == 3: crop along the channel axis z; pack8 -> pack4.
// NOTE(review): enclosing braces, `afpvec4 v;` / `ivec4 z4m4 = z4 % 4;`
// declarations and the #if NCNN_image_shader / bugihfa / #else structural
// lines are not visible in this chunk; extraction residue removed.
else if (psc(dims) == 3)
int x = gx + p.woffset;
int y = gy + p.hoffset;
ivec4 z4 = gz * 4 + p.coffset + ivec4(0, 1, 2, 3);
afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(x, y, z4.r / 8));
afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(x, y, z4.g / 8));
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(x, y, z4.b / 8));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(x, y, z4.a / 8));
#if NCNN_fp16_arithmetic
ivec4 z4lane2 = (z4 % 8) / 4;
// bugihfa workaround. BUGFIX: lanes g/b/a must read from v1/v2/v3 as in the
// non-workaround path below, not v0.
if (z4m4.r == 0) v.r = v0[z4lane2.r].r;
if (z4m4.r == 1) v.r = v0[z4lane2.r].g;
if (z4m4.r == 2) v.r = v0[z4lane2.r].b;
if (z4m4.r == 3) v.r = v0[z4lane2.r].a;
if (z4m4.g == 0) v.g = v1[z4lane2.g].r;
if (z4m4.g == 1) v.g = v1[z4lane2.g].g;
if (z4m4.g == 2) v.g = v1[z4lane2.g].b;
if (z4m4.g == 3) v.g = v1[z4lane2.g].a;
if (z4m4.b == 0) v.b = v2[z4lane2.b].r;
if (z4m4.b == 1) v.b = v2[z4lane2.b].g;
if (z4m4.b == 2) v.b = v2[z4lane2.b].b;
if (z4m4.b == 3) v.b = v2[z4lane2.b].a;
if (z4m4.a == 0) v.a = v3[z4lane2.a].r;
if (z4m4.a == 1) v.a = v3[z4lane2.a].g;
if (z4m4.a == 2) v.a = v3[z4lane2.a].b;
if (z4m4.a == 3) v.a = v3[z4lane2.a].a;
// Plain path: each lane indexes its own source vector.
v.r = v0[(z4.r % 8) / 4][z4.r % 4];
v.g = v1[(z4.g % 8) / 4][z4.g % 4];
v.b = v2[(z4.b % 8) / 4][z4.b % 4];
v.a = v3[(z4.a % 8) / 4][z4.a % 4];
image3d_st4(top_blob, ivec3(gx, gy, gz), v);
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
// fp16-packed storage path.
ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 4 + (z4 % 8) / 2;
ivec4 lane2 = z4 % 2;
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);
buffer_st4(top_blob_data, gi, v);
// Scalar storage path.
ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 8 + z4 % 8;
buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
284
// dims == 4: crop along d/h/c; gy encodes (d, h) as yd * outh + yh, and the
// source row is recomputed from the d/h offsets. pack8 -> pack4.
// NOTE(review): enclosing braces, `afpvec4 v;` / `ivec4 z4m4 = z4 % 4;`
// declarations and the #if NCNN_image_shader / bugihfa / #else structural
// lines are not visible in this chunk; extraction residue removed.
else // if (psc(dims) == 4)
int yd = gy / psc(outh);
int yh = gy % psc(outh);
int x = gx + p.woffset;
int y = (yd + p.doffset) * psc(h) + (yh + p.hoffset);
ivec4 z4 = gz * 4 + p.coffset + ivec4(0, 1, 2, 3);
afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(x, y, z4.r / 8));
afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(x, y, z4.g / 8));
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(x, y, z4.b / 8));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(x, y, z4.a / 8));
#if NCNN_fp16_arithmetic
ivec4 z4lane2 = (z4 % 8) / 4;
// bugihfa workaround. BUGFIX: lanes g/b/a must read from v1/v2/v3 as in the
// non-workaround path below, not v0.
if (z4m4.r == 0) v.r = v0[z4lane2.r].r;
if (z4m4.r == 1) v.r = v0[z4lane2.r].g;
if (z4m4.r == 2) v.r = v0[z4lane2.r].b;
if (z4m4.r == 3) v.r = v0[z4lane2.r].a;
if (z4m4.g == 0) v.g = v1[z4lane2.g].r;
if (z4m4.g == 1) v.g = v1[z4lane2.g].g;
if (z4m4.g == 2) v.g = v1[z4lane2.g].b;
if (z4m4.g == 3) v.g = v1[z4lane2.g].a;
if (z4m4.b == 0) v.b = v2[z4lane2.b].r;
if (z4m4.b == 1) v.b = v2[z4lane2.b].g;
if (z4m4.b == 2) v.b = v2[z4lane2.b].b;
if (z4m4.b == 3) v.b = v2[z4lane2.b].a;
if (z4m4.a == 0) v.a = v3[z4lane2.a].r;
if (z4m4.a == 1) v.a = v3[z4lane2.a].g;
if (z4m4.a == 2) v.a = v3[z4lane2.a].b;
if (z4m4.a == 3) v.a = v3[z4lane2.a].a;
// Plain path: each lane indexes its own source vector.
v.r = v0[(z4.r % 8) / 4][z4.r % 4];
v.g = v1[(z4.g % 8) / 4][z4.g % 4];
v.b = v2[(z4.b % 8) / 4][z4.b % 4];
v.a = v3[(z4.a % 8) / 4][z4.a % 4];
image3d_st4(top_blob, ivec3(gx, gy, gz), v);
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
// fp16-packed storage path.
ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 4 + (z4 % 8) / 2;
ivec4 lane2 = z4 % 2;
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);
buffer_st4(top_blob_data, gi, v);
// Scalar storage path.
ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 8 + z4 % 8;
buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);