// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
18
// NOTE(review): the stray numeric lines below ("19", "21", "22") look like
// extraction artifacts (original line numbers leaked into the content) --
// they are not valid GLSL and should be stripped when the file is restored.
// 16-bit storage capability: required so f16vec4 members may live in SSBOs.
#extension GL_EXT_shader_16bit_storage: require
19
// Packed pack8 storage element: two f16vec4 halves carry the 8 lanes (a..h).
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
21
// When fp16 arithmetic is enabled at shader-compile time, explicit float16
// arithmetic types are additionally required.
// NOTE(review): the matching #endif for this #if is not visible in this chunk.
#if NCNN_fp16_arithmetic
22
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
25
// Specialization constants configuring the padding operation.
// NOTE(review): the stray numeric lines interleaved below are extraction
// artifacts, not GLSL.
// type: padding border mode (constant / replicate / reflect) -- exact value
// mapping is selected by the host-side Padding layer; TODO confirm mapping.
layout (constant_id = 0) const int type = 1;
26
// value: scalar fill value used when padding with a constant.
layout (constant_id = 1) const float value = 0;
27
// per_channel_pad: non-zero selects per-channel pad values read from
// per_channel_pad_blob instead of the scalar `value` (see usage further down).
layout (constant_id = 2) const int per_channel_pad = 0;
29
// Shape constants start after the three operation constants above.
#define shape_constant_id_offset 3
30
// Input blob shape: dims, width, height, channels, and channel step (elements
// per channel slice in the flat buffer).
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
31
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
32
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
33
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
34
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
36
// Output blob shape, same layout as the input shape constants.
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
37
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
38
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
39
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
40
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
43
// Image-storage path: input and per-channel pad values are sampled 3D
// textures, output is a writeonly 3D image.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
44
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
45
layout (binding = 2) uniform unfp sampler3D per_channel_pad_blob;
47
// Buffer-storage path: input is pack1 (scalar sfp), output and per-channel
// pad values are pack8 (sfpvec8).
// NOTE(review): these reuse bindings 0-2 from the declarations above;
// presumably the image and buffer paths are mutually exclusive behind a
// preprocessor conditional (e.g. NCNN_image_shader) that is not visible in
// this chunk -- confirm against the full file.
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
48
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
49
layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec8 per_channel_pad_blob_data[]; };
52
layout (push_constant) uniform parameter
73
int gx = int(gl_GlobalInvocationID.x);
74
int gy = int(gl_GlobalInvocationID.y);
75
int gz = int(gl_GlobalInvocationID.z);
77
if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
82
ivec4 x4 = gx * 8 - p.left + ivec4(0, 1, 2, 3);
87
bvec4 mask = bvec4(uvec4(greaterThanEqual(x4, ivec4(0))) & uvec4(lessThan(x4, ivec4(psc(w)))));
88
bvec4 mask2 = bvec4(uvec4(greaterThanEqual(xx4, ivec4(0))) & uvec4(lessThan(xx4, ivec4(psc(w)))));
92
v[0].r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
93
v[0].g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
94
v[0].b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
95
v[0].a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));
96
v[1].r = image3d_ld1(bottom_blob, ivec3(xx4.r, 0, 0));
97
v[1].g = image3d_ld1(bottom_blob, ivec3(xx4.g, 0, 0));
98
v[1].b = image3d_ld1(bottom_blob, ivec3(xx4.b, 0, 0));
99
v[1].a = image3d_ld1(bottom_blob, ivec3(xx4.a, 0, 0));
101
v[0] = mix(afpvec4(value), v[0], mask);
102
v[1] = mix(afpvec4(value), v[1], mask2);
104
image3d_st8(top_blob, ivec3(gx, 0, 0), v);
107
// buffer_ld1 x4/xx4 index on vec returns zero on radv driver :(
108
// this is an inefficient workaround --- nihui
109
if (x4.r < 0 && xx4.a >= 0)
111
v[0].r = x4.r >= 0 ? buffer_ld1(bottom_blob_data, x4.r) : afp(value);
112
v[0].g = x4.g >= 0 ? buffer_ld1(bottom_blob_data, x4.g) : afp(value);
113
v[0].b = x4.b >= 0 ? buffer_ld1(bottom_blob_data, x4.b) : afp(value);
114
v[0].a = x4.a >= 0 ? buffer_ld1(bottom_blob_data, x4.a) : afp(value);
115
v[1].r = xx4.r >= 0 ? buffer_ld1(bottom_blob_data, xx4.r) : afp(value);
116
v[1].g = xx4.g >= 0 ? buffer_ld1(bottom_blob_data, xx4.g) : afp(value);
117
v[1].b = xx4.b >= 0 ? buffer_ld1(bottom_blob_data, xx4.b) : afp(value);
118
v[1].a = xx4.a >= 0 ? buffer_ld1(bottom_blob_data, xx4.a) : afp(value);
122
v[0].r = buffer_ld1(bottom_blob_data, x4.r);
123
v[0].g = buffer_ld1(bottom_blob_data, x4.g);
124
v[0].b = buffer_ld1(bottom_blob_data, x4.b);
125
v[0].a = buffer_ld1(bottom_blob_data, x4.a);
126
v[1].r = buffer_ld1(bottom_blob_data, xx4.r);
127
v[1].g = buffer_ld1(bottom_blob_data, xx4.g);
128
v[1].b = buffer_ld1(bottom_blob_data, xx4.b);
129
v[1].a = buffer_ld1(bottom_blob_data, xx4.a);
131
v[0] = mix(afpvec4(value), v[0], mask);
132
v[1] = mix(afpvec4(value), v[1], mask2);
135
buffer_st8(top_blob_data, gx, v);
140
x4 = clamp(x4, 0, psc(w) - 1);
141
xx4 = clamp(xx4, 0, psc(w) - 1);
145
v[0].r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
146
v[0].g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
147
v[0].b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
148
v[0].a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));
149
v[1].r = image3d_ld1(bottom_blob, ivec3(xx4.r, 0, 0));
150
v[1].g = image3d_ld1(bottom_blob, ivec3(xx4.g, 0, 0));
151
v[1].b = image3d_ld1(bottom_blob, ivec3(xx4.b, 0, 0));
152
v[1].a = image3d_ld1(bottom_blob, ivec3(xx4.a, 0, 0));
154
image3d_st8(top_blob, ivec3(gx, 0, 0), v);
156
buffer_cp1to8(top_blob_data, gx, bottom_blob_data, x4, xx4);
163
// NOTE psc(X) get zeros on nvidia
164
// TODO only enable this workaround for some nvidia driver
165
x4 = (p.w - 1) - abs(x4 - (p.w - 1));
166
xx4 = (p.w - 1) - abs(xx4 - (p.w - 1));
167
// x4 = (psc(w) - 1) - abs(x4 - (psc(w) - 1));
168
// xx4 = (psc(w) - 1) - abs(xx4 - (psc(w) - 1));
172
v[0].r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
173
v[0].g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
174
v[0].b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
175
v[0].a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));
176
v[1].r = image3d_ld1(bottom_blob, ivec3(xx4.r, 0, 0));
177
v[1].g = image3d_ld1(bottom_blob, ivec3(xx4.g, 0, 0));
178
v[1].b = image3d_ld1(bottom_blob, ivec3(xx4.b, 0, 0));
179
v[1].a = image3d_ld1(bottom_blob, ivec3(xx4.a, 0, 0));
181
image3d_st8(top_blob, ivec3(gx, 0, 0), v);
183
buffer_cp1to8(top_blob_data, gx, bottom_blob_data, x4, xx4);
187
else if (psc(dims) == 2)
189
const int gi = gy * psc(outw) + gx;
192
ivec4 y4 = gy * 8 - p.top + ivec4(0, 1, 2, 3);
197
bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w)) & (uvec4(greaterThanEqual(y4, ivec4(0))) & uvec4(lessThan(y4, ivec4(psc(h))))));
198
bvec4 mask2 = bvec4(uvec4(x >= 0 && x < psc(w)) & (uvec4(greaterThanEqual(yy4, ivec4(0))) & uvec4(lessThan(yy4, ivec4(psc(h))))));
202
v[0].r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
203
v[0].g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
204
v[0].b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
205
v[0].a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));
206
v[1].r = image3d_ld1(bottom_blob, ivec3(x, yy4.r, 0));
207
v[1].g = image3d_ld1(bottom_blob, ivec3(x, yy4.g, 0));
208
v[1].b = image3d_ld1(bottom_blob, ivec3(x, yy4.b, 0));
209
v[1].a = image3d_ld1(bottom_blob, ivec3(x, yy4.a, 0));
211
v[0] = mix(afpvec4(value), v[0], mask);
212
v[1] = mix(afpvec4(value), v[1], mask2);
214
image3d_st8(top_blob, ivec3(gx, gy, 0), v);
216
ivec4 v_offset = y4 * psc(w) + x;
217
ivec4 v_offset2 = yy4 * psc(w) + x;
220
v[0].r = buffer_ld1(bottom_blob_data, v_offset.r);
221
v[0].g = buffer_ld1(bottom_blob_data, v_offset.g);
222
v[0].b = buffer_ld1(bottom_blob_data, v_offset.b);
223
v[0].a = buffer_ld1(bottom_blob_data, v_offset.a);
224
v[1].r = buffer_ld1(bottom_blob_data, v_offset2.r);
225
v[1].g = buffer_ld1(bottom_blob_data, v_offset2.g);
226
v[1].b = buffer_ld1(bottom_blob_data, v_offset2.b);
227
v[1].a = buffer_ld1(bottom_blob_data, v_offset2.a);
229
v[0] = mix(afpvec4(value), v[0], mask);
230
v[1] = mix(afpvec4(value), v[1], mask2);
232
buffer_st8(top_blob_data, gi, v);
237
x = clamp(x, 0, psc(w) - 1);
238
y4 = clamp(y4, 0, psc(h) - 1);
239
yy4 = clamp(yy4, 0, psc(h) - 1);
243
v[0].r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
244
v[0].g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
245
v[0].b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
246
v[0].a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));
247
v[1].r = image3d_ld1(bottom_blob, ivec3(x, yy4.r, 0));
248
v[1].g = image3d_ld1(bottom_blob, ivec3(x, yy4.g, 0));
249
v[1].b = image3d_ld1(bottom_blob, ivec3(x, yy4.b, 0));
250
v[1].a = image3d_ld1(bottom_blob, ivec3(x, yy4.a, 0));
252
image3d_st8(top_blob, ivec3(gx, gy, 0), v);
254
ivec4 v_offset = y4 * psc(w) + x;
255
ivec4 v_offset2 = yy4 * psc(w) + x;
256
buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, v_offset2);
264
// NOTE psc(X) get zeros on nvidia
265
// TODO only enable this workaround for some nvidia driver
266
x = (p.w - 1) - abs(x - (p.w - 1));
267
y4 = (p.h - 1) - abs(y4 - (p.h - 1));
268
yy4 = (p.h - 1) - abs(yy4 - (p.h - 1));
269
// x = (psc(w) - 1) - abs(x - (psc(w) - 1));
270
// y4 = (psc(h) - 1) - abs(y4 - (psc(h) - 1));
271
// yy4 = (psc(h) - 1) - abs(yy4 - (psc(h) - 1));
275
v[0].r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
276
v[0].g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
277
v[0].b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
278
v[0].a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));
279
v[1].r = image3d_ld1(bottom_blob, ivec3(x, yy4.r, 0));
280
v[1].g = image3d_ld1(bottom_blob, ivec3(x, yy4.g, 0));
281
v[1].b = image3d_ld1(bottom_blob, ivec3(x, yy4.b, 0));
282
v[1].a = image3d_ld1(bottom_blob, ivec3(x, yy4.a, 0));
284
image3d_st8(top_blob, ivec3(gx, gy, 0), v);
286
ivec4 v_offset = y4 * psc(w) + x;
287
ivec4 v_offset2 = yy4 * psc(w) + x;
288
buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, v_offset2);
292
else // if (psc(dims) == 3)
294
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
298
ivec4 z4 = gz * 8 - p.front + ivec4(0, 1, 2, 3);
303
bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) & (uvec4(greaterThanEqual(z4, ivec4(0))) & uvec4(lessThan(z4, ivec4(psc(c))))));
304
bvec4 mask2 = bvec4(uvec4(x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) & (uvec4(greaterThanEqual(zz4, ivec4(0))) & uvec4(lessThan(zz4, ivec4(psc(c))))));
307
afpvec8 pad_value = per_channel_pad == 1 ? image3d_ld8(per_channel_pad_blob, ivec3(gz, 0, 0)) : afpvec8(afpvec4(value), afpvec4(value));
310
v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
311
v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
312
v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
313
v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));
314
v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, zz4.r));
315
v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, zz4.g));
316
v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, zz4.b));
317
v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, zz4.a));
319
v[0] = mix(pad_value[0], v[0], mask);
320
v[1] = mix(pad_value[1], v[1], mask2);
322
image3d_st8(top_blob, ivec3(gx, gy, gz), v);
324
afpvec8 pad_value = per_channel_pad == 1 ? buffer_ld8(per_channel_pad_blob_data, gz) : afpvec8(afpvec4(value), afpvec4(value));
326
ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
327
ivec4 v_offset2 = zz4 * psc(cstep) + y * psc(w) + x;
330
v[0].r = buffer_ld1(bottom_blob_data, v_offset.r);
331
v[0].g = buffer_ld1(bottom_blob_data, v_offset.g);
332
v[0].b = buffer_ld1(bottom_blob_data, v_offset.b);
333
v[0].a = buffer_ld1(bottom_blob_data, v_offset.a);
334
v[1].r = buffer_ld1(bottom_blob_data, v_offset2.r);
335
v[1].g = buffer_ld1(bottom_blob_data, v_offset2.g);
336
v[1].b = buffer_ld1(bottom_blob_data, v_offset2.b);
337
v[1].a = buffer_ld1(bottom_blob_data, v_offset2.a);
339
v[0] = mix(pad_value[0], v[0], mask);
340
v[1] = mix(pad_value[1], v[1], mask2);
342
buffer_st8(top_blob_data, gi, v);
347
x = clamp(x, 0, psc(w) - 1);
348
y = clamp(y, 0, psc(h) - 1);
349
z4 = clamp(z4, 0, psc(c) - 1);
350
zz4 = clamp(zz4, 0, psc(c) - 1);
354
v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
355
v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
356
v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
357
v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));
358
v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, zz4.r));
359
v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, zz4.g));
360
v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, zz4.b));
361
v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, zz4.a));
363
image3d_st8(top_blob, ivec3(gx, gy, gz), v);
365
ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
366
ivec4 v_offset2 = zz4 * psc(cstep) + y * psc(w) + x;
367
buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, v_offset2);
376
// NOTE psc(X) get zeros on nvidia
377
// TODO only enable this workaround for some nvidia driver
378
x = (p.w - 1) - abs(x - (p.w - 1));
379
y = (p.h - 1) - abs(y - (p.h - 1));
380
z4 = (p.c - 1) - abs(z4 - (p.c - 1));
381
zz4 = (p.c - 1) - abs(zz4 - (p.c - 1));
382
// x = (psc(w) - 1) - abs(x - (psc(w) - 1));
383
// y = (psc(h) - 1) - abs(y - (psc(h) - 1));
384
// z4 = (psc(c) - 1) - abs(z4 - (psc(c) - 1));
385
// zz4 = (psc(c) - 1) - abs(zz4 - (psc(c) - 1));
389
v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
390
v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
391
v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
392
v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));
393
v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, zz4.r));
394
v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, zz4.g));
395
v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, zz4.b));
396
v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, zz4.a));
398
image3d_st8(top_blob, ivec3(gx, gy, gz), v);
400
ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
401
ivec4 v_offset2 = zz4 * psc(cstep) + y * psc(w) + x;
402
buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, v_offset2);