1
// Tencent is pleased to support the open source community by making ncnn available.
3
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
8
// https://opensource.org/licenses/BSD-3-Clause
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
18
// Requires the 16-bit storage extension. Any preprocessor guard around this
// line is not visible in this excerpt.
#extension GL_EXT_shader_16bit_storage: require
20
// The explicit float16 arithmetic extension is only required when
// NCNN_fp16_arithmetic is set. NOTE(review): the matching #endif is not
// visible in this excerpt.
#if NCNN_fp16_arithmetic
21
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
24
// Specialization constants (both default to 0). The shader body compares them
// against 0/1 to pick one of three reduction paths.
layout (constant_id = 0) const int across_spatial = 0;
25
layout (constant_id = 1) const int across_channel = 0;
28
// square_blob / sqsum_blob are declared twice on bindings 0 and 1: once as a
// 3D sampler plus a write-only rgba32f 3D image, and once as vec4 storage buffers.
// NOTE(review): presumably the two forms are alternatives selected by a
// preprocessor conditional that is not visible in this excerpt - confirm.
layout (binding = 0) uniform highp sampler3D square_blob;
29
layout (binding = 1, rgba32f) writeonly uniform highp image3D sqsum_blob;
31
// Buffer form: input is read-only, output is write-only, both as vec4 arrays.
layout (binding = 0) readonly buffer square_blob { vec4 square_blob_data[]; };
32
layout (binding = 1) writeonly buffer sqsum_blob { vec4 sqsum_blob_data[]; };
35
// Push-constant block. Its member list and instance name are not visible in
// this excerpt; later code reads p.w, p.c, p.cstep, p.outw, p.outh, p.outc
// and p.outcstep.
layout (push_constant) uniform parameter
50
// Signed copies of this invocation's global ID, one per axis.
int gx = int(gl_GlobalInvocationID.x);
51
int gy = int(gl_GlobalInvocationID.y);
52
int gz = int(gl_GlobalInvocationID.z);
54
// True when this invocation lies outside the p.outw x p.outh x p.outc output extent.
// NOTE(review): the statement guarded by this test is not visible in this
// excerpt; presumably an early return - confirm.
if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
59
// Path taken when both specialization constants are 1.
// The declarations of sx, sy, sz and sqsum, and the conditions that choose
// between the fetch groups below, are not visible in this excerpt.
// Naming used by the texel fetches: v1 = +1 in x, v2 = +1 in y, v3 = +1 in x and y;
// v4..v7 are the same four positions at z + 1.
if (across_spatial == 1 && across_channel == 1)
72
// Single texel at (sx, sy, sz).
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0);
78
// Two texels: (sx, sy, sz) and its +x neighbour.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0);
79
vec4 v1 = texelFetch(square_blob, ivec3(sx + 1, sy, sz), 0);
88
// Two texels: (sx, sy, sz) and its +y neighbour.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0);
89
vec4 v2 = texelFetch(square_blob, ivec3(sx, sy + 1, sz), 0);
95
// Four texels: the 2x2 patch in x/y at depth sz.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0);
96
vec4 v1 = texelFetch(square_blob, ivec3(sx + 1, sy, sz), 0);
97
vec4 v2 = texelFetch(square_blob, ivec3(sx, sy + 1, sz), 0);
98
vec4 v3 = texelFetch(square_blob, ivec3(sx + 1, sy + 1, sz), 0);
100
sqsum = v0 + v1 + v2 + v3;
110
// Two texels: (sx, sy, sz) and its +z neighbour.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0);
111
vec4 v4 = texelFetch(square_blob, ivec3(sx, sy, sz + 1), 0);
117
// Four texels: the 2x2 patch in x/z at row sy.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0);
118
vec4 v1 = texelFetch(square_blob, ivec3(sx + 1, sy, sz), 0);
119
vec4 v4 = texelFetch(square_blob, ivec3(sx, sy, sz + 1), 0);
120
vec4 v5 = texelFetch(square_blob, ivec3(sx + 1, sy, sz + 1), 0);
122
sqsum = v0 + v1 + v4 + v5;
129
// Four texels: the 2x2 patch in y/z at column sx.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0);
130
vec4 v2 = texelFetch(square_blob, ivec3(sx, sy + 1, sz), 0);
131
vec4 v4 = texelFetch(square_blob, ivec3(sx, sy, sz + 1), 0);
132
vec4 v6 = texelFetch(square_blob, ivec3(sx, sy + 1, sz + 1), 0);
134
sqsum = v0 + v2 + v4 + v6;
138
// Eight texels: the full 2x2x2 neighbourhood starting at (sx, sy, sz).
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0);
139
vec4 v1 = texelFetch(square_blob, ivec3(sx + 1, sy, sz), 0);
140
vec4 v2 = texelFetch(square_blob, ivec3(sx, sy + 1, sz), 0);
141
vec4 v3 = texelFetch(square_blob, ivec3(sx + 1, sy + 1, sz), 0);
142
vec4 v4 = texelFetch(square_blob, ivec3(sx, sy, sz + 1), 0);
143
vec4 v5 = texelFetch(square_blob, ivec3(sx + 1, sy, sz + 1), 0);
144
vec4 v6 = texelFetch(square_blob, ivec3(sx, sy + 1, sz + 1), 0);
145
vec4 v7 = texelFetch(square_blob, ivec3(sx + 1, sy + 1, sz + 1), 0);
147
sqsum = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
155
// Buffer form. v_offset0 holds the linear indices of two adjacent elements
// (sx and sx + 1) at base sz * p.cstep; v_offset1 holds the same pair shifted
// by one p.cstep.
ivec2 v_offset0 = sz * p.cstep + sx + ivec2(0, 1);
156
ivec2 v_offset1 = v_offset0 + p.cstep;
162
// One element.
vec4 v0 = square_blob_data[v_offset0.r];
168
// Two adjacent elements from the first pair.
vec4 v0 = square_blob_data[v_offset0.r];
169
vec4 v1 = square_blob_data[v_offset0.g];
178
// First element of each pair (the two reads are p.cstep apart).
vec4 v0 = square_blob_data[v_offset0.r];
179
vec4 v2 = square_blob_data[v_offset1.r];
185
// All four elements of both pairs.
vec4 v0 = square_blob_data[v_offset0.r];
186
vec4 v1 = square_blob_data[v_offset0.g];
187
vec4 v2 = square_blob_data[v_offset1.r];
188
vec4 v3 = square_blob_data[v_offset1.g];
190
sqsum = v0 + v1 + v2 + v3;
196
// Path taken when across_spatial is 1 and across_channel is 0.
// The texel fetches all use depth gz; only x and y vary.
// The declarations of sx, sy, sz and sqsum, and some of the selecting
// conditions, are not visible in this excerpt.
if (across_spatial == 1 && across_channel == 0)
206
// Single texel at (sx, sy, gz).
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, gz), 0);
212
// Two texels: (sx, sy, gz) and its +x neighbour.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, gz), 0);
213
vec4 v1 = texelFetch(square_blob, ivec3(sx + 1, sy, gz), 0);
222
// Two texels: (sx, sy, gz) and its +y neighbour.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, gz), 0);
223
vec4 v2 = texelFetch(square_blob, ivec3(sx, sy + 1, gz), 0);
229
// Four texels: the 2x2 patch in x/y at depth gz.
vec4 v0 = texelFetch(square_blob, ivec3(sx, sy, gz), 0);
230
vec4 v1 = texelFetch(square_blob, ivec3(sx + 1, sy, gz), 0);
231
vec4 v2 = texelFetch(square_blob, ivec3(sx, sy + 1, gz), 0);
232
vec4 v3 = texelFetch(square_blob, ivec3(sx + 1, sy + 1, gz), 0);
234
sqsum = v0 + v1 + v2 + v3;
241
// Buffer form: linear indices of four consecutive elements starting at
// sz * p.cstep + sx.
ivec4 v_offset = sz * p.cstep + sx + ivec4(0, 1, 2, 3);
245
// One element. NOTE(review): the condition for this case is not visible;
// presumably sx == p.w - 1, by analogy with the tests below - confirm.
vec4 v0 = square_blob_data[v_offset.r];
249
// sx == p.w - 2: two elements are read.
else if (sx == p.w - 2)
251
vec4 v0 = square_blob_data[v_offset.r];
252
vec4 v1 = square_blob_data[v_offset.g];
256
// sx == p.w - 3: three elements are read.
else if (sx == p.w - 3)
258
vec4 v0 = square_blob_data[v_offset.r];
259
vec4 v1 = square_blob_data[v_offset.g];
260
vec4 v2 = square_blob_data[v_offset.b];
262
sqsum = v0 + v1 + v2;
266
// Remaining case: all four elements are read and summed.
vec4 v0 = square_blob_data[v_offset.r];
267
vec4 v1 = square_blob_data[v_offset.g];
268
vec4 v2 = square_blob_data[v_offset.b];
269
vec4 v3 = square_blob_data[v_offset.a];
271
sqsum = v0 + v1 + v2 + v3;
276
// Path taken when across_spatial is 0 and across_channel is 1.
// The texel fetches keep x = gx and y = gy fixed and step along z from sz.
// The declarations of sx, sz and sqsum, and some of the selecting
// conditions, are not visible in this excerpt.
if (across_spatial == 0 && across_channel == 1)
283
// One texel. NOTE(review): the condition for this case is not visible;
// presumably sz == p.c - 1, by analogy with the tests below - confirm.
vec4 v0 = texelFetch(square_blob, ivec3(gx, gy, sz), 0);
287
// sz == p.c - 2: two texels along z are read.
else if (sz == p.c - 2)
289
vec4 v0 = texelFetch(square_blob, ivec3(gx, gy, sz), 0);
290
vec4 v1 = texelFetch(square_blob, ivec3(gx, gy, sz + 1), 0);
294
// sz == p.c - 3: three texels along z are read.
else if (sz == p.c - 3)
296
vec4 v0 = texelFetch(square_blob, ivec3(gx, gy, sz), 0);
297
vec4 v1 = texelFetch(square_blob, ivec3(gx, gy, sz + 1), 0);
298
vec4 v2 = texelFetch(square_blob, ivec3(gx, gy, sz + 2), 0);
300
sqsum = v0 + v1 + v2;
304
// Remaining case: four texels along z are read and summed.
vec4 v0 = texelFetch(square_blob, ivec3(gx, gy, sz), 0);
305
vec4 v1 = texelFetch(square_blob, ivec3(gx, gy, sz + 1), 0);
306
vec4 v2 = texelFetch(square_blob, ivec3(gx, gy, sz + 2), 0);
307
vec4 v3 = texelFetch(square_blob, ivec3(gx, gy, sz + 3), 0);
309
sqsum = v0 + v1 + v2 + v3;
315
// Buffer form: four linear indices at column sx, spaced p.cstep apart,
// for sz, sz + 1, sz + 2 and sz + 3.
ivec4 v_offset = (sz + ivec4(0, 1, 2, 3)) * p.cstep + sx;
319
// One element. NOTE(review): the condition for this case is not visible;
// presumably sz == p.c - 1 - confirm.
vec4 v0 = square_blob_data[v_offset.r];
323
// sz == p.c - 2: two elements are read.
else if (sz == p.c - 2)
325
vec4 v0 = square_blob_data[v_offset.r];
326
vec4 v1 = square_blob_data[v_offset.g];
330
// sz == p.c - 3: three elements are read.
else if (sz == p.c - 3)
332
vec4 v0 = square_blob_data[v_offset.r];
333
vec4 v1 = square_blob_data[v_offset.g];
334
vec4 v2 = square_blob_data[v_offset.b];
336
sqsum = v0 + v1 + v2;
340
// Remaining case: all four elements are read and summed.
vec4 v0 = square_blob_data[v_offset.r];
341
vec4 v1 = square_blob_data[v_offset.g];
342
vec4 v2 = square_blob_data[v_offset.b];
343
vec4 v3 = square_blob_data[v_offset.a];
345
sqsum = v0 + v1 + v2 + v3;
351
// Image form: write the accumulated vec4 to the output texel (gx, gy, gz).
// NOTE(review): this store and the buffer store below are presumably
// alternatives under a preprocessor conditional not visible here - confirm.
imageStore(sqsum_blob, ivec3(gx, gy, gz), sqsum);
353
// Buffer form: the linear output index is built from gz and gx only; gy is
// not part of it.
int gi = gz * p.outcstep + gx;
355
sqsum_blob_data[gi] = sqsum;