1
// Tencent is pleased to support the open source community by making ncnn available.
3
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
8
// https://opensource.org/licenses/BSD-3-Clause
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
18
// Require the 16-bit storage extension; with `require`, compilation fails if it is unsupported.
// NOTE(review): any #if guarding this directive is not visible in this excerpt -- confirm.
#extension GL_EXT_shader_16bit_storage: require
20
// Require explicit float16 arithmetic types only when NCNN_fp16_arithmetic is non-zero.
// NOTE(review): the matching #endif is not visible in this excerpt -- confirm it exists.
#if NCNN_fp16_arithmetic
21
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
24
// Specialization constants (ids 0 and 1), both defaulting to 0.
// The shader body compares each against 1 to pick which reduction branch computes sqsum.
layout (constant_id = 0) const int across_spatial = 0;
25
layout (constant_id = 1) const int across_channel = 0;
28
// Image variant of the interface: binding 0 is a 3D sampler read with texelFetch,
// binding 1 is a write-only single-channel r32f 3D image.
layout (binding = 0) uniform highp sampler3D square_blob;
29
layout (binding = 1, r32f) writeonly uniform highp image3D sqsum_blob;
31
// Buffer variant of the same two bindings, exposed as unsized float arrays.
// NOTE(review): both variants reuse bindings 0/1 and the block names square_blob/sqsum_blob,
// so a preprocessor conditional (not visible in this excerpt) must select one -- confirm.
layout (binding = 0) readonly buffer square_blob { float square_blob_data[]; };
32
layout (binding = 1) writeonly buffer sqsum_blob { float sqsum_blob_data[]; };
35
// Push-constant block named `parameter`.
// NOTE(review): its member list and instance name are not visible in this excerpt. The shader
// body reads w, c, cstep, outw, outh, outc and outcstep through `p.`, so presumably the
// instance is `p` -- confirm against the full file.
layout (push_constant) uniform parameter
50
// Convert the unsigned global invocation id into signed coordinates (gx, gy, gz).
int gx = int(gl_GlobalInvocationID.x);
51
int gy = int(gl_GlobalInvocationID.y);
52
int gz = int(gl_GlobalInvocationID.z);
54
// True for invocations that fall outside the p.outw x p.outh x p.outc extent.
// NOTE(review): the statement guarded by this `if` is not visible in this excerpt
// (presumably an early return) -- confirm.
if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
59
// Branch taken when both specialization constants are 1.
// Image path: texels are fetched from a neighbourhood of at most 2x2x2 anchored at
// (sx, sy, sz); the groups below read 1, 2, 4 or 8 texels.
// NOTE(review): the definitions of sx/sy/sz, the conditions choosing between the groups,
// the braces scoping each `float vN`, and the sums for the 1- and 2-texel groups are not
// visible in this excerpt -- presumably edge handling for odd extents; confirm.
if (across_spatial == 1 && across_channel == 1)
72
// Group: single texel at the anchor.
float v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0).r;
78
// Group: anchor plus its +x neighbour.
float v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0).r;
79
float v1 = texelFetch(square_blob, ivec3(sx + 1, sy, sz), 0).r;
88
// Group: anchor plus its +y neighbour.
float v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0).r;
89
float v2 = texelFetch(square_blob, ivec3(sx, sy + 1, sz), 0).r;
95
// Group: 2x2 texels in the xy plane at depth sz.
float v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0).r;
96
float v1 = texelFetch(square_blob, ivec3(sx + 1, sy, sz), 0).r;
97
float v2 = texelFetch(square_blob, ivec3(sx, sy + 1, sz), 0).r;
98
float v3 = texelFetch(square_blob, ivec3(sx + 1, sy + 1, sz), 0).r;
100
sqsum = v0 + v1 + v2 + v3;
110
// Group: anchor plus its +z neighbour.
float v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0).r;
111
float v4 = texelFetch(square_blob, ivec3(sx, sy, sz + 1), 0).r;
117
// Group: 2x2 texels in the xz plane at row sy.
float v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0).r;
118
float v1 = texelFetch(square_blob, ivec3(sx + 1, sy, sz), 0).r;
119
float v4 = texelFetch(square_blob, ivec3(sx, sy, sz + 1), 0).r;
120
float v5 = texelFetch(square_blob, ivec3(sx + 1, sy, sz + 1), 0).r;
122
sqsum = v0 + v1 + v4 + v5;
129
// Group: 2x2 texels in the yz plane at column sx.
float v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0).r;
130
float v2 = texelFetch(square_blob, ivec3(sx, sy + 1, sz), 0).r;
131
float v4 = texelFetch(square_blob, ivec3(sx, sy, sz + 1), 0).r;
132
float v6 = texelFetch(square_blob, ivec3(sx, sy + 1, sz + 1), 0).r;
134
sqsum = v0 + v2 + v4 + v6;
138
// Group: the full 2x2x2 neighbourhood, summed into sqsum.
float v0 = texelFetch(square_blob, ivec3(sx, sy, sz), 0).r;
139
float v1 = texelFetch(square_blob, ivec3(sx + 1, sy, sz), 0).r;
140
float v2 = texelFetch(square_blob, ivec3(sx, sy + 1, sz), 0).r;
141
float v3 = texelFetch(square_blob, ivec3(sx + 1, sy + 1, sz), 0).r;
142
float v4 = texelFetch(square_blob, ivec3(sx, sy, sz + 1), 0).r;
143
float v5 = texelFetch(square_blob, ivec3(sx + 1, sy, sz + 1), 0).r;
144
float v6 = texelFetch(square_blob, ivec3(sx, sy + 1, sz + 1), 0).r;
145
float v7 = texelFetch(square_blob, ivec3(sx + 1, sy + 1, sz + 1), 0).r;
147
sqsum = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
155
// Buffer path: v_offset0 holds two consecutive element indices, sz * p.cstep + sx and the
// one after it; v_offset1 is the same pair shifted by p.cstep.
ivec2 v_offset0 = sz * p.cstep + sx + ivec2(0, 1);
156
ivec2 v_offset1 = v_offset0 + p.cstep;
162
// Group: single element at v_offset0.r.
float v0 = square_blob_data[v_offset0.r];
168
// Group: the two consecutive elements of v_offset0.
float v0 = square_blob_data[v_offset0.r];
169
float v1 = square_blob_data[v_offset0.g];
178
// Group: first element of v_offset0 and the element p.cstep after it.
float v0 = square_blob_data[v_offset0.r];
179
float v2 = square_blob_data[v_offset1.r];
185
// Group: all four elements (both pairs), summed into sqsum.
float v0 = square_blob_data[v_offset0.r];
186
float v1 = square_blob_data[v_offset0.g];
187
float v2 = square_blob_data[v_offset1.r];
188
float v3 = square_blob_data[v_offset1.g];
190
sqsum = v0 + v1 + v2 + v3;
196
// Branch taken when across_spatial is 1 and across_channel is 0.
// Image path: texels are fetched from a neighbourhood of at most 2x2 in the xy plane,
// anchored at (sx, sy); the z coordinate is the invocation's own gz.
// NOTE(review): the definitions of sx/sy/sz, the conditions choosing between the groups,
// the braces scoping each `float vN`, and the sums for the 1- and 2-value groups are not
// visible in this excerpt -- confirm against the full file.
if (across_spatial == 1 && across_channel == 0)
206
// Group: single texel at the anchor.
float v0 = texelFetch(square_blob, ivec3(sx, sy, gz), 0).r;
212
// Group: anchor plus its +x neighbour.
float v0 = texelFetch(square_blob, ivec3(sx, sy, gz), 0).r;
213
float v1 = texelFetch(square_blob, ivec3(sx + 1, sy, gz), 0).r;
222
// Group: anchor plus its +y neighbour.
float v0 = texelFetch(square_blob, ivec3(sx, sy, gz), 0).r;
223
float v2 = texelFetch(square_blob, ivec3(sx, sy + 1, gz), 0).r;
229
// Group: the full 2x2 neighbourhood, summed into sqsum.
float v0 = texelFetch(square_blob, ivec3(sx, sy, gz), 0).r;
230
float v1 = texelFetch(square_blob, ivec3(sx + 1, sy, gz), 0).r;
231
float v2 = texelFetch(square_blob, ivec3(sx, sy + 1, gz), 0).r;
232
float v3 = texelFetch(square_blob, ivec3(sx + 1, sy + 1, gz), 0).r;
234
sqsum = v0 + v1 + v2 + v3;
241
// Buffer path: v_offset holds four consecutive element indices starting at sz * p.cstep + sx.
ivec4 v_offset = sz * p.cstep + sx + ivec4(0, 1, 2, 3);
245
// Group: only the first of the four elements is read.
// NOTE(review): the condition opening this if/else-if chain is not visible in this excerpt.
float v0 = square_blob_data[v_offset.r];
249
// Two elements are read when sx is the second-to-last index below p.w.
else if (sx == p.w - 2)
251
float v0 = square_blob_data[v_offset.r];
252
float v1 = square_blob_data[v_offset.g];
256
// Three elements are read when sx is the third-to-last index below p.w.
else if (sx == p.w - 3)
258
float v0 = square_blob_data[v_offset.r];
259
float v1 = square_blob_data[v_offset.g];
260
float v2 = square_blob_data[v_offset.b];
262
sqsum = v0 + v1 + v2;
266
// Group: all four consecutive elements, summed into sqsum.
float v0 = square_blob_data[v_offset.r];
267
float v1 = square_blob_data[v_offset.g];
268
float v2 = square_blob_data[v_offset.b];
269
float v3 = square_blob_data[v_offset.a];
271
sqsum = v0 + v1 + v2 + v3;
276
// Branch taken when across_spatial is 0 and across_channel is 1.
// Image path: up to four texels are fetched at the invocation's own (gx, gy), stepping
// through z from sz to sz + 3.
// NOTE(review): the definitions of sz/sx, the condition opening each if/else-if chain,
// the braces scoping each `float vN`, and the sums for the 1- and 2-value groups are not
// visible in this excerpt -- confirm against the full file.
if (across_spatial == 0 && across_channel == 1)
283
// Group: single texel at depth sz.
float v0 = texelFetch(square_blob, ivec3(gx, gy, sz), 0).r;
287
// Two texels are read when sz is the second-to-last index below p.c.
else if (sz == p.c - 2)
289
float v0 = texelFetch(square_blob, ivec3(gx, gy, sz), 0).r;
290
float v1 = texelFetch(square_blob, ivec3(gx, gy, sz + 1), 0).r;
294
// Three texels are read when sz is the third-to-last index below p.c.
else if (sz == p.c - 3)
296
float v0 = texelFetch(square_blob, ivec3(gx, gy, sz), 0).r;
297
float v1 = texelFetch(square_blob, ivec3(gx, gy, sz + 1), 0).r;
298
float v2 = texelFetch(square_blob, ivec3(gx, gy, sz + 2), 0).r;
300
sqsum = v0 + v1 + v2;
304
// Group: all four depths sz .. sz + 3, summed into sqsum.
float v0 = texelFetch(square_blob, ivec3(gx, gy, sz), 0).r;
305
float v1 = texelFetch(square_blob, ivec3(gx, gy, sz + 1), 0).r;
306
float v2 = texelFetch(square_blob, ivec3(gx, gy, sz + 2), 0).r;
307
float v3 = texelFetch(square_blob, ivec3(gx, gy, sz + 3), 0).r;
309
sqsum = v0 + v1 + v2 + v3;
315
// Buffer path: v_offset holds four element indices spaced p.cstep apart, i.e. offset sx
// within each of the slices sz, sz + 1, sz + 2 and sz + 3.
ivec4 v_offset = (sz + ivec4(0, 1, 2, 3)) * p.cstep + sx;
319
// Group: only the first of the four elements is read.
float v0 = square_blob_data[v_offset.r];
323
// Two elements are read when sz is the second-to-last index below p.c.
else if (sz == p.c - 2)
325
float v0 = square_blob_data[v_offset.r];
326
float v1 = square_blob_data[v_offset.g];
330
// Three elements are read when sz is the third-to-last index below p.c.
else if (sz == p.c - 3)
332
float v0 = square_blob_data[v_offset.r];
333
float v1 = square_blob_data[v_offset.g];
334
float v2 = square_blob_data[v_offset.b];
336
sqsum = v0 + v1 + v2;
340
// Group: all four elements, summed into sqsum.
float v0 = square_blob_data[v_offset.r];
341
float v1 = square_blob_data[v_offset.g];
342
float v2 = square_blob_data[v_offset.b];
343
float v3 = square_blob_data[v_offset.a];
345
sqsum = v0 + v1 + v2 + v3;
351
// Image variant: write sqsum, replicated into all four vec4 components, at texel (gx, gy, gz).
// NOTE(review): this store and the buffer store below are presumably alternatives selected
// by a preprocessor conditional that is not visible in this excerpt -- confirm.
imageStore(sqsum_blob, ivec3(gx, gy, gz), vec4(sqsum));
353
// Buffer variant: the flat output index is built from gz and gx only; gy does not contribute.
// NOTE(review): presumably rows are folded into the x dimension for buffer dispatch -- confirm.
int gi = gz * p.outcstep + gx;
355
sqsum_blob_data[gi] = sqsum;