// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#extension GL_EXT_shader_16bit_storage: require

// A pack8 element stored as two half4s (fp16 storage layout).
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };

#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
// NOTE(review): the matching #endif for this #if is on a line missing from
// this extracted fragment — restore it from the original file.
25
// Specialization constants baked in at pipeline creation time.
layout (constant_id = 0) const int group = 0;   // number of shuffle groups
layout (constant_id = 1) const int bugihfa = 0; // driver-workaround flag — presumably gates dynamic half-vector indexing; TODO confirm

// Input/output shape constants; when a dimension is 0 the shader falls back
// to the push-constant value via the psc() macro.
#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
42
// NOTE(review): the two binding sets below share binding points 0/1 and the
// names bottom_blob/top_blob; they are normally mutually exclusive under
// #if NCNN_image_shader / #else / #endif guards, which appear to have been
// lost in extraction — confirm and restore from the original file.
// Image variant: sampled input, storage-image output.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
// Buffer variant: pack8 elements addressed linearly.
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
49
// Push-constant mirror of the shape specialization constants, consumed via
// the psc() macro when shapes are dynamic.
// NOTE(review): the '{ ... }' member list of this uniform block — and the
// 'void main()' opening that follows it — are on lines missing from this
// extracted fragment; restore them from the original file.
layout (push_constant) uniform parameter
68
// One invocation handles one pack8 output element at (gx, gy, gz).
int gx = int(gl_GlobalInvocationID.x);
69
int gy = int(gl_GlobalInvocationID.y);
70
int gz = int(gl_GlobalInvocationID.z);
72
// Bounds check against the dispatch padding.
// NOTE(review): the 'return;' body of this if is on a missing line.
if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
75
// Scalar channels per shuffle group (the blob is pack8, hence * 8).
const int channels_g = psc(c) * 8 / psc(group);
77
// Low 4 scalar output channels handled by this invocation.
ivec4 gz4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3);
80
// Channel shuffle: src_channel = (dst % group) * channels_g + dst / group.
ivec4 z4 = (gz4 % psc(group)) * channels_g + (gz4 / psc(group));
81
// NOTE(review): 'gzz4' (the high 4 lanes, presumably gz4 + 4) is declared on
// a line missing from this fragment — confirm against the original file.
ivec4 zz4 = (gzz4 % psc(group)) * channels_g + (gzz4 / psc(group));
87
// Image-shader load path; 'sz8'/'szz8' (source z slices) are computed on
// missing lines, and the #if NCNN_image_shader guard separating this from
// the buffer path below is also missing.
afpvec8 vr = image3d_ld8(bottom_blob, ivec3(gx, gy, sz8.r));
88
afpvec8 vg = image3d_ld8(bottom_blob, ivec3(gx, gy, sz8.g));
89
afpvec8 vb = image3d_ld8(bottom_blob, ivec3(gx, gy, sz8.b));
90
afpvec8 va = image3d_ld8(bottom_blob, ivec3(gx, gy, sz8.a));
92
afpvec8 vvr = image3d_ld8(bottom_blob, ivec3(gx, gy, szz8.r));
93
afpvec8 vvg = image3d_ld8(bottom_blob, ivec3(gx, gy, szz8.g));
94
afpvec8 vvb = image3d_ld8(bottom_blob, ivec3(gx, gy, szz8.b));
95
afpvec8 vva = image3d_ld8(bottom_blob, ivec3(gx, gy, szz8.a));
97
// Buffer load path: linear offsets of the source pack8 elements.
ivec4 v_offset = (z4 / 8) * psc(cstep) + gy * psc(w) + gx;
98
ivec4 vv_offset = (zz4 / 8) * psc(cstep) + gy * psc(w) + gx;
102
afpvec8 vr = buffer_ld8(bottom_blob_data, v_offset.r);
103
afpvec8 vg = buffer_ld8(bottom_blob_data, v_offset.g);
104
afpvec8 vb = buffer_ld8(bottom_blob_data, v_offset.b);
105
afpvec8 va = buffer_ld8(bottom_blob_data, v_offset.a);
107
afpvec8 vvr = buffer_ld8(bottom_blob_data, vv_offset.r);
108
afpvec8 vvg = buffer_ld8(bottom_blob_data, vv_offset.g);
109
afpvec8 vvb = buffer_ld8(bottom_blob_data, vv_offset.b);
110
afpvec8 vva = buffer_ld8(bottom_blob_data, vv_offset.a);
113
// Which half4 of each source pack8 element holds the wanted scalar...
ivec4 sz4 = (z4 % 8) / 4;
114
ivec4 szz4 = (zz4 % 8) / 4;
116
// ...and which lane within that half4.
ivec4 lane4 = z4 % 4;
117
ivec4 lane8 = zz4 % 4;
120
#if NCNN_fp16_arithmetic
// fp16 arithmetic path: some drivers cannot index a half vector with a
// dynamic subscript (the 'bugihfa' workaround constant exists for this), so
// each output lane is selected with an explicit compare-and-assign chain
// instead of v[0].r = vr[sz4.r][lane4.r].
// NOTE(review): the declaration of 'v' (afpvec8) is on a line missing from
// this extracted fragment.
// BUG FIX: lanes .g/.b/.a previously gathered from vr (low half) and vvr
// (high half) only; they must read vg/vb/va and vvg/vvb/vva respectively to
// match the fp32 constructor in the #else branch below.
if (lane4.r == 0) v[0].r = vr[sz4.r].r;
if (lane4.r == 1) v[0].r = vr[sz4.r].g;
if (lane4.r == 2) v[0].r = vr[sz4.r].b;
if (lane4.r == 3) v[0].r = vr[sz4.r].a;
if (lane4.g == 0) v[0].g = vg[sz4.g].r;
if (lane4.g == 1) v[0].g = vg[sz4.g].g;
if (lane4.g == 2) v[0].g = vg[sz4.g].b;
if (lane4.g == 3) v[0].g = vg[sz4.g].a;
if (lane4.b == 0) v[0].b = vb[sz4.b].r;
if (lane4.b == 1) v[0].b = vb[sz4.b].g;
if (lane4.b == 2) v[0].b = vb[sz4.b].b;
if (lane4.b == 3) v[0].b = vb[sz4.b].a;
if (lane4.a == 0) v[0].a = va[sz4.a].r;
if (lane4.a == 1) v[0].a = va[sz4.a].g;
if (lane4.a == 2) v[0].a = va[sz4.a].b;
if (lane4.a == 3) v[0].a = va[sz4.a].a;
if (lane8.r == 0) v[1].r = vvr[szz4.r].r;
if (lane8.r == 1) v[1].r = vvr[szz4.r].g;
if (lane8.r == 2) v[1].r = vvr[szz4.r].b;
if (lane8.r == 3) v[1].r = vvr[szz4.r].a;
if (lane8.g == 0) v[1].g = vvg[szz4.g].r;
if (lane8.g == 1) v[1].g = vvg[szz4.g].g;
if (lane8.g == 2) v[1].g = vvg[szz4.g].b;
if (lane8.g == 3) v[1].g = vvg[szz4.g].a;
if (lane8.b == 0) v[1].b = vvb[szz4.b].r;
if (lane8.b == 1) v[1].b = vvb[szz4.b].g;
if (lane8.b == 2) v[1].b = vvb[szz4.b].b;
if (lane8.b == 3) v[1].b = vvb[szz4.b].a;
if (lane8.a == 0) v[1].a = vva[szz4.a].r;
if (lane8.a == 1) v[1].a = vva[szz4.a].g;
if (lane8.a == 2) v[1].a = vva[szz4.a].b;
if (lane8.a == 3) v[1].a = vva[szz4.a].a;
159
// fp32 / relaxed path: gather all eight lanes with dynamic vector indexing.
// NOTE(review): the '#else' that introduces this branch, and the '#endif'
// after it, are on lines missing from this extracted fragment.
v = afpvec8(vr[sz4.r][lane4.r], vg[sz4.g][lane4.g], vb[sz4.b][lane4.b], va[sz4.a][lane4.a], vvr[szz4.r][lane8.r], vvg[szz4.g][lane8.g], vvb[szz4.b][lane8.b], vva[szz4.a][lane8.a]);
163
// Image-shader store path (normally guarded by NCNN_image_shader — the
// guard lines are missing here; confirm against the original file).
image3d_st8(top_blob, ivec3(gx, gy, gz), v);
165
// Buffer store path: linear index of the output pack8 element.
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
167
buffer_st8(top_blob_data, gi, v);