ncnn
178 строк · 6.7 Кб
1// Tencent is pleased to support the open source community by making ncnn available.
2//
3// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
4//
5// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6// in compliance with the License. You may obtain a copy of the License at
7//
8// https://opensource.org/licenses/BSD-3-Clause
9//
10// Unless required by applicable law or agreed to in writing, software distributed
11// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13// specific language governing permissions and limitations under the License.
14
15#version 450
16
17#if NCNN_fp16_storage
18#extension GL_EXT_shader_16bit_storage: require
19#endif
20#if NCNN_fp16_arithmetic
21#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
22#endif
23
24#extension GL_GOOGLE_include_directive: enable
25#include "vulkan_activation.comp"
26
// Specialization constants consumed by main().
// bias_term: main() loads and adds a bias value (indexed by gl_GlobalInvocationID.z) only when this equals 1.
27layout (constant_id = 0) const int bias_term = 0;
// activation_type, activation_param_0 and activation_param_1 are passed unchanged to activation_afp().
28layout (constant_id = 1) const int activation_type = 0;
29layout (constant_id = 2) const float activation_param_0 = 0;
30layout (constant_id = 3) const float activation_param_1 = 0;
31
// Ids 0-3 are taken by the constants above, so the shape-related ids start at 4.
32#define shape_constant_id_offset 4
// Shape values; main() reads every one of them through psc().
// NOTE(review): psc() is defined outside this file - presumably a value of 0 here means
// "use the push-constant member of the same name"; confirm against the shared shader preamble.
// c: exclusive upper bound for gl_GlobalInvocationID.z.
// cstep: element stride between the 16 consecutive tile values in the buffer path.
33layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
34layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;
35
// block_x / block_y: exclusive upper bounds for gl_GlobalInvocationID.x / .y; block_x is also the row pitch of the tile index.
36layout (constant_id = shape_constant_id_offset + 2) const int block_x = 0;
37layout (constant_id = shape_constant_id_offset + 3) const int block_y = 0;
38
// outw / outh: output extent used to guard the right-hand and lower stores in the buffer path.
// outcstep: per-z element stride of the output buffer.
39layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
40layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
41layout (constant_id = shape_constant_id_offset + 6) const int outcstep = 0;
42
// Resource bindings. The same three slots are used in both storage modes:
//   binding 0 - top_tm_blob: input, 16 values are read from it per invocation
//   binding 1 - top_blob:    output, up to four values are written per invocation
//   binding 2 - bias_blob:   bias values, read only when bias_term == 1
// unfp, sfp and imfmtc1 are not defined in this file.
// NOTE(review): presumably they come from the preamble injected when the shader is compiled - confirm.
43#if NCNN_image_shader
// Image mode: inputs are sampled 3D textures, the output is a write-only 3D image.
44layout (binding = 0) uniform unfp sampler3D top_tm_blob;
45layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
46layout (binding = 2) uniform unfp sampler3D bias_blob;
47#else
// Buffer mode: flat storage buffers of sfp elements.
48layout (binding = 0) readonly buffer top_tm_blob { sfp top_tm_blob_data[]; };
49layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
50layout (binding = 2) readonly buffer bias_blob { sfp bias_data[]; };
51#endif
52
// Push-constant block whose members mirror, name for name, the shape specialization constants.
// main() never reads p directly; every shape value is accessed through psc().
// NOTE(review): psc() is defined elsewhere - presumably it selects p.<name> when the
// specialization constant <name> is 0; confirm.
53layout (push_constant) uniform parameter
54{
// exclusive bound for gl_GlobalInvocationID.z and element stride between the 16 tile values
55int c;
56int cstep;
57
// exclusive bounds for gl_GlobalInvocationID.x / .y
58int block_x;
59int block_y;
60
// output extent and per-z output stride
61int outw;
62int outh;
63int outcstep;
64} p;
65
// Output-transform pass.
//
// Each invocation handles one tile: it reads 16 values from top_tm_blob,
// reduces them to a 2x2 result with the 2x4 matrix
//
//     itm = { { 1.0f, 1.0f,  1.0f, 0.0f },
//             { 0.0f, 1.0f, -1.0f, 1.0f } }
//
// applied along both axes of the 4x4 tile, optionally adds the bias indexed
// by gl_GlobalInvocationID.z, runs the activation and stores the 2x2 result
// into top_blob at (2 * x, 2 * y).
//
// NOTE(review): this looks like the Winograd F(2x2, 3x3) output transform -
// confirm against the host-side layer that dispatches this shader.
void main()
{
    const ivec3 gid = ivec3(gl_GlobalInvocationID);

    const int tiles_x = psc(block_x);
    const int tiles_y = psc(block_y);

    // invocations outside the tile grid or past the last z slice have no work
    const bool in_range = gid.x < tiles_x && gid.y < tiles_y && gid.z < psc(c);
    if (!in_range)
        return;

    // locate the tile in the transformed input
#if NCNN_image_shader
    const int tile_index = gid.y * tiles_x + gid.x;
#else
    const int tm_base = gid.z * tiles_x * tiles_y + gid.y * tiles_x + gid.x;
    const int tm_stride = psc(cstep);
#endif

    // load the 16 tile values; tm[4 * j + k] is element (j, k) of the 4x4 tile
    afp tm[16];
    for (int k = 0; k < 16; k++)
    {
#if NCNN_image_shader
        tm[k] = image3d_ld1(top_tm_blob, ivec3(tile_index, gid.z, k));
#else
        tm[k] = buffer_ld1(top_tm_blob_data, tm_base + k * tm_stride);
#endif
    }

    // first pass: apply both itm rows to every group of four consecutive values
    // (the results are kept per group, which transposes implicitly)
    afp row_a[4]; // itm row 0: t0 + t1 + t2
    afp row_b[4]; // itm row 1: t1 - t2 + t3
    for (int j = 0; j < 4; j++)
    {
        const int r = 4 * j;
        row_a[j] = tm[r] + tm[r + 1] + tm[r + 2];
        row_b[j] = tm[r + 1] - tm[r + 2] + tm[r + 3];
    }

    // second pass: apply itm again across the groups; the bias, when present,
    // is the left-most operand so the summation order stays fixed
    afp out00;
    afp out10;
    afp out01;
    afp out11;

    if (bias_term == 1)
    {
#if NCNN_image_shader
        const afp bias_value = image3d_ld1(bias_blob, ivec3(gid.z, 0, 0));
#else
        const afp bias_value = buffer_ld1(bias_data, gid.z);
#endif

        out00 = bias_value + row_a[0] + row_a[1] + row_a[2];
        out10 = bias_value + row_b[0] + row_b[1] + row_b[2];

        out01 = bias_value + row_a[1] - row_a[2] + row_a[3];
        out11 = bias_value + row_b[1] - row_b[2] + row_b[3];
    }
    else
    {
        out00 = row_a[0] + row_a[1] + row_a[2];
        out10 = row_b[0] + row_b[1] + row_b[2];

        out01 = row_a[1] - row_a[2] + row_a[3];
        out11 = row_b[1] - row_b[2] + row_b[3];
    }

    out00 = activation_afp(out00, activation_type, activation_param_0, activation_param_1);
    out10 = activation_afp(out10, activation_type, activation_param_0, activation_param_1);
    out01 = activation_afp(out01, activation_type, activation_param_0, activation_param_1);
    out11 = activation_afp(out11, activation_type, activation_param_0, activation_param_1);

    // store the 2x2 result; its top-left element lands at (2 * gid.x, 2 * gid.y)
    const int x = gid.x * 2;
    const int y = gid.y * 2;

#if NCNN_image_shader
    image3d_st1(top_blob, ivec3(x, y, gid.z), out00);
    image3d_st1(top_blob, ivec3(x + 1, y, gid.z), out01);
    image3d_st1(top_blob, ivec3(x, y + 1, gid.z), out10);
    image3d_st1(top_blob, ivec3(x + 1, y + 1, gid.z), out11);
#else
    const int upper_offset = gid.z * psc(outcstep) + y * psc(outw) + x;
    const int lower_offset = upper_offset + psc(outw);

    // the right column / lower row are skipped when they fall outside outw / outh
    const bool has_right = x + 1 < psc(outw);
    const bool has_lower = y + 1 < psc(outh);

    buffer_st1(top_blob_data, upper_offset + 0, out00);
    if (has_right) buffer_st1(top_blob_data, upper_offset + 1, out01);
    if (has_lower)
    {
        buffer_st1(top_blob_data, lower_offset + 0, out10);
        if (has_right) buffer_st1(top_blob_data, lower_offset + 1, out11);
    }
#endif
}
179