ncnn

Форк
0
/
padding_pack1to4.comp 
323 строки · 11.3 Кб
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Optional 16-bit extensions; each is required only when the matching
// NCNN_* preprocessor switch (set outside this file) is non-zero.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif
// Padding configuration.
// type selects the mode used by main():
//   0 - out-of-range positions are filled with a constant,
//   1 - coordinates are clamped to the input extent,
//   2 - coordinates are reflected about the first and last element
//       (the edge element itself is not repeated).
layout (constant_id = 0) const int type = 1;
// Fill value used by type 0.
layout (constant_id = 1) const float value = 0;
// When 1, the type-0 path of the 3-D case reads its fill value from
// per_channel_pad_blob instead of using `value`.
layout (constant_id = 2) const int per_channel_pad = 0;

// Input and output shape constants, read in main() through psc().
// NOTE(review): psc() is not defined in this file; presumably it falls back
// to the push constant of the same name when the constant is 0 - confirm.
#define shape_constant_id_offset 3
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
// Resource bindings:
//   0 - input, read one scalar element at a time,
//   1 - output, written as 4-component vectors,
//   2 - per-channel fill values, read as 4-component vectors.
// NOTE(review): unfp, imfmtc4, sfp and sfpvec4 are not defined in this file;
// presumably precision-dependent type macros supplied by the build - confirm.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D per_channel_pad_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec4 per_channel_pad_blob_data[]; };
#endif
// Runtime parameters: the input shape, the output shape, and the padding
// offsets that main() subtracts from the unpacked output coordinates.
layout (push_constant) uniform parameter
{
    // Input shape.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // Output shape.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    // Offsets along x (left), y (top) and z (front).
    int left;
    int top;
    int front;
} p;
// Pads a scalar (pack-1) input into a 4-packed output.
//
// Each invocation (gx, gy, gz) gathers four consecutive scalar input elements
// along the packed axis (x when dims == 1, y when dims == 2, z otherwise) and
// stores them as one 4-component vector at the same output position.
//
//   type 0: positions outside the input receive `value` (in the 3-D case the
//           per-channel value when per_channel_pad == 1)
//   type 1: coordinates are clamped to the input extent
//   type 2: coordinates are reflected about the first and last element
//
// mix() with a bvec4 selector keeps `v` where the mask is true and takes the
// fill value elsewhere.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent have nothing to write.
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    if (psc(dims) == 1)
    {
        // Input x coordinates of the four lanes packed into output element gx.
        ivec4 x4 = gx * 4 - p.left + ivec4(0, 1, 2, 3);

        if (type == 0)
        {
            // True for lanes whose source coordinate lies inside [0, w).
            bvec4 mask = bvec4(uvec4(greaterThanEqual(x4, ivec4(0))) & uvec4(lessThan(x4, ivec4(psc(w)))));

#if NCNN_image_shader
            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
            v.g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
            v.b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
            v.a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));

            v = mix(afpvec4(value), v, mask);

            image3d_st4(top_blob, ivec3(gx, 0, 0), v);
#else
            afpvec4 v;
            // buffer_ld1 x4 index on vec returns zero on radv driver  :(
            // this is an inefficient workaround  --- nihui
            if (x4.r < 0 && x4.a >= 0)
            {
                // The four lanes straddle the left edge. Select per lane with
                // the full in-range mask rather than only `>= 0`, so that
                // lanes at or beyond w (possible when w < 4) receive the fill
                // value instead of an out-of-range load.
                v.r = mask.r ? buffer_ld1(bottom_blob_data, x4.r) : afp(value);
                v.g = mask.g ? buffer_ld1(bottom_blob_data, x4.g) : afp(value);
                v.b = mask.b ? buffer_ld1(bottom_blob_data, x4.b) : afp(value);
                v.a = mask.a ? buffer_ld1(bottom_blob_data, x4.a) : afp(value);
            }
            else
            {
                v.r = buffer_ld1(bottom_blob_data, x4.r);
                v.g = buffer_ld1(bottom_blob_data, x4.g);
                v.b = buffer_ld1(bottom_blob_data, x4.b);
                v.a = buffer_ld1(bottom_blob_data, x4.a);

                v = mix(afpvec4(value), v, mask);
            }

            buffer_st4(top_blob_data, gx, v);
#endif
        }
        if (type == 1)
        {
            // Clamp every lane to the valid input range.
            x4 = clamp(x4, 0, psc(w) - 1);

#if NCNN_image_shader
            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
            v.g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
            v.b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
            v.a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));

            image3d_st4(top_blob, ivec3(gx, 0, 0), v);
#else
            buffer_cp1to4(top_blob_data, gx, bottom_blob_data, x4);
#endif
        }
        if (type == 2)
        {
            // Reflect about element 0, then about element w - 1.
            x4 = abs(x4);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            x4 = (p.w - 1) - abs(x4 - (p.w - 1));
//             x4 = (psc(w) - 1) - abs(x4 - (psc(w) - 1));

#if NCNN_image_shader
            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
            v.g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
            v.b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
            v.a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));

            image3d_st4(top_blob, ivec3(gx, 0, 0), v);
#else
            buffer_cp1to4(top_blob_data, gx, bottom_blob_data, x4);
#endif
        }
    }
    else if (psc(dims) == 2)
    {
        // Linear index of the output element.
        const int gi = gy * psc(outw) + gx;

        // Source column, and the four source rows packed into output row gy.
        int x = gx - p.left;
        ivec4 y4 = gy * 4 - p.top + ivec4(0, 1, 2, 3);

        if (type == 0)
        {
            // A lane is valid when both its column and its row are in range.
            bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w)) & (uvec4(greaterThanEqual(y4, ivec4(0))) & uvec4(lessThan(y4, ivec4(psc(h))))));

#if NCNN_image_shader
            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
            v.g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
            v.b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
            v.a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));

            v = mix(afpvec4(value), v, mask);

            image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
            ivec4 v_offset = y4 * psc(w) + x;

            afpvec4 v;
            v.r = buffer_ld1(bottom_blob_data, v_offset.r);
            v.g = buffer_ld1(bottom_blob_data, v_offset.g);
            v.b = buffer_ld1(bottom_blob_data, v_offset.b);
            v.a = buffer_ld1(bottom_blob_data, v_offset.a);

            v = mix(afpvec4(value), v, mask);

            buffer_st4(top_blob_data, gi, v);
#endif
        }
        if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);
            y4 = clamp(y4, 0, psc(h) - 1);

#if NCNN_image_shader
            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
            v.g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
            v.b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
            v.a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));

            image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
            ivec4 v_offset = y4 * psc(w) + x;
            buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
        }
        if (type == 2)
        {
            x = abs(x);
            y4 = abs(y4);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            x = (p.w - 1) - abs(x - (p.w - 1));
            y4 = (p.h - 1) - abs(y4 - (p.h - 1));
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
//             y4 = (psc(h) - 1) - abs(y4 - (psc(h) - 1));

#if NCNN_image_shader
            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
            v.g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
            v.b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
            v.a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));

            image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
            ivec4 v_offset = y4 * psc(w) + x;
            buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
        }
    }
    else // if (psc(dims) == 3)
    {
        // Linear index of the output element.
        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

        // Source column and row, and the four source channels packed into
        // output channel gz.
        int x = gx - p.left;
        int y = gy - p.top;
        ivec4 z4 = gz * 4 - p.front + ivec4(0, 1, 2, 3);

        if (type == 0)
        {
            // A lane is valid when column, row and channel are all in range.
            bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) & (uvec4(greaterThanEqual(z4, ivec4(0))) & uvec4(lessThan(z4, ivec4(psc(c))))));

#if NCNN_image_shader
            // Fill value: per-channel vector for this output channel, or the
            // single constant.
            afpvec4 pad_value = per_channel_pad == 1 ? image3d_ld4(per_channel_pad_blob, ivec3(gz, 0, 0)) : afpvec4(value);

            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
            v.g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
            v.b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
            v.a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));

            v = mix(pad_value, v, mask);

            image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
            // Fill value: per-channel vector for this output channel, or the
            // single constant.
            afpvec4 pad_value = per_channel_pad == 1 ? buffer_ld4(per_channel_pad_blob_data, gz) : afpvec4(value);

            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;

            afpvec4 v;
            v.r = buffer_ld1(bottom_blob_data, v_offset.r);
            v.g = buffer_ld1(bottom_blob_data, v_offset.g);
            v.b = buffer_ld1(bottom_blob_data, v_offset.b);
            v.a = buffer_ld1(bottom_blob_data, v_offset.a);

            v = mix(pad_value, v, mask);

            buffer_st4(top_blob_data, gi, v);
#endif
        }
        if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);
            y = clamp(y, 0, psc(h) - 1);
            z4 = clamp(z4, 0, psc(c) - 1);

#if NCNN_image_shader
            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
            v.g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
            v.b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
            v.a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));

            image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
            buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
        }
        if (type == 2)
        {
            x = abs(x);
            y = abs(y);
            z4 = abs(z4);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            x = (p.w - 1) - abs(x - (p.w - 1));
            y = (p.h - 1) - abs(y - (p.h - 1));
            z4 = (p.c - 1) - abs(z4 - (p.c - 1));
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
//             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
//             z4 = (psc(c) - 1) - abs(z4 - (psc(c) - 1));

#if NCNN_image_shader
            afpvec4 v;
            v.r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
            v.g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
            v.b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
            v.a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));

            image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
            buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
        }
    }
}
324

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.