ncnn

Форк
0
/
padding_pack4.comp 
383 строки · 11.7 Кб
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Optional 16-bit extensions. Each one is required only when the matching
// NCNN_* preprocessor macro is non-zero; those macros are defined outside
// this file.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Padding mode selector. main() has branches for the values 0, 1 and 2.
layout (constant_id = 0) const int type = 1;
// Fill value that mode 0 writes for output positions outside the source extent.
layout (constant_id = 1) const float value = 0;
// When 1, mode 0 in the 3-D branch fills from per_channel_pad_blob instead of `value`.
layout (constant_id = 2) const int per_channel_pad = 0;

// First constant_id used by the shape constants below.
#define shape_constant_id_offset 3
// Source blob shape.
// NOTE(review): the defaults are 0; presumably that means "not specialized" and
// psc() then falls back to the push-constant copy - psc() is defined elsewhere, confirm.
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

// Destination blob shape.
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

// Resource bindings. Binding 0 is the source, binding 1 the (write-only)
// destination, binding 2 the per-channel fill data. The representation is
// chosen at preprocessing time by NCNN_image_shader.
// NOTE(review): unfp, imfmtc4 and sfpvec4 are macros defined outside this file.
#if NCNN_image_shader
// Image path: 3-D samplers for the inputs and a 3-D image for the output.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D per_channel_pad_blob;
#else
// Buffer path: unsized arrays of sfpvec4 elements addressed by a flat index.
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec4 per_channel_pad_blob_data[]; };
#endif

// Push-constant block, accessed as `p`. It carries the same ten shape fields
// as the shape specialization constants plus the three pad offsets.
layout (push_constant) uniform parameter
{
    // Source blob shape.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // Destination blob shape.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    // Pad offsets subtracted from the output coordinate in main():
    // left from gx, top from gy, front from gz.
    int left;
    int top;
    int front;
} p;

// Entry point. Each invocation writes one 4-lane element of top_blob at the
// output coordinate (gx, gy, gz), taken from bottom_blob according to `type`:
//   type 0 - copy when the source coordinate is inside the source extent,
//            otherwise write the fill value (or the per-channel fill in 3-D);
//   type 1 - clamp the source coordinate to the valid index range;
//   type 2 - mirror the source coordinate back into the valid index range.
// The branch on dims decides which axis has its pad offset divided by 4:
// x when dims is 1, y when dims is 2, z otherwise. For types 1 and 2 the
// lanes of elements lying in the padded region of that axis are rebuilt.
// NOTE(review): psc() and the buffer_* / image3d_* helpers are defined outside
// this file; psc(X) presumably resolves to either the specialization constant
// X or p.X - confirm. The lane shuffles assume the pad offset on the divided
// axis is a multiple of 4 - confirm against the caller.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent have nothing to write.
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    if (psc(dims) == 1)
    {
        // Source index along x, in 4-lane elements.
        int x = gx - p.left / 4;

        if (type == 0)
        {
            if (x >= 0 && x < psc(w))
            {
#if NCNN_image_shader
                image3d_cp4(top_blob, ivec3(gx, 0, 0), bottom_blob, ivec3(x, 0, 0));
#else
                buffer_cp4(top_blob_data, gx, bottom_blob_data, x);
#endif
            }
            else
            {
                afpvec4 v = afpvec4(value);
#if NCNN_image_shader
                image3d_st4(top_blob, ivec3(gx, 0, 0), v);
#else
                buffer_st4(top_blob_data, gx, v);
#endif
            }
        }
        if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);

#if NCNN_image_shader
            afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, 0, 0));

            // Before the source: repeat the first lane of the first element.
            // After the source: repeat the last lane of the last element.
            if (gx < p.left / 4)
                v = afpvec4(v.r);
            else if (gx >= psc(w) + p.left / 4)
                v = afpvec4(v.a);

            image3d_st4(top_blob, ivec3(gx, 0, 0), v);
#else
            afpvec4 v = buffer_ld4(bottom_blob_data, x);

            // Before the source: repeat the first lane of the first element.
            // After the source: repeat the last lane of the last element.
            if (gx < p.left / 4)
                v = afpvec4(v.r);
            else if (gx >= psc(w) + p.left / 4)
                v = afpvec4(v.a);

            buffer_st4(top_blob_data, gx, v);
#endif
        }
        if (type == 2)
        {
            // Mirror about index 0, then about index w - 1.
            x = abs(x);
            // NOTE: psc(X) yields zero on NVIDIA, so the push constant is read directly.
            // TODO: only enable this workaround for the affected NVIDIA drivers.
            x = (p.w - 1) - abs(x - (p.w - 1));
            // Form to use once the workaround is no longer needed:
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));

#if NCNN_image_shader
            afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, 0, 0));

            if (gx < p.left / 4)
            {
                // Keep lane r of this element; the other three lanes come from
                // the previous element, read back to front.
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x - 1, 0, 0));
                v = afpvec4(v.r, v0.a, v0.b, v0.g);
            }
            else if (gx >= psc(w) + p.left / 4)
            {
                // Keep lane a of this element; the other three lanes come from
                // the next element, read back to front.
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x + 1, 0, 0));
                v = afpvec4(v1.b, v1.g, v1.r, v.a);
            }

            image3d_st4(top_blob, ivec3(gx, 0, 0), v);
#else
            afpvec4 v = buffer_ld4(bottom_blob_data, x);

            if (gx < p.left / 4)
            {
                // Keep lane r of this element; the other three lanes come from
                // the previous element, read back to front.
                afpvec4 v0 = buffer_ld4(bottom_blob_data, x - 1);
                v = afpvec4(v.r, v0.a, v0.b, v0.g);
            }
            else if (gx >= psc(w) + p.left / 4)
            {
                // Keep lane a of this element; the other three lanes come from
                // the next element, read back to front.
                afpvec4 v1 = buffer_ld4(bottom_blob_data, x + 1);
                v = afpvec4(v1.b, v1.g, v1.r, v.a);
            }

            buffer_st4(top_blob_data, gx, v);
#endif
        }
    }
    else if (psc(dims) == 2)
    {
        // Flat output index for the buffer path.
        const int gi = gy * psc(outw) + gx;

        // x uses the pad offset as is; y uses it divided by 4.
        int x = gx - p.left;
        int y = gy - p.top / 4;

        if (type == 0)
        {
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h))
            {
#if NCNN_image_shader
                image3d_cp4(top_blob, ivec3(gx, gy, 0), bottom_blob, ivec3(x, y, 0));
#else
                int v_offset = y * psc(w) + x;
                buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
            }
            else
            {
                afpvec4 v = afpvec4(value);
#if NCNN_image_shader
                image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
                buffer_st4(top_blob_data, gi, v);
#endif
            }
        }
        if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);
            y = clamp(y, 0, psc(h) - 1);

#if NCNN_image_shader
            afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, 0));

            // Above the source rows: repeat lane r. Below them: repeat lane a.
            if (gy < p.top / 4)
                v = afpvec4(v.r);
            else if (gy >= psc(h) + p.top / 4)
                v = afpvec4(v.a);

            image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
            int v_offset = y * psc(w) + x;

            afpvec4 v = buffer_ld4(bottom_blob_data, v_offset);

            // Above the source rows: repeat lane r. Below them: repeat lane a.
            if (gy < p.top / 4)
                v = afpvec4(v.r);
            else if (gy >= psc(h) + p.top / 4)
                v = afpvec4(v.a);

            buffer_st4(top_blob_data, gi, v);
#endif
        }
        if (type == 2)
        {
            // Mirror about index 0, then about the last index of each axis.
            x = abs(x);
            y = abs(y);
            // NOTE: psc(X) yields zero on NVIDIA, so the push constants are read directly.
            // TODO: only enable this workaround for the affected NVIDIA drivers.
            x = (p.w - 1) - abs(x - (p.w - 1));
            y = (p.h - 1) - abs(y - (p.h - 1));
            // Form to use once the workaround is no longer needed:
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
//             y = (psc(h) - 1) - abs(y - (psc(h) - 1));

#if NCNN_image_shader
            afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, 0));

            if (gy < p.top / 4)
            {
                // Keep lane r of this row; the other lanes come from the
                // previous row, read back to front.
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y - 1, 0));
                v = afpvec4(v.r, v0.a, v0.b, v0.g);
            }
            else if (gy >= psc(h) + p.top / 4)
            {
                // Keep lane a of this row; the other lanes come from the
                // next row, read back to front.
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y + 1, 0));
                v = afpvec4(v1.b, v1.g, v1.r, v.a);
            }

            image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
            int v_offset = y * psc(w) + x;

            afpvec4 v = buffer_ld4(bottom_blob_data, v_offset);

            if (gy < p.top / 4)
            {
                // One row earlier is psc(w) elements back in the flat buffer.
                afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset - psc(w));
                v = afpvec4(v.r, v0.a, v0.b, v0.g);
            }
            else if (gy >= psc(h) + p.top / 4)
            {
                // One row later is psc(w) elements forward in the flat buffer.
                afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset + psc(w));
                v = afpvec4(v1.b, v1.g, v1.r, v.a);
            }

            buffer_st4(top_blob_data, gi, v);
#endif
        }
    }
    else // if (psc(dims) == 3)
    {
        // Flat output index for the buffer path.
        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

        // x and y use their pad offsets as is; z uses its offset divided by 4.
        int x = gx - p.left;
        int y = gy - p.top;
        int z = gz - p.front / 4;

        if (type == 0)
        {
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c))
            {
#if NCNN_image_shader
                image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z));
#else
                int v_offset = z * psc(cstep) + y * psc(w) + x;
                buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
            }
            else if (per_channel_pad == 1)
            {
                // Fill from the per-channel blob, indexed by the output z coordinate.
#if NCNN_image_shader
                image3d_cp4(top_blob, ivec3(gx, gy, gz), per_channel_pad_blob, ivec3(gz, 0, 0));
#else
                buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz);
#endif
            }
            else
            {
                afpvec4 v = afpvec4(value);
#if NCNN_image_shader
                image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
                buffer_st4(top_blob_data, gi, v);
#endif
            }
        }
        if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);
            y = clamp(y, 0, psc(h) - 1);
            z = clamp(z, 0, psc(c) - 1);

#if NCNN_image_shader
            afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, z));

            // In front of the source: repeat lane r. Behind it: repeat lane a.
            if (gz < p.front / 4)
                v = afpvec4(v.r);
            else if (gz >= psc(c) + p.front / 4)
                v = afpvec4(v.a);

            image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
            int v_offset = z * psc(cstep) + y * psc(w) + x;

            afpvec4 v = buffer_ld4(bottom_blob_data, v_offset);

            // In front of the source: repeat lane r. Behind it: repeat lane a.
            if (gz < p.front / 4)
                v = afpvec4(v.r);
            else if (gz >= psc(c) + p.front / 4)
                v = afpvec4(v.a);

            buffer_st4(top_blob_data, gi, v);
#endif
        }
        if (type == 2)
        {
            // Mirror about index 0, then about the last index of each axis.
            x = abs(x);
            y = abs(y);
            z = abs(z);
            // NOTE: psc(X) yields zero on NVIDIA, so the push constants are read directly.
            // TODO: only enable this workaround for the affected NVIDIA drivers.
            x = (p.w - 1) - abs(x - (p.w - 1));
            y = (p.h - 1) - abs(y - (p.h - 1));
            z = (p.c - 1) - abs(z - (p.c - 1));
            // Form to use once the workaround is no longer needed:
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
//             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
//             z = (psc(c) - 1) - abs(z - (psc(c) - 1));

#if NCNN_image_shader
            afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, z));

            if (gz < p.front / 4)
            {
                // Keep lane r of this slice; the other lanes come from the
                // previous slice, read back to front.
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z - 1));
                v = afpvec4(v.r, v0.a, v0.b, v0.g);
            }
            else if (gz >= psc(c) + p.front / 4)
            {
                // Keep lane a of this slice; the other lanes come from the
                // next slice, read back to front.
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z + 1));
                v = afpvec4(v1.b, v1.g, v1.r, v.a);
            }

            image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
            int v_offset = z * psc(cstep) + y * psc(w) + x;

            afpvec4 v = buffer_ld4(bottom_blob_data, v_offset);

            if (gz < p.front / 4)
            {
                // One slice earlier is psc(cstep) elements back in the flat buffer.
                afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset - psc(cstep));
                v = afpvec4(v.r, v0.a, v0.b, v0.g);
            }
            else if (gz >= psc(c) + p.front / 4)
            {
                // One slice later is psc(cstep) elements forward in the flat buffer.
                afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset + psc(cstep));
                v = afpvec4(v1.b, v1.g, v1.r, v.a);
            }

            buffer_st4(top_blob_data, gi, v);
#endif
        }
    }
}

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.