ncnn

Форк
0
/
padding_pack8to4.comp 
451 строка · 15.2 Кб
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Optional 16-bit float support, enabled by the build-time NCNN_* switches.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Padding mode as used by main():
//   0 - outside elements are filled with `value` (or the per-channel pad value)
//   1 - outside elements replicate the nearest edge element
//   2 - outside elements mirror the input around its edges
layout (constant_id = 0) const int type = 1;
// Fill value for type == 0.
layout (constant_id = 1) const float value = 0;
// When 1, main() reads the type == 0 fill value for 3-D blobs from per_channel_pad_blob.
layout (constant_id = 2) const int per_channel_pad = 0;

// Shape of the input (bottom) blob as specialization constants.
// NOTE(review): psc() is defined outside this file; presumably it falls back to the
// push constant of the same name when the specialization constant is 0 - confirm.
#define shape_constant_id_offset 3
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

// Shape of the output (top) blob as specialization constants.
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

// Resource bindings: 0 = input, 1 = output, 2 = per-channel pad values.
// Every resource is accessed as 4-component vectors, either through 3-D images
// or through storage buffers depending on NCNN_image_shader.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D per_channel_pad_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec4 per_channel_pad_blob_data[]; };
#endif

// Runtime parameters: input shape, output shape, and the leading pad amounts that
// main() subtracts from the output coordinate (left -> x, top -> y, front -> z).
layout (push_constant) uniform parameter
{
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    int left;
    int top;
    int front;
} p;
69

70
// Padding kernel: each invocation writes one 4-component vector of top_blob at
// output coordinate (gx, gy, gz).
//
// Addressing used throughout:
//   * The outermost axis of the blob (x for dims == 1, y for dims == 2, z for
//     dims == 3) is counted in 4-component units on the output side. The input
//     stores two consecutive 4-component vectors per element along that axis, so
//     the source vector for packed coordinate k is "element k / 2, half k % 2"
//     and the valid packed range is [0, 2 * extent).
//   * The remaining axes are plain coordinates in [0, w) / [0, h).
//   * The pad offset on the packed axis is divided by 4 (p.left / 4, p.top / 4,
//     p.front / 4).
//
// `type` selects the padding mode: 0 = constant fill, 1 = replicate edge,
// 2 = mirror. The three `if (type == N)` blocks are mutually exclusive because
// `type` is a specialization constant.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // invocations outside the output extent have nothing to write
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    afpvec4 v;

    if (psc(dims) == 1)
    {
        // x is the packed axis here: valid source range is [0, w * 2)
        int x = gx - p.left / 4;

        if (type == 0)
        {
            // nvidia driver crash when using load and store pair  :(
            // copy is the workaround  --- nihui
#if NCNN_image_shader
            if (x >= 0 && x < psc(w) * 2)
            {
                image3d_cp4(top_blob, ivec3(gx, 0, 0), bottom_blob, ivec3(x, 0, 0));
            }
            else
            {
                v = afpvec4(value);
                image3d_st4(top_blob, ivec3(gx, 0, 0), v);
            }
#else
            if (x >= 0 && x < psc(w) * 2)
            {
                buffer_cp4(top_blob_data, gx, bottom_blob_data, x);
            }
            else
            {
                // nvidia driver is unhappy if we do not touch the v variable here  :<
                v = afpvec4(value);
                buffer_st4(top_blob_data, gx, v);
            }
#endif
        }
        if (type == 1)
        {
#if NCNN_image_shader
            if (x < 0)
            {
                // broadcast the very first scalar
                v = afpvec4(image3d_ld4(bottom_blob, ivec3(0, 0, 0)).r);
            }
            else if (x >= psc(w) * 2)
            {
                // broadcast the very last scalar
                v = afpvec4(image3d_ld4(bottom_blob, ivec3(psc(w) * 2 - 1, 0, 0)).a);
            }
            else
            {
                v = image3d_ld4(bottom_blob, ivec3((x / 2) * 2 + x % 2, 0, 0));
            }

            image3d_st4(top_blob, ivec3(gx, 0, 0), v);
#else
            if (x < 0)
            {
                // broadcast the very first scalar
                v = afpvec4(buffer_ld4(bottom_blob_data, 0).r);
            }
            else if (x >= psc(w) * 2)
            {
                // broadcast the very last scalar
                v = afpvec4(buffer_ld4(bottom_blob_data, psc(w) * 2 - 1).a);
            }
            else
            {
                v = buffer_ld4(bottom_blob_data, (x / 2) * 2 + x % 2);
            }

            buffer_st4(top_blob_data, gx, v);
#endif
        }
        if (type == 2)
        {
#if NCNN_image_shader
            if (x < 0)
            {
                // mirrored vector straddles two source vectors; gather and reverse
                ivec2 x01 = -x + ivec2(1, 0);
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3((x01.x / 2) * 2 + x01.x % 2, 0, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3((x01.y / 2) * 2 + x01.y % 2, 0, 0));
                v = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (x >= psc(w) * 2)
            {
                ivec2 x01 = psc(w) * 2 - x + psc(w) * 2 - 1 - ivec2(1, 0);
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3((x01.x / 2) * 2 + x01.x % 2, 0, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3((x01.y / 2) * 2 + x01.y % 2, 0, 0));
                v = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v = image3d_ld4(bottom_blob, ivec3((x / 2) * 2 + x % 2, 0, 0));
            }

            image3d_st4(top_blob, ivec3(gx, 0, 0), v);
#else
            if (x < 0)
            {
                // mirrored vector straddles two source vectors; gather and reverse
                ivec2 x01 = -x + ivec2(1, 0);
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (x01.x / 2) * 2 + x01.x % 2);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (x01.y / 2) * 2 + x01.y % 2);
                v = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (x >= psc(w) * 2)
            {
                ivec2 x01 = psc(w) * 2 - x + psc(w) * 2 - 1 - ivec2(1, 0);
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (x01.x / 2) * 2 + x01.x % 2);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (x01.y / 2) * 2 + x01.y % 2);
                v = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v = buffer_ld4(bottom_blob_data, (x / 2) * 2 + x % 2);
            }

            buffer_st4(top_blob_data, gx, v);
#endif
        }
    }
    else if (psc(dims) == 2)
    {
        // x is a plain column in [0, w); y is the packed axis in [0, h * 2)
        int x = gx - p.left;
        int y = gy - p.top / 4;

        if (type == 0)
        {
#if NCNN_image_shader
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) * 2)
            {
                v = image3d_ld4(bottom_blob, ivec3(x * 2 + y % 2, y / 2, 0));
            }
            else
            {
                v = afpvec4(value);
            }

            image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) * 2)
            {
                v = buffer_ld4(bottom_blob_data, (y / 2) * psc(w) * 2 + x * 2 + y % 2);
            }
            else
            {
                v = afpvec4(value);
            }

            const int gi = gy * psc(outw) + gx;

            buffer_st4(top_blob_data, gi, v);
#endif
        }
        if (type == 1)
        {
            // x is unpacked, so the last valid column is w - 1
            x = clamp(x, 0, psc(w) - 1);

#if NCNN_image_shader
            if (y < 0)
            {
                // broadcast the first scalar of the first row
                v = afpvec4(image3d_ld4(bottom_blob, ivec3(x * 2, 0, 0)).r);
            }
            else if (y >= psc(h) * 2)
            {
                // broadcast the last scalar of the last row (row h - 1, second half)
                v = afpvec4(image3d_ld4(bottom_blob, ivec3(x * 2 + 1, psc(h) - 1, 0)).a);
            }
            else
            {
                v = image3d_ld4(bottom_blob, ivec3(x * 2 + y % 2, y / 2, 0));
            }

            image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
            if (y < 0)
            {
                // broadcast the first scalar of the first row
                v = afpvec4(buffer_ld4(bottom_blob_data, x * 2).r);
            }
            else if (y >= psc(h) * 2)
            {
                // broadcast the last scalar of the last row (row h - 1, second half)
                v = afpvec4(buffer_ld4(bottom_blob_data, (psc(h) - 1) * psc(w) * 2 + x * 2 + 1).a);
            }
            else
            {
                v = buffer_ld4(bottom_blob_data, (y / 2) * psc(w) * 2 + x * 2 + y % 2);
            }

            const int gi = gy * psc(outw) + gx;

            buffer_st4(top_blob_data, gi, v);
#endif
        }
        if (type == 2)
        {
            // mirror x into [0, w): first around 0, then around the last column w - 1
            x = abs(x);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            // (p.w is read directly instead of psc(w) because of the note above)
            x = (p.w - 1) - abs(x - (p.w - 1));

#if NCNN_image_shader
            if (y < 0)
            {
                // mirrored vector straddles two source vectors; gather and reverse
                ivec2 y01 = -y + ivec2(1, 0);
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x * 2 + y01.x % 2, y01.x / 2, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x * 2 + y01.y % 2, y01.y / 2, 0));
                v = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (y >= psc(h) * 2)
            {
                ivec2 y01 = psc(h) * 2 - y + psc(h) * 2 - 1 - ivec2(1, 0);
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x * 2 + y01.x % 2, y01.x / 2, 0));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x * 2 + y01.y % 2, y01.y / 2, 0));
                v = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v = image3d_ld4(bottom_blob, ivec3(x * 2 + y % 2, y / 2, 0));
            }

            image3d_st4(top_blob, ivec3(gx, gy, 0), v);
#else
            if (y < 0)
            {
                // mirrored vector straddles two source vectors; gather and reverse
                ivec2 y01 = -y + ivec2(1, 0);
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (y01.x / 2) * psc(w) * 2 + x * 2 + y01.x % 2);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (y01.y / 2) * psc(w) * 2 + x * 2 + y01.y % 2);
                v = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (y >= psc(h) * 2)
            {
                ivec2 y01 = psc(h) * 2 - y + psc(h) * 2 - 1 - ivec2(1, 0);
                afpvec4 v0 = buffer_ld4(bottom_blob_data, (y01.x / 2) * psc(w) * 2 + x * 2 + y01.x % 2);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, (y01.y / 2) * psc(w) * 2 + x * 2 + y01.y % 2);
                v = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                // row stride is w * 2 vectors, same as every other load in this branch
                v = buffer_ld4(bottom_blob_data, (y / 2) * psc(w) * 2 + x * 2 + y % 2);
            }

            const int gi = gy * psc(outw) + gx;

            buffer_st4(top_blob_data, gi, v);
#endif
        }
    }
    else // if (psc(dims) == 3)
    {
        // x, y are plain coordinates in [0, w) x [0, h); z is the packed axis in [0, c * 2)
        int x = gx - p.left;
        int y = gy - p.top;
        int z = gz - p.front / 4;

        if (type == 0)
        {
#if NCNN_image_shader
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c) * 2)
            {
                v = image3d_ld4(bottom_blob, ivec3(x * 2 + z % 2, y, z / 2));
            }
            else
            {
                // fill value is either looked up per output channel or the scalar constant
                v = per_channel_pad == 1 ? image3d_ld4(per_channel_pad_blob, ivec3(gz, 0, 0)) : afpvec4(value);
            }

            image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c) * 2)
            {
                v = buffer_ld4(bottom_blob_data, ((z / 2) * psc(cstep) + y * psc(w)) * 2 + x * 2 + z % 2);
            }
            else
            {
                // fill value is either looked up per output channel or the scalar constant
                v = per_channel_pad == 1 ? buffer_ld4(per_channel_pad_blob_data, gz) : afpvec4(value);
            }

            const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

            buffer_st4(top_blob_data, gi, v);
#endif
        }
        if (type == 1)
        {
            // x and y are unpacked, so the last valid indices are w - 1 and h - 1
            x = clamp(x, 0, psc(w) - 1);
            y = clamp(y, 0, psc(h) - 1);

#if NCNN_image_shader
            if (z < 0)
            {
                // broadcast the first scalar of the first slice
                v = afpvec4(image3d_ld4(bottom_blob, ivec3(x * 2, y, 0)).r);
            }
            else if (z >= psc(c) * 2)
            {
                // broadcast the last scalar of the last slice (slice c - 1, second half)
                v = afpvec4(image3d_ld4(bottom_blob, ivec3(x * 2 + 1, y, psc(c) - 1)).a);
            }
            else
            {
                v = image3d_ld4(bottom_blob, ivec3(x * 2 + z % 2, y, z / 2));
            }

            image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
            if (z < 0)
            {
                // broadcast the first scalar of the first slice
                v = afpvec4(buffer_ld4(bottom_blob_data, y * psc(w) * 2 + x * 2).r);
            }
            else if (z >= psc(c) * 2)
            {
                // broadcast the last scalar of the last slice (slice c - 1, second half)
                v = afpvec4(buffer_ld4(bottom_blob_data, ((psc(c) - 1) * psc(cstep) + y * psc(w)) * 2 + x * 2 + 1).a);
            }
            else
            {
                // the half selector follows the packed axis z, matching the image path
                v = buffer_ld4(bottom_blob_data, ((z / 2) * psc(cstep) + y * psc(w)) * 2 + x * 2 + z % 2);
            }

            const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

            buffer_st4(top_blob_data, gi, v);
#endif
        }
        if (type == 2)
        {
            // mirror x into [0, w) and y into [0, h)
            x = abs(x);
            y = abs(y);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            // (p.w / p.h are read directly instead of psc(w) / psc(h) because of the note above)
            x = (p.w - 1) - abs(x - (p.w - 1));
            y = (p.h - 1) - abs(y - (p.h - 1));

#if NCNN_image_shader
            if (z < 0)
            {
                // mirrored vector straddles two source vectors; gather and reverse
                ivec2 z01 = -z + ivec2(1, 0);
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x * 2 + z01.x % 2, y, z01.x / 2));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x * 2 + z01.y % 2, y, z01.y / 2));
                v = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (z >= psc(c) * 2)
            {
                ivec2 z01 = psc(c) * 2 - z + psc(c) * 2 - 1 - ivec2(1, 0);
                afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x * 2 + z01.x % 2, y, z01.x / 2));
                afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x * 2 + z01.y % 2, y, z01.y / 2));
                v = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                v = image3d_ld4(bottom_blob, ivec3(x * 2 + z % 2, y, z / 2));
            }

            image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
            if (z < 0)
            {
                // mirrored source positions are derived from z, matching the image path
                ivec2 z01 = -z + ivec2(1, 0);
                afpvec4 v0 = buffer_ld4(bottom_blob_data, ((z01.x / 2) * psc(cstep) + y * psc(w)) * 2 + x * 2 + z01.x % 2);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, ((z01.y / 2) * psc(cstep) + y * psc(w)) * 2 + x * 2 + z01.y % 2);
                v = afpvec4(v1.r, v0.a, v0.b, v0.g);
            }
            else if (z >= psc(c) * 2)
            {
                ivec2 z01 = psc(c) * 2 - z + psc(c) * 2 - 1 - ivec2(1, 0);
                afpvec4 v0 = buffer_ld4(bottom_blob_data, ((z01.x / 2) * psc(cstep) + y * psc(w)) * 2 + x * 2 + z01.x % 2);
                afpvec4 v1 = buffer_ld4(bottom_blob_data, ((z01.y / 2) * psc(cstep) + y * psc(w)) * 2 + x * 2 + z01.y % 2);
                v = afpvec4(v1.b, v1.g, v1.r, v0.a);
            }
            else
            {
                // the half selector follows the packed axis z, matching the image path
                v = buffer_ld4(bottom_blob_data, ((z / 2) * psc(cstep) + y * psc(w)) * 2 + x * 2 + z % 2);
            }

            const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

            buffer_st4(top_blob_data, gi, v);
#endif
        }
    }
}
452

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.