ncnn

Форк
0
/
padding_pack1to8.comp 
406 строк · 16.7 Кб
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
// 8-wide fp16 storage element, held as two 4-wide halves (abcd, efgh).
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Padding behaviour, fixed at pipeline-creation time.
// type selects the branch taken in main(): 0, 1 or 2.
layout (constant_id = 0) const int type = 1;
// Fill value written to out-of-range lanes when type == 0.
layout (constant_id = 1) const float value = 0;
// When 1, the 3-D type == 0 path reads its fill value from per_channel_pad_blob
// instead of using `value`.
layout (constant_id = 2) const int per_channel_pad = 0;

// Input and output shapes as specialization constants; all default to 0.
// NOTE(review): main() reads these through psc(), which is defined outside this
// file - presumably it falls back to the push constant of the same name when the
// specialization constant is 0; confirm against the shader build environment.
#define shape_constant_id_offset 3
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

// Resources: binding 0 is the input that main() reads one scalar at a time,
// binding 1 is the output written as 8-wide elements, binding 2 holds the
// per-channel fill values read 8 at a time when per_channel_pad == 1.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D per_channel_pad_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec8 per_channel_pad_blob_data[]; };
#endif

// Per-dispatch parameters.
layout (push_constant) uniform parameter
{
    // Input shape; mirrors the specialization constants of the same names.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // Output shape; mirrors the specialization constants of the same names.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    // Offsets that main() subtracts from the output x / y / z coordinates to
    // obtain the corresponding input coordinates.
    int left;
    int top;
    int front;
} p;

// Pads an input blob of scalar elements and writes the result as 8-wide
// output elements.
//
// Each invocation produces the single output element at (gx, gy, gz). Its
// eight lanes are gathered from eight consecutive scalar input positions
// along the packed axis: x when dims == 1, y when dims == 2, z otherwise.
// p.left / p.top / p.front are subtracted from the output coordinates to get
// the input coordinates.
//
// `type` selects how out-of-range input coordinates are handled:
//   0 - lanes outside the input take `value` (or, in the 3-D case with
//       per_channel_pad == 1, the value read from per_channel_pad_blob)
//   1 - coordinates are clamped to [0, size - 1]
//   2 - coordinates are mirrored about 0 and about size - 1
//
// psc(), the afp*/sfp* types and the image3d_*/buffer_* helpers are not
// defined in this file; they must be supplied by the shader build environment.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent have nothing to write.
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    if (psc(dims) == 1)
    {
        // Input x coordinates of the eight lanes: x4 holds lanes 0-3, xx4 lanes 4-7.
        ivec4 x4 = gx * 8 - p.left + ivec4(0, 1, 2, 3);
        ivec4 xx4 = x4 + 4;

        if (type == 0)
        {
            // Per-lane "coordinate lies inside [0, w)" flags.
            bvec4 mask = bvec4(uvec4(greaterThanEqual(x4, ivec4(0))) & uvec4(lessThan(x4, ivec4(psc(w)))));
            bvec4 mask2 = bvec4(uvec4(greaterThanEqual(xx4, ivec4(0))) & uvec4(lessThan(xx4, ivec4(psc(w)))));

#if NCNN_image_shader
            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));
            v[1].r = image3d_ld1(bottom_blob, ivec3(xx4.r, 0, 0));
            v[1].g = image3d_ld1(bottom_blob, ivec3(xx4.g, 0, 0));
            v[1].b = image3d_ld1(bottom_blob, ivec3(xx4.b, 0, 0));
            v[1].a = image3d_ld1(bottom_blob, ivec3(xx4.a, 0, 0));

            // Replace out-of-range lanes with the fill value.
            v[0] = mix(afpvec4(value), v[0], mask);
            v[1] = mix(afpvec4(value), v[1], mask2);

            image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
            afpvec8 v;
            // buffer_ld1 x4/xx4 index on vec returns zero on radv driver  :(
            // this is an inefficient workaround  --- nihui
            if (x4.r < 0 && xx4.a >= 0)
            {
                // The chunk straddles the left edge of the input, so each lane is
                // loaded with an explicit scalar guard. Both bounds are checked:
                // when the input is shorter than the chunk, the right edge falls
                // inside it as well, and those lanes must take the fill value
                // instead of reading past the end of the input.
                const int in_w = psc(w);

                v[0].r = (x4.r >= 0 && x4.r < in_w) ? buffer_ld1(bottom_blob_data, x4.r) : afp(value);
                v[0].g = (x4.g >= 0 && x4.g < in_w) ? buffer_ld1(bottom_blob_data, x4.g) : afp(value);
                v[0].b = (x4.b >= 0 && x4.b < in_w) ? buffer_ld1(bottom_blob_data, x4.b) : afp(value);
                v[0].a = (x4.a >= 0 && x4.a < in_w) ? buffer_ld1(bottom_blob_data, x4.a) : afp(value);
                v[1].r = (xx4.r >= 0 && xx4.r < in_w) ? buffer_ld1(bottom_blob_data, xx4.r) : afp(value);
                v[1].g = (xx4.g >= 0 && xx4.g < in_w) ? buffer_ld1(bottom_blob_data, xx4.g) : afp(value);
                v[1].b = (xx4.b >= 0 && xx4.b < in_w) ? buffer_ld1(bottom_blob_data, xx4.b) : afp(value);
                v[1].a = (xx4.a >= 0 && xx4.a < in_w) ? buffer_ld1(bottom_blob_data, xx4.a) : afp(value);
            }
            else
            {
                v[0].r = buffer_ld1(bottom_blob_data, x4.r);
                v[0].g = buffer_ld1(bottom_blob_data, x4.g);
                v[0].b = buffer_ld1(bottom_blob_data, x4.b);
                v[0].a = buffer_ld1(bottom_blob_data, x4.a);
                v[1].r = buffer_ld1(bottom_blob_data, xx4.r);
                v[1].g = buffer_ld1(bottom_blob_data, xx4.g);
                v[1].b = buffer_ld1(bottom_blob_data, xx4.b);
                v[1].a = buffer_ld1(bottom_blob_data, xx4.a);

                // Replace out-of-range lanes with the fill value.
                v[0] = mix(afpvec4(value), v[0], mask);
                v[1] = mix(afpvec4(value), v[1], mask2);
            }

            buffer_st8(top_blob_data, gx, v);
#endif
        }
        if (type == 1)
        {
            // Clamp every lane's coordinate into [0, w - 1].
            x4 = clamp(x4, 0, psc(w) - 1);
            xx4 = clamp(xx4, 0, psc(w) - 1);

#if NCNN_image_shader
            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));
            v[1].r = image3d_ld1(bottom_blob, ivec3(xx4.r, 0, 0));
            v[1].g = image3d_ld1(bottom_blob, ivec3(xx4.g, 0, 0));
            v[1].b = image3d_ld1(bottom_blob, ivec3(xx4.b, 0, 0));
            v[1].a = image3d_ld1(bottom_blob, ivec3(xx4.a, 0, 0));

            image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
            buffer_cp1to8(top_blob_data, gx, bottom_blob_data, x4, xx4);
#endif
        }
        if (type == 2)
        {
            // Mirror about 0, then about w - 1.
            x4 = abs(x4);
            xx4 = abs(xx4);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            x4 = (p.w - 1) - abs(x4 - (p.w - 1));
            xx4 = (p.w - 1) - abs(xx4 - (p.w - 1));
//             x4 = (psc(w) - 1) - abs(x4 - (psc(w) - 1));
//             xx4 = (psc(w) - 1) - abs(xx4 - (psc(w) - 1));

#if NCNN_image_shader
            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x4.r, 0, 0));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x4.g, 0, 0));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x4.b, 0, 0));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x4.a, 0, 0));
            v[1].r = image3d_ld1(bottom_blob, ivec3(xx4.r, 0, 0));
            v[1].g = image3d_ld1(bottom_blob, ivec3(xx4.g, 0, 0));
            v[1].b = image3d_ld1(bottom_blob, ivec3(xx4.b, 0, 0));
            v[1].a = image3d_ld1(bottom_blob, ivec3(xx4.a, 0, 0));

            image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
            buffer_cp1to8(top_blob_data, gx, bottom_blob_data, x4, xx4);
#endif
        }
    }
    else if (psc(dims) == 2)
    {
        // Linear index of the output element in the buffer path.
        const int gi = gy * psc(outw) + gx;

        // Input coordinates: x is shared by all lanes, y4 / yy4 hold the y
        // coordinates of lanes 0-3 / 4-7.
        int x = gx - p.left;
        ivec4 y4 = gy * 8 - p.top + ivec4(0, 1, 2, 3);
        ivec4 yy4 = y4 + 4;

        if (type == 0)
        {
            // Per-lane "x inside [0, w) and y inside [0, h)" flags.
            bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w)) & (uvec4(greaterThanEqual(y4, ivec4(0))) & uvec4(lessThan(y4, ivec4(psc(h))))));
            bvec4 mask2 = bvec4(uvec4(x >= 0 && x < psc(w)) & (uvec4(greaterThanEqual(yy4, ivec4(0))) & uvec4(lessThan(yy4, ivec4(psc(h))))));

#if NCNN_image_shader
            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));
            v[1].r = image3d_ld1(bottom_blob, ivec3(x, yy4.r, 0));
            v[1].g = image3d_ld1(bottom_blob, ivec3(x, yy4.g, 0));
            v[1].b = image3d_ld1(bottom_blob, ivec3(x, yy4.b, 0));
            v[1].a = image3d_ld1(bottom_blob, ivec3(x, yy4.a, 0));

            // Replace out-of-range lanes with the fill value.
            v[0] = mix(afpvec4(value), v[0], mask);
            v[1] = mix(afpvec4(value), v[1], mask2);

            image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
            // Linear input offsets of the eight lanes (row stride is w).
            ivec4 v_offset = y4 * psc(w) + x;
            ivec4 v_offset2 = yy4 * psc(w) + x;

            afpvec8 v;
            v[0].r = buffer_ld1(bottom_blob_data, v_offset.r);
            v[0].g = buffer_ld1(bottom_blob_data, v_offset.g);
            v[0].b = buffer_ld1(bottom_blob_data, v_offset.b);
            v[0].a = buffer_ld1(bottom_blob_data, v_offset.a);
            v[1].r = buffer_ld1(bottom_blob_data, v_offset2.r);
            v[1].g = buffer_ld1(bottom_blob_data, v_offset2.g);
            v[1].b = buffer_ld1(bottom_blob_data, v_offset2.b);
            v[1].a = buffer_ld1(bottom_blob_data, v_offset2.a);

            // Replace out-of-range lanes with the fill value.
            v[0] = mix(afpvec4(value), v[0], mask);
            v[1] = mix(afpvec4(value), v[1], mask2);

            buffer_st8(top_blob_data, gi, v);
#endif
        }
        if (type == 1)
        {
            // Clamp the coordinates into the input extent.
            x = clamp(x, 0, psc(w) - 1);
            y4 = clamp(y4, 0, psc(h) - 1);
            yy4 = clamp(yy4, 0, psc(h) - 1);

#if NCNN_image_shader
            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));
            v[1].r = image3d_ld1(bottom_blob, ivec3(x, yy4.r, 0));
            v[1].g = image3d_ld1(bottom_blob, ivec3(x, yy4.g, 0));
            v[1].b = image3d_ld1(bottom_blob, ivec3(x, yy4.b, 0));
            v[1].a = image3d_ld1(bottom_blob, ivec3(x, yy4.a, 0));

            image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
            ivec4 v_offset = y4 * psc(w) + x;
            ivec4 v_offset2 = yy4 * psc(w) + x;
            buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, v_offset2);
#endif
        }
        if (type == 2)
        {
            // Mirror about 0, then about size - 1.
            x = abs(x);
            y4 = abs(y4);
            yy4 = abs(yy4);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            x = (p.w - 1) - abs(x - (p.w - 1));
            y4 = (p.h - 1) - abs(y4 - (p.h - 1));
            yy4 = (p.h - 1) - abs(yy4 - (p.h - 1));
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
//             y4 = (psc(h) - 1) - abs(y4 - (psc(h) - 1));
//             yy4 = (psc(h) - 1) - abs(yy4 - (psc(h) - 1));

#if NCNN_image_shader
            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x, y4.r, 0));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x, y4.g, 0));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x, y4.b, 0));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x, y4.a, 0));
            v[1].r = image3d_ld1(bottom_blob, ivec3(x, yy4.r, 0));
            v[1].g = image3d_ld1(bottom_blob, ivec3(x, yy4.g, 0));
            v[1].b = image3d_ld1(bottom_blob, ivec3(x, yy4.b, 0));
            v[1].a = image3d_ld1(bottom_blob, ivec3(x, yy4.a, 0));

            image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
            ivec4 v_offset = y4 * psc(w) + x;
            ivec4 v_offset2 = yy4 * psc(w) + x;
            buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, v_offset2);
#endif
        }
    }
    else // if (psc(dims) == 3)
    {
        // Linear index of the output element in the buffer path.
        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

        // Input coordinates: x and y are shared by all lanes, z4 / zz4 hold the
        // z coordinates of lanes 0-3 / 4-7.
        int x = gx - p.left;
        int y = gy - p.top;
        ivec4 z4 = gz * 8 - p.front + ivec4(0, 1, 2, 3);
        ivec4 zz4 = z4 + 4;

        if (type == 0)
        {
            // Per-lane "x, y and z all inside the input extent" flags.
            bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) & (uvec4(greaterThanEqual(z4, ivec4(0))) & uvec4(lessThan(z4, ivec4(psc(c))))));
            bvec4 mask2 = bvec4(uvec4(x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) & (uvec4(greaterThanEqual(zz4, ivec4(0))) & uvec4(lessThan(zz4, ivec4(psc(c))))));

#if NCNN_image_shader
            // Fill value: per-channel element gz when enabled, otherwise `value` in every lane.
            afpvec8 pad_value = per_channel_pad == 1 ? image3d_ld8(per_channel_pad_blob, ivec3(gz, 0, 0)) : afpvec8(afpvec4(value), afpvec4(value));

            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));
            v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, zz4.r));
            v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, zz4.g));
            v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, zz4.b));
            v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, zz4.a));

            // Replace out-of-range lanes with the fill value.
            v[0] = mix(pad_value[0], v[0], mask);
            v[1] = mix(pad_value[1], v[1], mask2);

            image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
            // Fill value: per-channel element gz when enabled, otherwise `value` in every lane.
            afpvec8 pad_value = per_channel_pad == 1 ? buffer_ld8(per_channel_pad_blob_data, gz) : afpvec8(afpvec4(value), afpvec4(value));

            // Linear input offsets of the eight lanes (channel stride cstep, row stride w).
            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
            ivec4 v_offset2 = zz4 * psc(cstep) + y * psc(w) + x;

            afpvec8 v;
            v[0].r = buffer_ld1(bottom_blob_data, v_offset.r);
            v[0].g = buffer_ld1(bottom_blob_data, v_offset.g);
            v[0].b = buffer_ld1(bottom_blob_data, v_offset.b);
            v[0].a = buffer_ld1(bottom_blob_data, v_offset.a);
            v[1].r = buffer_ld1(bottom_blob_data, v_offset2.r);
            v[1].g = buffer_ld1(bottom_blob_data, v_offset2.g);
            v[1].b = buffer_ld1(bottom_blob_data, v_offset2.b);
            v[1].a = buffer_ld1(bottom_blob_data, v_offset2.a);

            // Replace out-of-range lanes with the fill value.
            v[0] = mix(pad_value[0], v[0], mask);
            v[1] = mix(pad_value[1], v[1], mask2);

            buffer_st8(top_blob_data, gi, v);
#endif
        }
        if (type == 1)
        {
            // Clamp the coordinates into the input extent.
            x = clamp(x, 0, psc(w) - 1);
            y = clamp(y, 0, psc(h) - 1);
            z4 = clamp(z4, 0, psc(c) - 1);
            zz4 = clamp(zz4, 0, psc(c) - 1);

#if NCNN_image_shader
            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));
            v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, zz4.r));
            v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, zz4.g));
            v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, zz4.b));
            v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, zz4.a));

            image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
            ivec4 v_offset2 = zz4 * psc(cstep) + y * psc(w) + x;
            buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, v_offset2);
#endif
        }
        if (type == 2)
        {
            // Mirror about 0, then about size - 1.
            x = abs(x);
            y = abs(y);
            z4 = abs(z4);
            zz4 = abs(zz4);
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            x = (p.w - 1) - abs(x - (p.w - 1));
            y = (p.h - 1) - abs(y - (p.h - 1));
            z4 = (p.c - 1) - abs(z4 - (p.c - 1));
            zz4 = (p.c - 1) - abs(zz4 - (p.c - 1));
//             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
//             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
//             z4 = (psc(c) - 1) - abs(z4 - (psc(c) - 1));
//             zz4 = (psc(c) - 1) - abs(zz4 - (psc(c) - 1));

#if NCNN_image_shader
            afpvec8 v;
            v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z4.r));
            v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z4.g));
            v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z4.b));
            v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z4.a));
            v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, zz4.r));
            v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, zz4.g));
            v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, zz4.b));
            v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, zz4.a));

            image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
            ivec4 v_offset2 = zz4 * psc(cstep) + y * psc(w) + x;
            buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, v_offset2);
#endif
        }
    }
}

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.