ncnn

Форк
0
/
padding_pack4to1.comp 
374 строки · 11.2 Кб
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// 16-bit storage types are only requested when the build enables fp16 storage.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
// Explicit float16 arithmetic types are only requested when the build enables fp16 arithmetic.
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Padding mode:
//   0 = outputs that fall outside the source receive a fill value,
//   1 = source coordinates are clamped to the nearest valid index,
//   2 = source coordinates are mirrored back into the valid range.
layout (constant_id = 0) const int type = 1;
// Fill value stored by mode 0 when the per-channel table is not used.
layout (constant_id = 1) const float value = 0;
// When 1, mode 0 on the 3-D path copies the fill from per_channel_pad_blob instead of `value`.
layout (constant_id = 2) const int per_channel_pad = 0;

// Shape constants are numbered right after the three ids above.
// NOTE(review): main() reads these through psc(), which is defined outside this
// file; presumably a 0 here means "take the push-constant value" - confirm.
#define shape_constant_id_offset 3
// Source blob shape (the packed axis is stored in groups of four).
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

// Destination blob shape (one scalar per element).
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

// Resource bindings: 0 = source, 1 = destination, 2 = per-channel fill values.
#if NCNN_image_shader
// Image path: source and fill table are sampled, destination is a write-only image.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D per_channel_pad_blob;
#else
// Buffer path: with fp16 packing the source is addressed as pairs (sfpvec2),
// otherwise as individual scalars (sfp).
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer per_channel_pad_blob { sfp per_channel_pad_blob_data[]; };
#endif

// Per-dispatch parameters. Member order defines the push-constant layout, so it
// must not be rearranged.
layout (push_constant) uniform parameter
{
    // source blob shape
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // destination blob shape
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    // amounts subtracted from the output x / y / z index to get the source coordinate
    int left;
    int top;
    int front;
} p;

// Copies one scalar out of a four-wide source element into the destination.
//   src_pos       texel that holds the four-wide element (image path only)
//   src_pack      linear index of the four-wide element (buffer paths only)
//   packed_coord  unpacked coordinate along the packed axis; its remainder
//                 modulo 4 selects which of the four lanes is copied
//   dst_pos       destination texel (image path only)
//   dst_index     destination linear index (buffer paths only)
void copy_pack4_lane(ivec3 src_pos, int src_pack, int packed_coord, ivec3 dst_pos, int dst_index)
{
#if NCNN_image_shader
    afpvec4 texel = image3d_ld4(bottom_blob, src_pos);

    image3d_st1(top_blob, dst_pos, texel[packed_coord % 4]);
#else
#if NCNN_fp16_packed
    // the source is addressed in pairs: two slots per four-wide element
    int pair_offset = src_pack * 2 + (packed_coord % 4) / 2;

    afpvec2 pair = buffer_ld2(bottom_blob_data, pair_offset);

    buffer_st1(top_blob_data, dst_index, pair[packed_coord % 2]);
#else
    int scalar_offset = src_pack * 4 + packed_coord % 4;

    buffer_cp1(top_blob_data, dst_index, bottom_blob_data, scalar_offset);
#endif
#endif
}

// Writes the constant fill value to the destination element.
void store_pad_value(ivec3 dst_pos, int dst_index)
{
    afp fill = afp(value);
#if NCNN_image_shader
    image3d_st1(top_blob, dst_pos, fill);
#else
    buffer_st1(top_blob_data, dst_index, fill);
#endif
}

// Mirrors coordinate i into [0, last]: first about 0, then about last.
int reflect_coord(int i, int last)
{
    return last - abs(abs(i) - last);
}

// One invocation produces one destination scalar at (gx, gy, gz). The source
// coordinate is the output coordinate minus the left/top/front offsets; what
// happens when it falls outside the source depends on `type`. The packed axis
// is w for 1-D blobs, h for 2-D blobs and c otherwise.
void main()
{
    const int gx = int(gl_GlobalInvocationID.x);
    const int gy = int(gl_GlobalInvocationID.y);
    const int gz = int(gl_GlobalInvocationID.z);

    // invocations beyond the destination extent have nothing to write
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    if (psc(dims) == 1)
    {
        const ivec3 dst_pos = ivec3(gx, 0, 0);

        int x = gx - p.left;

        if (type == 0)
        {
            if (x < 0 || x >= psc(w) * 4)
            {
                store_pad_value(dst_pos, gx);
            }
            else
            {
                copy_pack4_lane(ivec3(x / 4, 0, 0), x / 4, x, dst_pos, gx);
            }
        }
        else if (type == 1)
        {
            x = clamp(x, 0, psc(w) * 4 - 1);

            copy_pack4_lane(ivec3(x / 4, 0, 0), x / 4, x, dst_pos, gx);
        }
        else if (type == 2)
        {
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            // (so the extent is read from p.w directly instead of psc(w))
            x = reflect_coord(x, p.w * 4 - 1);

            copy_pack4_lane(ivec3(x / 4, 0, 0), x / 4, x, dst_pos, gx);
        }
    }
    else if (psc(dims) == 2)
    {
        const ivec3 dst_pos = ivec3(gx, gy, 0);
        const int out_index = gy * psc(outw) + gx;

        int x = gx - p.left;
        int y = gy - p.top;

        if (type == 0)
        {
            const bool inside = x >= 0 && x < psc(w) && y >= 0 && y < psc(h) * 4;

            if (inside)
            {
                copy_pack4_lane(ivec3(x, y / 4, 0), (y / 4) * psc(w) + x, y, dst_pos, out_index);
            }
            else
            {
                store_pad_value(dst_pos, out_index);
            }
        }
        else if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);
            y = clamp(y, 0, psc(h) * 4 - 1);

            copy_pack4_lane(ivec3(x, y / 4, 0), (y / 4) * psc(w) + x, y, dst_pos, out_index);
        }
        else if (type == 2)
        {
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            // (so the extents are read from p.w / p.h directly instead of psc())
            x = reflect_coord(x, p.w - 1);
            y = reflect_coord(y, p.h * 4 - 1);

            copy_pack4_lane(ivec3(x, y / 4, 0), (y / 4) * psc(w) + x, y, dst_pos, out_index);
        }
    }
    else // if (psc(dims) == 3)
    {
        const ivec3 dst_pos = ivec3(gx, gy, gz);
        const int out_index = gz * psc(outcstep) + gy * psc(outw) + gx;

        int x = gx - p.left;
        int y = gy - p.top;
        int z = gz - p.front;

        if (type == 0)
        {
            const bool inside = x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c) * 4;

            if (inside)
            {
                copy_pack4_lane(ivec3(x, y, z / 4), (z / 4) * psc(cstep) + y * psc(w) + x, z, dst_pos, out_index);
            }
            else if (per_channel_pad == 1)
            {
                // fill comes from the per-channel table, indexed by output channel
#if NCNN_image_shader
                image3d_cp1(top_blob, dst_pos, per_channel_pad_blob, ivec3(gz, 0, 0));
#else
                buffer_cp1(top_blob_data, out_index, per_channel_pad_blob_data, gz);
#endif
            }
            else
            {
                store_pad_value(dst_pos, out_index);
            }
        }
        else if (type == 1)
        {
            x = clamp(x, 0, psc(w) - 1);
            y = clamp(y, 0, psc(h) - 1);
            z = clamp(z, 0, psc(c) * 4 - 1);

            copy_pack4_lane(ivec3(x, y, z / 4), (z / 4) * psc(cstep) + y * psc(w) + x, z, dst_pos, out_index);
        }
        else if (type == 2)
        {
            // NOTE psc(X) get zeros on nvidia
            // TODO only enable this workaround for some nvidia driver
            // (so the extents are read from p.w / p.h / p.c directly instead of psc())
            x = reflect_coord(x, p.w - 1);
            y = reflect_coord(y, p.h - 1);
            z = reflect_coord(z, p.c * 4 - 1);

            copy_pack4_lane(ivec3(x, y, z / 4), (z / 4) * psc(cstep) + y * psc(w) + x, z, dst_pos, out_index);
        }
    }
}

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.