ncnn

Форк
0
/
permute_pack4.comp 
451 строка · 12.0 Кб
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

15
#version 450

// The 16-bit extensions below are requested only when the matching NCNN_*
// macro evaluates to true. Those macros are not defined in this file; they are
// expected to be supplied by whatever compiles the shader.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif
23

24
// Selects which axis permutation main() applies; the valid values for each
// rank are listed in the order_type tables inside main().
layout (constant_id = 0) const int order_type = 0;
// When equal to 1 (and NCNN_fp16_arithmetic is enabled) main() picks vector
// components with explicit comparisons instead of dynamic indexing.
// NOTE(review): presumably a driver-workaround switch - confirm with the host code.
layout (constant_id = 1) const int bugihfa = 0;

// Shape constants start after the two ids used above.
#define shape_constant_id_offset 2

// Input blob shape. Every value defaults to 0; main() reads them through
// psc(), which is defined outside this file.
// NOTE(review): presumably psc() falls back to the push-constant field of the
// same name when the specialization value is 0 - confirm.
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;

// Output blob shape, same conventions as the input shape above.
layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
41

42
// Resource bindings: binding 0 is the source blob, binding 1 the destination.
// unfp, imfmtc4, sfp, sfpvec2 and sfpvec4 are type/format macros defined
// outside this file.
#if NCNN_image_shader
// Image path: the source is sampled, the destination is a write-only image.
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
// Buffer path: the source is declared with a narrower element type than the
// destination (sfpvec2 or sfp versus sfpvec4) so that main() can address the
// individual lanes of a source element.
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#endif
53

54
// Push-constant copy of the input and output shapes. The field names mirror
// the shape specialization constants declared earlier in this file, and the
// block is accessed through the instance name `p`.
layout (push_constant) uniform parameter
{
    // Input blob shape.
    int dims;
    int w;
    int h;
    int d;
    int c;
    int cstep;

    // Output blob shape.
    int outdims;
    int outw;
    int outh;
    int outd;
    int outc;
    int outcstep;
} p;
70

71
// Compute entry point of the pack4 permute shader.
//
// One invocation produces the output element at (gx, gy, gz). Each of its four
// lanes is gathered from a separately computed input position:
//
//  1. Map the output position to four unpacked input coordinates
//     (x4, y4, z4), one per output lane, according to dims and order_type.
//  2. Linearise those coordinates, split them again and derive the packed
//     storage location (texel plus lane for images, element offset plus
//     optional half-pair lane for buffers).
//  3. Load the four source values, assemble them and store the result.
//
// psc() and the image3d_* / buffer_* helpers are defined outside this file.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent do nothing. The y axis covers
    // outh * outd rows.
    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
        return;

    // Unpacked input coordinates, one component per output lane.
    ivec4 x4;
    ivec4 y4;
    ivec4 z4;

    if (psc(dims) == 2)
    {
        // order_type
        // 0 = w h
        // 1 = h w

        // 2-D blobs have no channel axis; the packed axis is y.
        gz = 0;
        z4 = ivec4(0);

        if (order_type == 0)
        {
            x4 = ivec4(gx);
            y4 = gy * 4 + ivec4(0, 1, 2, 3);
        }
        if (order_type == 1)
        {
            x4 = gy * 4 + ivec4(0, 1, 2, 3);
            y4 = ivec4(gx);
        }
    }
    else if (psc(dims) == 3)
    {
        // order_type
        // 0 = w h c
        // 1 = h w c
        // 2 = w c h
        // 3 = c w h
        // 4 = h c w
        // 5 = c h w

        // The four unpacked indices covered by packed output channel gz.
        ivec4 gz4 = gz * 4 + ivec4(0, 1, 2, 3);

        if (order_type == 0)
        {
            x4 = ivec4(gx);
            y4 = ivec4(gy);
            z4 = gz4;
        }
        if (order_type == 1)
        {
            x4 = ivec4(gy);
            y4 = ivec4(gx);
            z4 = gz4;
        }
        if (order_type == 2)
        {
            x4 = ivec4(gx);
            y4 = gz4;
            z4 = ivec4(gy);
        }
        if (order_type == 3)
        {
            x4 = ivec4(gy);
            y4 = gz4;
            z4 = ivec4(gx);
        }
        if (order_type == 4)
        {
            x4 = gz4;
            y4 = ivec4(gx);
            z4 = ivec4(gy);
        }
        if (order_type == 5)
        {
            x4 = gz4;
            y4 = ivec4(gy);
            z4 = ivec4(gx);
        }
    }
    else // if (psc(dims) == 4)
    {
        // order_type
        // 0 = w h d c
        // 1 = h w d c
        // 2 = w d h c
        // 3 = d w h c
        // 4 = h d w c
        // 5 = d h w c
        // 6 = w h c d
        // 7 = h w c d
        // 8 = w c h d
        // 9 = c w h d
        //10 = h c w d
        //11 = c h w d
        //12 = w d c h
        //13 = d w c h
        //14 = w c d h
        //15 = c w d h
        //16 = d c w h
        //17 = c d w h
        //18 = h d c w
        //19 = d h c w
        //20 = h c d w
        //21 = c h d w
        //22 = d c h w
        //23 = c d h w

        // gy enumerates outd * outh rows: split it into its outd index (yd)
        // and its outh index (yh).
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        // The four unpacked indices covered by packed output channel gz.
        ivec4 gz4 = gz * 4 + ivec4(0, 1, 2, 3);

        // In every case below the input row is formed as
        // (input d index) * h + (input h index).
        if (order_type == 0)
        {
            x4 = ivec4(gx);
            y4 = ivec4(yd * psc(h) + yh);
            z4 = gz4;
        }
        if (order_type == 1)
        {
            x4 = ivec4(yh);
            y4 = ivec4(yd * psc(h) + gx);
            z4 = gz4;
        }
        if (order_type == 2)
        {
            x4 = ivec4(gx);
            y4 = ivec4(yh * psc(h) + yd);
            z4 = gz4;
        }
        if (order_type == 3)
        {
            x4 = ivec4(yh);
            y4 = ivec4(gx * psc(h) + yd);
            z4 = gz4;
        }
        if (order_type == 4)
        {
            x4 = ivec4(yd);
            y4 = ivec4(yh * psc(h) + gx);
            z4 = gz4;
        }
        if (order_type == 5)
        {
            x4 = ivec4(yd);
            y4 = ivec4(gx * psc(h) + yh);
            z4 = gz4;
        }
        if (order_type == 6)
        {
            x4 = ivec4(gx);
            y4 = gz4 * psc(h) + yh;
            z4 = ivec4(yd);
        }
        if (order_type == 7)
        {
            x4 = ivec4(yh);
            y4 = gz4 * psc(h) + gx;
            z4 = ivec4(yd);
        }
        if (order_type == 8)
        {
            x4 = ivec4(gx);
            y4 = gz4 * psc(h) + yd;
            z4 = ivec4(yh);
        }
        if (order_type == 9)
        {
            x4 = ivec4(yh);
            y4 = gz4 * psc(h) + yd;
            z4 = ivec4(gx);
        }
        if (order_type == 10)
        {
            x4 = ivec4(yd);
            y4 = gz4 * psc(h) + gx;
            z4 = ivec4(yh);
        }
        if (order_type == 11)
        {
            x4 = ivec4(yd);
            y4 = gz4 * psc(h) + yh;
            z4 = ivec4(gx);
        }
        if (order_type == 12)
        {
            x4 = ivec4(gx);
            y4 = yh * psc(h) + gz4;
            z4 = ivec4(yd);
        }
        if (order_type == 13)
        {
            x4 = ivec4(yh);
            y4 = gx * psc(h) + gz4;
            z4 = ivec4(yd);
        }
        if (order_type == 14)
        {
            x4 = ivec4(gx);
            y4 = yd * psc(h) + gz4;
            z4 = ivec4(yh);
        }
        if (order_type == 15)
        {
            x4 = ivec4(yh);
            y4 = yd * psc(h) + gz4;
            z4 = ivec4(gx);
        }
        if (order_type == 16)
        {
            x4 = ivec4(yd);
            y4 = gx * psc(h) + gz4;
            z4 = ivec4(yh);
        }
        if (order_type == 17)
        {
            x4 = ivec4(yd);
            y4 = yh * psc(h) + gz4;
            z4 = ivec4(gx);
        }
        if (order_type == 18)
        {
            x4 = gz4;
            y4 = ivec4(yh * psc(h) + gx);
            z4 = ivec4(yd);
        }
        if (order_type == 19)
        {
            x4 = gz4;
            y4 = ivec4(gx * psc(h) + yh);
            z4 = ivec4(yd);
        }
        if (order_type == 20)
        {
            x4 = gz4;
            y4 = ivec4(yd * psc(h) + gx);
            z4 = ivec4(yh);
        }
        if (order_type == 21)
        {
            x4 = gz4;
            y4 = ivec4(yd * psc(h) + yh);
            z4 = ivec4(gx);
        }
        if (order_type == 22)
        {
            x4 = gz4;
            y4 = ivec4(gx * psc(h) + yd);
            z4 = ivec4(yh);
        }
        if (order_type == 23)
        {
            x4 = gz4;
            y4 = ivec4(yh * psc(h) + yd);
            z4 = ivec4(gx);
        }
    }

    // Number of elements in one unpacked channel plane. The linear index is
    // built with exactly the value that is used to split it again below, so
    // the split always recovers the coordinates computed above. cstep must not
    // be used here: it is a separate constant that need not equal this plane
    // size (presumably it can include padding - confirm with the host code),
    // and it is applied only when the final buffer offset is formed.
    int plane_size = psc(w) * psc(h);
    if (psc(dims) == 4)
        plane_size = plane_size * psc(d);

    ivec4 i4 = z4 * plane_size + y4 * psc(w) + x4;

#if NCNN_image_shader
    // Component of the loaded texel that holds the wanted value, per lane.
    ivec4 lane4;
#else
    // Source buffer index per output lane.
    ivec4 v_offset;
#if NCNN_fp16_packed
    // Which half of the loaded pair holds the wanted value, per lane.
    ivec4 lane2;
#endif
#endif

    if (psc(dims) == 2)
    {
        y4 = i4 / psc(w);
        x4 = i4 % psc(w);

        // The packed axis is y: four consecutive rows share one element.
#if NCNN_image_shader
        lane4 = y4 % 4;
        y4 = y4 / 4;
#else
#if NCNN_fp16_packed
        v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2;
        lane2 = y4 % 2;
#else
        v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4;
#endif
#endif
    }
    else if (psc(dims) == 3)
    {
        z4 = i4 / plane_size;
        y4 = i4 % plane_size / psc(w);
        x4 = i4 % plane_size % psc(w);

        // The packed axis is z: four consecutive channels share one element.
#if NCNN_image_shader
        lane4 = z4 % 4;
        z4 = z4 / 4;
#else
#if NCNN_fp16_packed
        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2;
        lane2 = z4 % 2;
#else
        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4;
#endif
#endif
    }
    else // if (psc(dims) == 4)
    {
        int dsize = psc(w) * psc(h);

        z4 = i4 / plane_size;
        ivec4 yd4 = i4 % plane_size / dsize;
        ivec4 yh4 = i4 % plane_size % dsize / psc(w);
        x4 = i4 % plane_size % dsize % psc(w);

        // Depth and height are folded back into a single row index.
        y4 = yd4 * psc(h) + yh4;

#if NCNN_image_shader
        lane4 = z4 % 4;
        z4 = z4 / 4;
#else
#if NCNN_fp16_packed
        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2;
        lane2 = z4 % 2;
#else
        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4;
#endif
#endif
    }

#if NCNN_image_shader
    // One texel load per output lane; lane4 then selects a single component
    // of each loaded texel.
    afpvec4 vr = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r));
    afpvec4 vg = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g));
    afpvec4 vb = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b));
    afpvec4 va = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a));

    afpvec4 v;
#if NCNN_fp16_arithmetic
    if (bugihfa == 1)
    {
        // Same selection as the indexed form below, written with explicit
        // comparisons instead of dynamic component indexing.
        if (lane4.r == 0) v.r = vr.r;
        if (lane4.r == 1) v.r = vr.g;
        if (lane4.r == 2) v.r = vr.b;
        if (lane4.r == 3) v.r = vr.a;
        if (lane4.g == 0) v.g = vg.r;
        if (lane4.g == 1) v.g = vg.g;
        if (lane4.g == 2) v.g = vg.b;
        if (lane4.g == 3) v.g = vg.a;
        if (lane4.b == 0) v.b = vb.r;
        if (lane4.b == 1) v.b = vb.g;
        if (lane4.b == 2) v.b = vb.b;
        if (lane4.b == 3) v.b = vb.a;
        if (lane4.a == 0) v.a = va.r;
        if (lane4.a == 1) v.a = va.g;
        if (lane4.a == 2) v.a = va.b;
        if (lane4.a == 3) v.a = va.a;
    }
    else
#endif
    {
        v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]);
    }

    image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v);
#else
    // Index of the destination element in the output buffer.
    int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

#if NCNN_fp16_packed
    // One half-pair load per output lane; lane2 selects which half to keep.
    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

    afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);

    buffer_st4(top_blob_data, gi, v);
#else
    // NOTE(review): buffer_cp1to4 is defined elsewhere; presumably it copies
    // the four scalars at v_offset into the vec4 at gi - confirm.
    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
#endif
}
452

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.