ncnn

permute_pack8.comp
583 строки · 15.8 Кб
Перенос по словам
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#version 450
16

17
#if NCNN_fp16_storage
18
#extension GL_EXT_shader_16bit_storage: require
19
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
20
#endif
21
#if NCNN_fp16_arithmetic
22
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
23
#endif
24

25
layout (constant_id = 0) const int order_type = 0;
26
layout (constant_id = 1) const int bugihfa = 0;
27

28
#define shape_constant_id_offset 2
29
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
30
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
31
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
32
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
33
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
34
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;
35

36
layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
37
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
38
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
39
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
40
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
41
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
42

43
#if NCNN_image_shader
44
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
45
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
46
#else
47
#if NCNN_fp16_packed
48
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
49
#else
50
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
51
#endif
52
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
53
#endif
54

55
layout (push_constant) uniform parameter
56
{
57
    int dims;
58
    int w;
59
    int h;
60
    int d;
61
    int c;
62
    int cstep;
63

64
    int outdims;
65
    int outw;
66
    int outh;
67
    int outd;
68
    int outc;
69
    int outcstep;
70
} p;
71

72
void main()
73
{
74
    int gx = int(gl_GlobalInvocationID.x);
75
    int gy = int(gl_GlobalInvocationID.y);
76
    int gz = int(gl_GlobalInvocationID.z);
77

78
    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
79
        return;
80

81
    ivec4 x4;
82
    ivec4 xx4;
83
    ivec4 y4;
84
    ivec4 yy4;
85
    ivec4 z4;
86
    ivec4 zz4;
87

88
    if (psc(dims) == 2)
89
    {
90
        // order_type
91
        // 0 = w h
92
        // 1 = h w
93

94
        gz = 0;
95
        z4 = ivec4(0);
96
        zz4 = z4;
97

98
        if (order_type == 0)
99
        {
100
            x4 = ivec4(gx);
101
            xx4 = x4;
102
            y4 = gy * 8 + ivec4(0, 1, 2, 3);
103
            yy4 = y4 + 4;
104
        }
105
        if (order_type == 1)
106
        {
107
            x4 = gy * 8 + ivec4(0, 1, 2, 3);
108
            xx4 = x4 + 4;
109
            y4 = ivec4(gx);
110
            yy4 = y4;
111
        }
112
    }
113
    else if (psc(dims) == 3)
114
    {
115
        // order_type
116
        // 0 = w h c
117
        // 1 = h w c
118
        // 2 = w c h
119
        // 3 = c w h
120
        // 4 = h c w
121
        // 5 = c h w
122

123
        if (order_type == 0)
124
        {
125
            x4 = ivec4(gx);
126
            xx4 = x4;
127
            y4 = ivec4(gy);
128
            yy4 = y4;
129
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
130
            zz4 = z4 + 4;
131
        }
132
        if (order_type == 1)
133
        {
134
            x4 = ivec4(gy);
135
            xx4 = x4;
136
            y4 = ivec4(gx);
137
            yy4 = y4;
138
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
139
            zz4 = z4 + 4;
140
        }
141
        if (order_type == 2)
142
        {
143
            x4 = ivec4(gx);
144
            xx4 = x4;
145
            y4 = gz * 8 + ivec4(0, 1, 2, 3);
146
            yy4 = y4 + 4;
147
            z4 = ivec4(gy);
148
            zz4 = z4;
149
        }
150
        if (order_type == 3)
151
        {
152
            x4 = ivec4(gy);
153
            xx4 = x4;
154
            y4 = gz * 8 + ivec4(0, 1, 2, 3);
155
            yy4 = y4 + 4;
156
            z4 = ivec4(gx);
157
            zz4 = z4;
158
        }
159
        if (order_type == 4)
160
        {
161
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
162
            xx4 = x4 + 4;
163
            y4 = ivec4(gx);
164
            yy4 = y4;
165
            z4 = ivec4(gy);
166
            zz4 = z4;
167
        }
168
        if (order_type == 5)
169
        {
170
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
171
            xx4 = x4 + 4;
172
            y4 = ivec4(gy);
173
            yy4 = y4;
174
            z4 = ivec4(gx);
175
            zz4 = z4;
176
        }
177
    }
178
    else // if (psc(dims) == 4)
179
    {
180
        // order_type
181
        // 0 = w h d c
182
        // 1 = h w d c
183
        // 2 = w d h c
184
        // 3 = d w h c
185
        // 4 = h d w c
186
        // 5 = d h w c
187
        // 6 = w h c d
188
        // 7 = h w c d
189
        // 8 = w c h d
190
        // 9 = c w h d
191
        //10 = h c w d
192
        //11 = c h w d
193
        //12 = w d c h
194
        //13 = d w c h
195
        //14 = w c d h
196
        //15 = c w d h
197
        //16 = d c w h
198
        //17 = c d w h
199
        //18 = h d c w
200
        //19 = d h c w
201
        //20 = h c d w
202
        //21 = c h d w
203
        //22 = d c h w
204
        //23 = c d h w
205

206
        int yd = gy / psc(outh);
207
        int yh = gy % psc(outh);
208

209
        if (order_type == 0)
210
        {
211
            x4 = ivec4(gx);
212
            xx4 = x4;
213
            y4 = ivec4(yd * psc(h) + yh);
214
            yy4 = y4;
215
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
216
            zz4 = z4 + 4;
217
        }
218
        if (order_type == 1)
219
        {
220
            x4 = ivec4(yh);
221
            xx4 = x4;
222
            y4 = ivec4(yd * psc(h) + gx);
223
            yy4 = y4;
224
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
225
            zz4 = z4 + 4;
226
        }
227
        if (order_type == 2)
228
        {
229
            x4 = ivec4(gx);
230
            xx4 = x4;
231
            y4 = ivec4(yh * psc(h) + yd);
232
            yy4 = y4;
233
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
234
            zz4 = z4 + 4;
235
        }
236
        if (order_type == 3)
237
        {
238
            x4 = ivec4(yh);
239
            xx4 = x4;
240
            y4 = ivec4(gx * psc(h) + yd);
241
            yy4 = y4;
242
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
243
            zz4 = z4 + 4;
244
        }
245
        if (order_type == 4)
246
        {
247
            x4 = ivec4(yd);
248
            xx4 = x4;
249
            y4 = ivec4(yh * psc(h) + gx);
250
            yy4 = y4;
251
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
252
            zz4 = z4 + 4;
253
        }
254
        if (order_type == 5)
255
        {
256
            x4 = ivec4(yd);
257
            xx4 = x4;
258
            y4 = ivec4(gx * psc(h) + yh);
259
            yy4 = y4;
260
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
261
            zz4 = z4 + 4;
262
        }
263
        if (order_type == 6)
264
        {
265
            x4 = ivec4(gx);
266
            xx4 = x4;
267
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + yh;
268
            yy4 = y4 + 4 * psc(h);
269
            z4 = ivec4(yd);
270
            zz4 = z4;
271
        }
272
        if (order_type == 7)
273
        {
274
            x4 = ivec4(yh);
275
            xx4 = x4;
276
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + gx;
277
            yy4 = y4 + 4 * psc(h);
278
            z4 = ivec4(yd);
279
            zz4 = z4;
280
        }
281
        if (order_type == 8)
282
        {
283
            x4 = ivec4(gx);
284
            xx4 = x4;
285
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + yd;
286
            yy4 = y4 + 4 * psc(h);
287
            z4 = ivec4(yh);
288
            zz4 = z4;
289
        }
290
        if (order_type == 9)
291
        {
292
            x4 = ivec4(yh);
293
            xx4 = x4;
294
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + yd;
295
            yy4 = y4 + 4 * psc(h);
296
            z4 = ivec4(gx);
297
            zz4 = z4;
298
        }
299
        if (order_type == 10)
300
        {
301
            x4 = ivec4(yd);
302
            xx4 = x4;
303
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + gx;
304
            yy4 = y4 + 4 * psc(h);
305
            z4 = ivec4(yh);
306
            zz4 = z4;
307
        }
308
        if (order_type == 11)
309
        {
310
            x4 = ivec4(yd);
311
            xx4 = x4;
312
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + yh;
313
            yy4 = y4 + 4 * psc(h);
314
            z4 = ivec4(gx);
315
            zz4 = z4;
316
        }
317
        if (order_type == 12)
318
        {
319
            x4 = ivec4(gx);
320
            xx4 = x4;
321
            y4 = yh * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
322
            yy4 = y4 + 4;
323
            z4 = ivec4(yd);
324
            zz4 = z4;
325
        }
326
        if (order_type == 13)
327
        {
328
            x4 = ivec4(yh);
329
            xx4 = x4;
330
            y4 = gx * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
331
            yy4 = y4 + 4;
332
            z4 = ivec4(yd);
333
            zz4 = z4;
334
        }
335
        if (order_type == 14)
336
        {
337
            x4 = ivec4(gx);
338
            xx4 = x4;
339
            y4 = yd * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
340
            yy4 = y4 + 4;
341
            z4 = ivec4(yh);
342
            zz4 = z4;
343
        }
344
        if (order_type == 15)
345
        {
346
            x4 = ivec4(yh);
347
            xx4 = x4;
348
            y4 = yd * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
349
            yy4 = y4 + 4;
350
            z4 = ivec4(gx);
351
            zz4 = z4;
352
        }
353
        if (order_type == 16)
354
        {
355
            x4 = ivec4(yd);
356
            xx4 = x4;
357
            y4 = gx * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
358
            yy4 = y4 + 4;
359
            z4 = ivec4(yh);
360
            zz4 = z4;
361
        }
362
        if (order_type == 17)
363
        {
364
            x4 = ivec4(yd);
365
            xx4 = x4;
366
            y4 = yh * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
367
            yy4 = y4 + 4;
368
            z4 = ivec4(gx);
369
            zz4 = z4;
370
        }
371
        if (order_type == 18)
372
        {
373
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
374
            xx4 = x4 + 4;
375
            y4 = ivec4(yh * psc(h) + gx);
376
            yy4 = y4;
377
            z4 = ivec4(yd);
378
            zz4 = z4;
379
        }
380
        if (order_type == 19)
381
        {
382
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
383
            xx4 = x4 + 4;
384
            y4 = ivec4(gx * psc(h) + yh);
385
            yy4 = y4;
386
            z4 = ivec4(yd);
387
            zz4 = z4;
388
        }
389
        if (order_type == 20)
390
        {
391
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
392
            xx4 = x4 + 4;
393
            y4 = ivec4(yd * psc(h) + gx);
394
            yy4 = y4;
395
            z4 = ivec4(yh);
396
            zz4 = z4;
397
        }
398
        if (order_type == 21)
399
        {
400
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
401
            xx4 = x4 + 4;
402
            y4 = ivec4(yd * psc(h) + yh);
403
            yy4 = y4;
404
            z4 = ivec4(gx);
405
            zz4 = z4;
406
        }
407
        if (order_type == 22)
408
        {
409
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
410
            xx4 = x4 + 4;
411
            y4 = ivec4(gx * psc(h) + yd);
412
            yy4 = y4;
413
            z4 = ivec4(yh);
414
            zz4 = z4;
415
        }
416
        if (order_type == 23)
417
        {
418
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
419
            xx4 = x4 + 4;
420
            y4 = ivec4(yh * psc(h) + yd);
421
            yy4 = y4;
422
            z4 = ivec4(gx);
423
            zz4 = z4;
424
        }
425
    }
426

427
#if NCNN_image_shader
428
    ivec4 i4 = z4 * psc(w) * psc(h) * psc(d) + y4 * psc(w) + x4;
429
    ivec4 ii4 = zz4 * psc(w) * psc(h) * psc(d) + yy4 * psc(w) + xx4;
430
    ivec4 lane2;
431
    ivec4 lane2_1;
432
    ivec4 lane4;
433
    ivec4 lane4_1;
434
#else
435
    ivec4 i4 = z4 * psc(cstep) + y4 * psc(w) + x4;
436
    ivec4 ii4 = zz4 * psc(cstep) + yy4 * psc(w) + xx4;
437
    ivec4 v_offset;
438
    ivec4 vv_offset;
439
#if NCNN_fp16_packed
440
    ivec4 lane2;
441
    ivec4 lane4;
442
#endif
443
#endif
444

445
    if (psc(dims) == 2)
446
    {
447
        y4 = i4 / psc(w);
448
        x4 = i4 % psc(w);
449
        yy4 = ii4 / psc(w);
450
        xx4 = ii4 % psc(w);
451

452
#if NCNN_image_shader
453
        lane2 = (y4 % 8) / 4;
454
        lane2_1 = (yy4 % 8) / 4;
455
        lane4 = y4 % 4;
456
        lane4_1 = yy4 % 4;
457

458
        y4 = y4 / 8;
459
        yy4 = yy4 / 8;
460
#else
461
#if NCNN_fp16_packed
462
        v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2;
463
        lane2 = y4 % 2;
464
        vv_offset = ((yy4 / 8) * psc(w) + xx4) * 4 + (yy4 % 8) / 2;
465
        lane4 = yy4 % 2;
466
#else
467
        v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8;
468
        vv_offset = ((yy4 / 8) * psc(w) + xx4) * 8 + yy4 % 8;
469
#endif
470
#endif
471
    }
472
    else if (psc(dims) == 3)
473
    {
474
        int size = psc(w) * psc(h);
475

476
        z4 = i4 / size;
477
        y4 = i4 % size / psc(w);
478
        x4 = i4 % size % psc(w);
479
        zz4 = ii4 / size;
480
        yy4 = ii4 % size / psc(w);
481
        xx4 = ii4 % size % psc(w);
482

483
#if NCNN_image_shader
484
        lane2 = (z4 % 8) / 4;
485
        lane2_1 = (zz4 % 8) / 4;
486
        lane4 = z4 % 4;
487
        lane4_1 = zz4 % 4;
488

489
        z4 = z4 / 8;
490
        zz4 = zz4 / 8;
491
#else
492
#if NCNN_fp16_packed
493
        v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2;
494
        lane2 = z4 % 2;
495
        vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + (zz4 % 8) / 2;
496
        lane4 = zz4 % 2;
497
#else
498
        v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8;
499
        vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 8 + zz4 % 8;
500
#endif
501
#endif
502
    }
503
    else // if (psc(dims) == 4)
504
    {
505
        int size = psc(w) * psc(h) * psc(d);
506
        int dsize = psc(w) * psc(h);
507

508
        z4 = i4 / size;
509
        ivec4 yd4 = i4 % size / dsize;
510
        ivec4 yh4 = i4 % size % dsize / psc(w);
511
        x4 = i4 % size % psc(w);
512
        zz4 = ii4 / size;
513
        ivec4 yyd4 = ii4 % size / dsize;
514
        ivec4 yyh4 = ii4 % size % dsize / psc(w);
515
        xx4 = ii4 % size % psc(w);
516

517
        y4 = yd4 * psc(h) + yh4;
518
        yy4 = yyd4 * psc(h) + yyh4;
519

520
#if NCNN_image_shader
521
        lane2 = (z4 % 8) / 4;
522
        lane2_1 = (zz4 % 8) / 4;
523
        lane4 = z4 % 4;
524
        lane4_1 = zz4 % 4;
525

526
        z4 = z4 / 8;
527
        zz4 = zz4 / 8;
528
#else
529
#if NCNN_fp16_packed
530
        v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2;
531
        lane2 = z4 % 2;
532
        vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + (zz4 % 8) / 2;
533
        lane4 = zz4 % 2;
534
#else
535
        v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8;
536
        vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 8 + zz4 % 8;
537
#endif
538
#endif
539
    }
540

541
#if NCNN_image_shader
542
    afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r));
543
    afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g));
544
    afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b));
545
    afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a));
546
    afpvec8 v4 = image3d_ld8(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r));
547
    afpvec8 v5 = image3d_ld8(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g));
548
    afpvec8 v6 = image3d_ld8(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b));
549
    afpvec8 v7 = image3d_ld8(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a));
550

551
    afpvec8 v;
552
    v[0].r = v0[lane2.r][lane4.r];
553
    v[0].g = v1[lane2.g][lane4.g];
554
    v[0].b = v2[lane2.b][lane4.b];
555
    v[0].a = v3[lane2.a][lane4.a];
556
    v[1].r = v4[lane2_1.r][lane4_1.r];
557
    v[1].g = v5[lane2_1.g][lane4_1.g];
558
    v[1].b = v6[lane2_1.b][lane4_1.b];
559
    v[1].a = v7[lane2_1.a][lane4_1.a];
560

561
    image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v);
562
#else
563
    int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
564

565
#if NCNN_fp16_packed
566
    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
567
    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
568
    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
569
    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
570

571
    afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
572
    afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
573
    afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
574
    afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);
575

576
    afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);
577

578
    buffer_st8(top_blob_data, gi, v);
579
#else
580
    buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
581
#endif
582
#endif
583
}
584
ncnn

Использование cookies