ncnn

Форк
0
/
permute_pack4to8.comp 
572 строки · 15.5 Кб
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
// Shader preamble: GLSL version plus the optional 16-bit extensions, each
// enabled only when the matching NCNN_* macro evaluates to true.
#version 450
16

17
// 16-bit storage path: require the storage extension and declare the 8-lane
// storage struct as two f16vec4 halves (abcd = first four values, efgh = last four).
#if NCNN_fp16_storage
18
#extension GL_EXT_shader_16bit_storage: require
19
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
20
#endif
21
// 16-bit arithmetic path: require explicit float16 arithmetic types.
#if NCNN_fp16_arithmetic
22
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
23
#endif
24

25
// Specialization constants. All default to 0.
// order_type selects which axis permutation main() applies (see the
// order_type tables in main()).
layout (constant_id = 0) const int order_type = 0;
26
// NOTE(review): bugihfa is not referenced anywhere in the visible shader body;
// its purpose cannot be determined from this file.
layout (constant_id = 1) const int bugihfa = 0;
27

28
// Shape constants start at constant_id 2, right after the two constants above.
#define shape_constant_id_offset 2
29
// Input blob shape: dimension count, width, height, depth, channel count and
// the per-channel stride used when flattening coordinates in main().
// NOTE(review): presumably psc() prefers these over the push-constant copies
// when they are set - confirm against the psc macro definition.
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
30
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
31
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
32
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
33
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
34
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;
35

36
// Output blob shape, same field order as the input shape above.
layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
37
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
38
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
39
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
40
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
41
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
42

43
// Resource bindings. Binding 0 is the input, binding 1 is the write-only output.
#if NCNN_image_shader
44
// Image path: input is sampled as a 3D texture, output is a 3D storage image.
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
45
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
46
#else
47
// Buffer path: the input is addressed as 2-component elements when
// NCNN_fp16_packed is set, otherwise as single scalars.
#if NCNN_fp16_packed
48
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
49
#else
50
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
51
#endif
52
// Output buffer holds one 8-lane element per array slot.
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
53
#endif
54

55
// Push-constant block carrying the same twelve shape fields that are also
// declared as specialization constants; main() reads shapes through psc().
// NOTE(review): presumably psc() falls back to these p.* values when the
// matching specialization constant is left at 0 - confirm against the macro.
layout (push_constant) uniform parameter
56
{
57
    // Input blob shape.
    int dims;
58
    int w;
59
    int h;
60
    int d;
61
    int c;
62
    int cstep;
63

64
    // Output blob shape.
    int outdims;
65
    int outw;
66
    int outh;
67
    int outd;
68
    int outc;
69
    int outcstep;
70
} p;
71

72
// Permute kernel entry point. Each invocation produces the single 8-lane output
// element at (gx, gy, gz).
//
// Stage 1: for the selected order_type, compute the permuted coordinates of the
//          eight scalars to gather. x4/y4/z4 describe lanes 0-3 and
//          xx4/yy4/zz4 describe lanes 4-7; exactly one axis is expanded as
//          base * 8 + {0..3} (and +4 for the second half), the others are uniform.
// Stage 2: flatten those coordinates to linear indices (i4, ii4), then split them
//          back into input coordinates and derive the 4-packed input location
//          (element offset plus lane inside the element).
// Stage 3: load the eight scalars and store them as one 8-lane output element.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // gy spans outh * outd rows so 4-D outputs fit in a 3-D dispatch.
    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
        return;

    ivec4 x4;
    ivec4 xx4;
    ivec4 y4;
    ivec4 yy4;
    ivec4 z4;
    ivec4 zz4;

    if (psc(dims) == 2)
    {
        // order_type
        // 0 = w h
        // 1 = h w

        // 2-D blobs have no z axis: force every z coordinate to 0.
        gz = 0;
        z4 = ivec4(0);
        zz4 = z4;

        if (order_type == 0)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = gy * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
        }
        if (order_type == 1)
        {
            x4 = gy * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(gx);
            yy4 = y4;
        }
    }
    else if (psc(dims) == 3)
    {
        // order_type
        // 0 = w h c
        // 1 = h w c
        // 2 = w c h
        // 3 = c w h
        // 4 = h c w
        // 5 = c h w

        if (order_type == 0)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = ivec4(gy);
            yy4 = y4;
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
            zz4 = z4 + 4;
        }
        if (order_type == 1)
        {
            x4 = ivec4(gy);
            xx4 = x4;
            y4 = ivec4(gx);
            yy4 = y4;
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
            zz4 = z4 + 4;
        }
        if (order_type == 2)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = gz * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
            z4 = ivec4(gy);
            // Lanes 4-7 differ from lanes 0-3 only along y, so their z must
            // mirror z4 (previously this copied y4 and read the wrong plane).
            zz4 = z4;
        }
        if (order_type == 3)
        {
            x4 = ivec4(gy);
            xx4 = x4;
            y4 = gz * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
            z4 = ivec4(gx);
            zz4 = z4;
        }
        if (order_type == 4)
        {
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(gx);
            yy4 = y4;
            z4 = ivec4(gy);
            zz4 = z4;
        }
        if (order_type == 5)
        {
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(gy);
            yy4 = y4;
            z4 = ivec4(gx);
            zz4 = z4;
        }
    }
    else // if (psc(dims) == 4)
    {
        // order_type
        // 0 = w h d c
        // 1 = h w d c
        // 2 = w d h c
        // 3 = d w h c
        // 4 = h d w c
        // 5 = d h w c
        // 6 = w h c d
        // 7 = h w c d
        // 8 = w c h d
        // 9 = c w h d
        //10 = h c w d
        //11 = c h w d
        //12 = w d c h
        //13 = d w c h
        //14 = w c d h
        //15 = c w d h
        //16 = d c w h
        //17 = c d w h
        //18 = h d c w
        //19 = d h c w
        //20 = h c d w
        //21 = c h d w
        //22 = d c h w
        //23 = c d h w

        // Split the combined output row index into its depth and height parts.
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        if (order_type == 0)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = ivec4(yd * psc(h) + yh);
            yy4 = y4;
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
            zz4 = z4 + 4;
        }
        if (order_type == 1)
        {
            x4 = ivec4(yh);
            xx4 = x4;
            y4 = ivec4(yd * psc(h) + gx);
            yy4 = y4;
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
            zz4 = z4 + 4;
        }
        if (order_type == 2)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = ivec4(yh * psc(h) + yd);
            yy4 = y4;
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
            zz4 = z4 + 4;
        }
        if (order_type == 3)
        {
            x4 = ivec4(yh);
            xx4 = x4;
            y4 = ivec4(gx * psc(h) + yd);
            yy4 = y4;
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
            zz4 = z4 + 4;
        }
        if (order_type == 4)
        {
            x4 = ivec4(yd);
            xx4 = x4;
            y4 = ivec4(yh * psc(h) + gx);
            yy4 = y4;
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
            zz4 = z4 + 4;
        }
        if (order_type == 5)
        {
            x4 = ivec4(yd);
            xx4 = x4;
            y4 = ivec4(gx * psc(h) + yh);
            yy4 = y4;
            z4 = gz * 8 + ivec4(0, 1, 2, 3);
            zz4 = z4 + 4;
        }
        if (order_type == 6)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + yh;
            yy4 = y4 + 4 * psc(h);
            z4 = ivec4(yd);
            zz4 = z4;
        }
        if (order_type == 7)
        {
            x4 = ivec4(yh);
            xx4 = x4;
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + gx;
            yy4 = y4 + 4 * psc(h);
            z4 = ivec4(yd);
            zz4 = z4;
        }
        if (order_type == 8)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + yd;
            yy4 = y4 + 4 * psc(h);
            z4 = ivec4(yh);
            zz4 = z4;
        }
        if (order_type == 9)
        {
            x4 = ivec4(yh);
            xx4 = x4;
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + yd;
            yy4 = y4 + 4 * psc(h);
            z4 = ivec4(gx);
            zz4 = z4;
        }
        if (order_type == 10)
        {
            x4 = ivec4(yd);
            xx4 = x4;
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + gx;
            yy4 = y4 + 4 * psc(h);
            z4 = ivec4(yh);
            zz4 = z4;
        }
        if (order_type == 11)
        {
            x4 = ivec4(yd);
            xx4 = x4;
            y4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(h) + yh;
            yy4 = y4 + 4 * psc(h);
            z4 = ivec4(gx);
            zz4 = z4;
        }
        if (order_type == 12)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = yh * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
            z4 = ivec4(yd);
            zz4 = z4;
        }
        if (order_type == 13)
        {
            x4 = ivec4(yh);
            xx4 = x4;
            y4 = gx * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
            z4 = ivec4(yd);
            zz4 = z4;
        }
        if (order_type == 14)
        {
            x4 = ivec4(gx);
            xx4 = x4;
            y4 = yd * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
            z4 = ivec4(yh);
            zz4 = z4;
        }
        if (order_type == 15)
        {
            x4 = ivec4(yh);
            xx4 = x4;
            y4 = yd * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
            z4 = ivec4(gx);
            zz4 = z4;
        }
        if (order_type == 16)
        {
            x4 = ivec4(yd);
            xx4 = x4;
            y4 = gx * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
            z4 = ivec4(yh);
            zz4 = z4;
        }
        if (order_type == 17)
        {
            x4 = ivec4(yd);
            xx4 = x4;
            y4 = yh * psc(h) + gz * 8 + ivec4(0, 1, 2, 3);
            yy4 = y4 + 4;
            z4 = ivec4(gx);
            zz4 = z4;
        }
        if (order_type == 18)
        {
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(yh * psc(h) + gx);
            yy4 = y4;
            z4 = ivec4(yd);
            zz4 = z4;
        }
        if (order_type == 19)
        {
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(gx * psc(h) + yh);
            yy4 = y4;
            z4 = ivec4(yd);
            zz4 = z4;
        }
        if (order_type == 20)
        {
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(yd * psc(h) + gx);
            yy4 = y4;
            z4 = ivec4(yh);
            zz4 = z4;
        }
        if (order_type == 21)
        {
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(yd * psc(h) + yh);
            yy4 = y4;
            z4 = ivec4(gx);
            zz4 = z4;
        }
        if (order_type == 22)
        {
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(gx * psc(h) + yd);
            yy4 = y4;
            z4 = ivec4(yh);
            zz4 = z4;
        }
        if (order_type == 23)
        {
            x4 = gz * 8 + ivec4(0, 1, 2, 3);
            xx4 = x4 + 4;
            y4 = ivec4(yh * psc(h) + yd);
            yy4 = y4;
            z4 = ivec4(gx);
            zz4 = z4;
        }
    }

    // Flatten the permuted coordinates to one linear index per lane. The image
    // path uses a dense w*h*d plane size; the buffer path uses cstep.
#if NCNN_image_shader
    ivec4 i4 = z4 * psc(w) * psc(h) * psc(d) + y4 * psc(w) + x4;
    ivec4 ii4 = zz4 * psc(w) * psc(h) * psc(d) + yy4 * psc(w) + xx4;
    ivec4 lane4;
    ivec4 lane4_1;
#else
    ivec4 i4 = z4 * psc(cstep) + y4 * psc(w) + x4;
    ivec4 ii4 = zz4 * psc(cstep) + yy4 * psc(w) + xx4;
    ivec4 v_offset;
    ivec4 vv_offset;
#if NCNN_fp16_packed
    // lane2 selects the component for lanes 0-3, lane4 for lanes 4-7.
    ivec4 lane2;
    ivec4 lane4;
#endif
#endif

    // Split the linear indices back into input coordinates, then locate each
    // scalar inside the 4-packed input: packed index = coord / 4, lane = coord % 4.
    if (psc(dims) == 2)
    {
        y4 = i4 / psc(w);
        x4 = i4 % psc(w);
        yy4 = ii4 / psc(w);
        xx4 = ii4 % psc(w);

#if NCNN_image_shader
        lane4 = y4 % 4;
        lane4_1 = yy4 % 4;
        y4 = y4 / 4;
        yy4 = yy4 / 4;
#else
#if NCNN_fp16_packed
        // Two 2-component elements per packed group: pick the pair, then the half.
        v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2;
        lane2 = y4 % 2;
        vv_offset = ((yy4 / 4) * psc(w) + xx4) * 2 + (yy4 % 4) / 2;
        lane4 = yy4 % 2;
#else
        v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4;
        vv_offset = ((yy4 / 4) * psc(w) + xx4) * 4 + yy4 % 4;
#endif
#endif
    }
    else if (psc(dims) == 3)
    {
        int size = psc(w) * psc(h);

        z4 = i4 / size;
        y4 = i4 % size / psc(w);
        x4 = i4 % size % psc(w);
        zz4 = ii4 / size;
        yy4 = ii4 % size / psc(w);
        xx4 = ii4 % size % psc(w);

#if NCNN_image_shader
        lane4 = z4 % 4;
        lane4_1 = zz4 % 4;
        z4 = z4 / 4;
        zz4 = zz4 / 4;
#else
#if NCNN_fp16_packed
        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2;
        lane2 = z4 % 2;
        vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 2 + (zz4 % 4) / 2;
        lane4 = zz4 % 2;
#else
        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4;
        vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + zz4 % 4;
#endif
#endif
    }
    else // if (psc(dims) == 4)
    {
        int size = psc(w) * psc(h) * psc(d);
        int dsize = psc(w) * psc(h);

        z4 = i4 / size;
        ivec4 yd4 = i4 % size / dsize;
        ivec4 yh4 = i4 % size % dsize / psc(w);
        x4 = i4 % size % psc(w);
        zz4 = ii4 / size;
        ivec4 yyd4 = ii4 % size / dsize;
        ivec4 yyh4 = ii4 % size % dsize / psc(w);
        xx4 = ii4 % size % psc(w);

        // Recombine depth and height into the single row index used for addressing.
        y4 = yd4 * psc(h) + yh4;
        yy4 = yyd4 * psc(h) + yyh4;

#if NCNN_image_shader
        lane4 = z4 % 4;
        lane4_1 = zz4 % 4;
        z4 = z4 / 4;
        zz4 = zz4 / 4;
#else
#if NCNN_fp16_packed
        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2;
        lane2 = z4 % 2;
        vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 2 + (zz4 % 4) / 2;
        lane4 = zz4 % 2;
#else
        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4;
        vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + zz4 % 4;
#endif
#endif
    }

#if NCNN_image_shader
    // Fetch the eight 4-wide texels, then keep one component from each.
    afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r));
    afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g));
    afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b));
    afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a));
    afpvec4 v4 = image3d_ld4(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r));
    afpvec4 v5 = image3d_ld4(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g));
    afpvec4 v6 = image3d_ld4(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b));
    afpvec4 v7 = image3d_ld4(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a));

    afpvec8 v;
    v[0].r = v0[lane4.r];
    v[0].g = v1[lane4.g];
    v[0].b = v2[lane4.b];
    v[0].a = v3[lane4.a];
    v[1].r = v4[lane4_1.r];
    v[1].g = v5[lane4_1.g];
    v[1].b = v6[lane4_1.b];
    v[1].a = v7[lane4_1.a];

    image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v);
#else
    // Linear index of the output element in the destination buffer.
    int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

#if NCNN_fp16_packed
    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

    afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
    afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
    afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
    afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

    afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

    buffer_st8(top_blob_data, gi, v);
#else
    buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
}
573

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.