ncnn

Форк
0
/
permute_pack1to4.comp 
339 строк · 9.2 Кб
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#version 450
16

17
#if NCNN_fp16_storage
18
#extension GL_EXT_shader_16bit_storage: require
19
#endif
20
#if NCNN_fp16_arithmetic
21
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
22
#endif
23

24
// Specialization constants.
//
// order_type selects which axis-permutation case main() applies; the
// meaning of each value is listed in the per-dims tables inside main().
layout (constant_id = 0) const int order_type = 0;
25
// Not referenced anywhere in this shader.
// NOTE(review): presumably a shared driver-workaround flag that keeps the
// constant-id layout uniform across shaders - confirm against the host code.
layout (constant_id = 1) const int bugihfa = 0;
26

27
// The shape constants start at id 2, right after the two constants above.
#define shape_constant_id_offset 2
28
// Input blob shape. main() reads these through psc(), which is defined outside
// this file.
// NOTE(review): presumably 0 means "not known at pipeline creation" and
// psc() then falls back to the push constant of the same name - confirm.
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
29
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
30
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
31
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
32
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
33
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;
34

35
// Output blob shape, same field order as the input shape above.
layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
36
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
37
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
38
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
39
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
40
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
41

42
// Resource bindings: binding 0 is the source blob, binding 1 the destination.
// unfp, imfmtc4, sfp and sfpvec4 are not defined in this file.
// NOTE(review): presumably precision/format macros injected by the host
// before compilation - confirm.
#if NCNN_image_shader
43
// Image path: the source is sampled as a 3D texture.
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
44
// Image path: the destination is a write-only 3D image.
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
45
#else
46
// Buffer path: the source is a read-only array of sfp elements.
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
47
// Buffer path: the destination is a write-only array of sfpvec4 elements.
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
48
#endif
49

50
// Push-constant block carrying the same shape fields, under the same names,
// as the shape specialization constants declared earlier in this file.
// main() never reads p.* directly; it only uses psc(name).
// NOTE(review): psc() is defined outside this file and presumably selects
// between the specialization constant and p.name - confirm.
layout (push_constant) uniform parameter
51
{
52
    // Input blob shape.
    int dims;
53
    int w;
54
    int h;
55
    int d;
56
    int c;
57
    int cstep;
58

59
    // Output blob shape.
    int outdims;
60
    int outw;
61
    int outh;
62
    int outd;
63
    int outc;
64
    int outcstep;
65
} p;
66

67
// Shader entry point.
//
// Each invocation produces the single output element at (x, y, z) =
// gl_GlobalInvocationID. It works out the input coordinates of four source
// scalars (one per lane), with the axis permutation chosen by order_type,
// and hands them to the image or buffer copy path.
// NOTE(review): the file name suggests a "pack 1 -> pack 4" permute, i.e. four
// consecutive indices along one output axis are folded into one 4-lane
// element - confirm against the host-side layer.
void main()
{
    // Output coordinate handled by this invocation.
    const ivec3 gid = ivec3(gl_GlobalInvocationID);
    const int ox = gid.x;
    const int oy = gid.y;
    int oz = gid.z;

    // Invocations outside the output extent have nothing to do. The y axis
    // spans outh * outd rows.
    const bool inside = ox < psc(outw) && oy < psc(outh) * psc(outd) && oz < psc(outc);
    if (!inside)
        return;

    // Per-lane offsets used whenever an output index expands to four
    // consecutive source indices.
    const ivec4 lane = ivec4(0, 1, 2, 3);

    // Source coordinates, one lane per gathered scalar.
    ivec4 sx;
    ivec4 sy;
    ivec4 sz;

    if (psc(dims) == 2)
    {
        // order_type
        // 0 = w h
        // 1 = h w

        // z is forced to 0 for both the source lookups and the destination.
        oz = 0;
        sz = ivec4(0);

        // The output row index expands to four consecutive source indices.
        const ivec4 quad = oy * 4 + lane;

        if (order_type == 0)
        {
            sx = ivec4(ox);
            sy = quad;
        }
        else if (order_type == 1)
        {
            sx = quad;
            sy = ivec4(ox);
        }
    }
    else if (psc(dims) == 3)
    {
        // order_type
        // 0 = w h c
        // 1 = h w c
        // 2 = w c h
        // 3 = c w h
        // 4 = h c w
        // 5 = c h w

        const ivec4 dst_x = ivec4(ox);
        const ivec4 dst_y = ivec4(oy);
        // The output z index expands to four consecutive source indices.
        const ivec4 quad = oz * 4 + lane;

        if (order_type == 0)
        {
            sx = dst_x;
            sy = dst_y;
            sz = quad;
        }
        else if (order_type == 1)
        {
            sx = dst_y;
            sy = dst_x;
            sz = quad;
        }
        else if (order_type == 2)
        {
            sx = dst_x;
            sy = quad;
            sz = dst_y;
        }
        else if (order_type == 3)
        {
            sx = dst_y;
            sy = quad;
            sz = dst_x;
        }
        else if (order_type == 4)
        {
            sx = quad;
            sy = dst_x;
            sz = dst_y;
        }
        else if (order_type == 5)
        {
            sx = quad;
            sy = dst_y;
            sz = dst_x;
        }
    }
    else // if (psc(dims) == 4)
    {
        // order_type
        // 0 = w h d c
        // 1 = h w d c
        // 2 = w d h c
        // 3 = d w h c
        // 4 = h d w c
        // 5 = d h w c
        // 6 = w h c d
        // 7 = h w c d
        // 8 = w c h d
        // 9 = c w h d
        //10 = h c w d
        //11 = c h w d
        //12 = w d c h
        //13 = d w c h
        //14 = w c d h
        //15 = c w d h
        //16 = d c w h
        //17 = c d w h
        //18 = h d c w
        //19 = d h c w
        //20 = h c d w
        //21 = c h d w
        //22 = d c h w
        //23 = c d h w

        // The output y axis carries two indices: oy = od * outh + oh.
        const int od = oy / psc(outh);
        const int oh = oy % psc(outh);

        // The four output-side indices, in the order (x, h, d, c) that the
        // table columns above are read in. Only the last one is four-wide.
        const ivec4 dst_x = ivec4(ox);
        const ivec4 dst_h = ivec4(oh);
        const ivec4 dst_d = ivec4(od);
        const ivec4 dst_c4 = oz * 4 + lane;

        // Source index along each input axis. In every case the four
        // assignments appear in the same order as that case's table row.
        ivec4 src_w;
        ivec4 src_h;
        ivec4 src_d;
        ivec4 src_c;

        if (order_type == 0)
        {
            src_w = dst_x;
            src_h = dst_h;
            src_d = dst_d;
            src_c = dst_c4;
        }
        else if (order_type == 1)
        {
            src_h = dst_x;
            src_w = dst_h;
            src_d = dst_d;
            src_c = dst_c4;
        }
        else if (order_type == 2)
        {
            src_w = dst_x;
            src_d = dst_h;
            src_h = dst_d;
            src_c = dst_c4;
        }
        else if (order_type == 3)
        {
            src_d = dst_x;
            src_w = dst_h;
            src_h = dst_d;
            src_c = dst_c4;
        }
        else if (order_type == 4)
        {
            src_h = dst_x;
            src_d = dst_h;
            src_w = dst_d;
            src_c = dst_c4;
        }
        else if (order_type == 5)
        {
            src_d = dst_x;
            src_h = dst_h;
            src_w = dst_d;
            src_c = dst_c4;
        }
        else if (order_type == 6)
        {
            src_w = dst_x;
            src_h = dst_h;
            src_c = dst_d;
            src_d = dst_c4;
        }
        else if (order_type == 7)
        {
            src_h = dst_x;
            src_w = dst_h;
            src_c = dst_d;
            src_d = dst_c4;
        }
        else if (order_type == 8)
        {
            src_w = dst_x;
            src_c = dst_h;
            src_h = dst_d;
            src_d = dst_c4;
        }
        else if (order_type == 9)
        {
            src_c = dst_x;
            src_w = dst_h;
            src_h = dst_d;
            src_d = dst_c4;
        }
        else if (order_type == 10)
        {
            src_h = dst_x;
            src_c = dst_h;
            src_w = dst_d;
            src_d = dst_c4;
        }
        else if (order_type == 11)
        {
            src_c = dst_x;
            src_h = dst_h;
            src_w = dst_d;
            src_d = dst_c4;
        }
        else if (order_type == 12)
        {
            src_w = dst_x;
            src_d = dst_h;
            src_c = dst_d;
            src_h = dst_c4;
        }
        else if (order_type == 13)
        {
            src_d = dst_x;
            src_w = dst_h;
            src_c = dst_d;
            src_h = dst_c4;
        }
        else if (order_type == 14)
        {
            src_w = dst_x;
            src_c = dst_h;
            src_d = dst_d;
            src_h = dst_c4;
        }
        else if (order_type == 15)
        {
            src_c = dst_x;
            src_w = dst_h;
            src_d = dst_d;
            src_h = dst_c4;
        }
        else if (order_type == 16)
        {
            src_d = dst_x;
            src_c = dst_h;
            src_w = dst_d;
            src_h = dst_c4;
        }
        else if (order_type == 17)
        {
            src_c = dst_x;
            src_d = dst_h;
            src_w = dst_d;
            src_h = dst_c4;
        }
        else if (order_type == 18)
        {
            src_h = dst_x;
            src_d = dst_h;
            src_c = dst_d;
            src_w = dst_c4;
        }
        else if (order_type == 19)
        {
            src_d = dst_x;
            src_h = dst_h;
            src_c = dst_d;
            src_w = dst_c4;
        }
        else if (order_type == 20)
        {
            src_h = dst_x;
            src_c = dst_h;
            src_d = dst_d;
            src_w = dst_c4;
        }
        else if (order_type == 21)
        {
            src_c = dst_x;
            src_h = dst_h;
            src_d = dst_d;
            src_w = dst_c4;
        }
        else if (order_type == 22)
        {
            src_d = dst_x;
            src_c = dst_h;
            src_h = dst_d;
            src_w = dst_c4;
        }
        else if (order_type == 23)
        {
            src_c = dst_x;
            src_d = dst_h;
            src_h = dst_d;
            src_w = dst_c4;
        }

        // The source y coordinate combines the d and h indices as d * h + h.
        sx = src_w;
        sy = src_d * psc(h) + src_h;
        sz = src_c;
    }

#if NCNN_image_shader
    // Fetch the four source values one by one and store them as a single
    // 4-component value at the output coordinate.
    afpvec4 v;
    v.x = image3d_ld1(bottom_blob_3d, ivec3(sx.x, sy.x, sz.x));
    v.y = image3d_ld1(bottom_blob_3d, ivec3(sx.y, sy.y, sz.y));
    v.z = image3d_ld1(bottom_blob_3d, ivec3(sx.z, sy.z, sz.z));
    v.w = image3d_ld1(bottom_blob_3d, ivec3(sx.w, sy.w, sz.w));

    image3d_st4(top_blob_3d, ivec3(ox, oy, oz), v);
#else
    // Linear offsets of the four source elements in the input buffer.
    ivec4 src_offset = sz * psc(cstep) + sy * psc(w) + sx;

    // Linear index of the destination element in the output buffer.
    const int dst_index = oz * psc(outcstep) + oy * psc(outw) + ox;

    // buffer_cp1to4 is defined outside this file.
    // NOTE(review): presumably it copies the four addressed source scalars
    // into one sfpvec4 - confirm.
    buffer_cp1to4(top_blob_data, dst_index, bottom_blob_data, src_offset);
#endif
}
340

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.