ncnn

Форк
0
/
shufflechannel_pack8.comp 
169 строк · 6.1 Кб
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#version 450
16

17
// Optional half-precision support, selected by build-time macros.
// When NCNN_fp16_storage is set, the 16-bit storage extension is required and
// sfpvec8 is declared here as two f16vec4 halves (abcd, efgh).
// NOTE(review): when NCNN_fp16_storage is not set, sfpvec8 is not declared in
// this file; presumably the shader build step supplies it - confirm.
#if NCNN_fp16_storage
18
#extension GL_EXT_shader_16bit_storage: require
19
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
20
#endif
21
// When NCNN_fp16_arithmetic is set, explicit float16 arithmetic types are required.
#if NCNN_fp16_arithmetic
22
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
23
#endif
24

25
// Specialization constants.
// id 0: group   - divisor/modulus used by main() to compute the source channel.
// id 1: bugihfa - when 1 (and NCNN_fp16_arithmetic is set), main() assembles the
//                 output one component at a time through if-chains instead of a
//                 single constructor with dynamic component indexing.
layout (constant_id = 0) const int group = 0;
26
layout (constant_id = 1) const int bugihfa = 0;
27

28
// Shape constants start right after the two operator constants above.
// NOTE(review): main() reads these through psc(), which is defined outside this
// file; presumably it chooses between these constants and the push-constant
// block of the same names - confirm.
#define shape_constant_id_offset 2
29
// Input blob shape.
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
30
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
31
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
32
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
33
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
34

35
// Output blob shape.
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
36
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
37
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
38
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
39
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
40

41
// Input (binding 0) and output (binding 1) resources.
// Image path: input is a 3D sampler, output is a write-only 3D image.
// `unfp` and `imfmtc4` are not defined in this file.
#if NCNN_image_shader
42
layout (binding = 0) uniform unfp sampler3D bottom_blob;
43
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
44
#else
45
// Buffer path: read-only input and write-only output arrays of sfpvec8 elements.
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
46
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
47
#endif
48

49
// Push-constant block, instance name `p`. Its members repeat the names of the
// specialization constants: input shape, output shape, then group.
// NOTE(review): main() never names `p` directly; presumably psc() reads from it
// when the matching specialization constant is unset - confirm.
layout (push_constant) uniform parameter
50
{
51
    // Input blob shape.
    int dims;
52
    int w;
53
    int h;
54
    int c;
55
    int cstep;
56

57
    // Output blob shape.
    int outdims;
58
    int outw;
59
    int outh;
60
    int outc;
61
    int outcstep;
62

63
    int group;
64
} p;
65

66
// Channel shuffle for 8-packed blobs.
//
// Each invocation writes one 8-wide output element at (gx, gy, gz). The eight
// output scalar channels are q = gz * 8 + k, k = 0..7. For each q the source
// scalar channel is
//     (q % group) * channels_g + (q / group),  channels_g = c * 8 / group
// and that source channel is split into
//     pack index      = src / 8        (which 8-wide input element to load)
//     half within it  = (src % 8) / 4  (first index into the afpvec8)
//     lane within it  = src % 4        (component of that half)
// One input element is loaded per output channel (vr/vg/vb/va for k = 0..3,
// vvr/vvg/vvb/vva for k = 4..7) and one scalar is picked from each.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent have nothing to write.
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    const int channels_g = psc(c) * 8 / psc(group);

    // Output scalar channels covered by this invocation: low four and high four.
    ivec4 gz4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3);
    ivec4 gzz4 = gz4 + 4;

    // Source scalar channel for each output scalar channel.
    ivec4 z4 = (gz4 % psc(group)) * channels_g + (gz4 / psc(group));
    ivec4 zz4 = (gzz4 % psc(group)) * channels_g + (gzz4 / psc(group));

#if NCNN_image_shader
    // Pack index of each source channel along the image z axis.
    ivec4 sz8 = z4 / 8;
    ivec4 szz8 = zz4 / 8;

    afpvec8 vr = image3d_ld8(bottom_blob, ivec3(gx, gy, sz8.r));
    afpvec8 vg = image3d_ld8(bottom_blob, ivec3(gx, gy, sz8.g));
    afpvec8 vb = image3d_ld8(bottom_blob, ivec3(gx, gy, sz8.b));
    afpvec8 va = image3d_ld8(bottom_blob, ivec3(gx, gy, sz8.a));

    afpvec8 vvr = image3d_ld8(bottom_blob, ivec3(gx, gy, szz8.r));
    afpvec8 vvg = image3d_ld8(bottom_blob, ivec3(gx, gy, szz8.g));
    afpvec8 vvb = image3d_ld8(bottom_blob, ivec3(gx, gy, szz8.b));
    afpvec8 vva = image3d_ld8(bottom_blob, ivec3(gx, gy, szz8.a));
#else
    // Element offset of each source pack at (gx, gy) in the input buffer.
    ivec4 v_offset = (z4 / 8) * psc(cstep) + gy * psc(w) + gx;
    ivec4 vv_offset = (zz4 / 8) * psc(cstep) + gy * psc(w) + gx;

    afpvec8 vr = buffer_ld8(bottom_blob_data, v_offset.r);
    afpvec8 vg = buffer_ld8(bottom_blob_data, v_offset.g);
    afpvec8 vb = buffer_ld8(bottom_blob_data, v_offset.b);
    afpvec8 va = buffer_ld8(bottom_blob_data, v_offset.a);

    afpvec8 vvr = buffer_ld8(bottom_blob_data, vv_offset.r);
    afpvec8 vvg = buffer_ld8(bottom_blob_data, vv_offset.g);
    afpvec8 vvb = buffer_ld8(bottom_blob_data, vv_offset.b);
    afpvec8 vva = buffer_ld8(bottom_blob_data, vv_offset.a);
#endif

    // Which vec4 half of the loaded element holds each source channel.
    ivec4 sz4 = (z4 % 8) / 4;
    ivec4 szz4 = (zz4 % 8) / 4;

    // Which component of that half holds each source channel.
    ivec4 lane4 = z4 % 4;
    ivec4 lane8 = zz4 % 4;

    afpvec8 v;
#if NCNN_fp16_arithmetic
    if (bugihfa == 1)
    {
        // Workaround path: select each component with explicit swizzles rather
        // than a dynamic component index. Every output component must read from
        // the element loaded for that same component (vr/vg/vb/va and
        // vvr/vvg/vvb/vva), exactly as the generic path below does.
        if (lane4.r == 0) v[0].r = vr[sz4.r].r;
        if (lane4.r == 1) v[0].r = vr[sz4.r].g;
        if (lane4.r == 2) v[0].r = vr[sz4.r].b;
        if (lane4.r == 3) v[0].r = vr[sz4.r].a;
        if (lane4.g == 0) v[0].g = vg[sz4.g].r;
        if (lane4.g == 1) v[0].g = vg[sz4.g].g;
        if (lane4.g == 2) v[0].g = vg[sz4.g].b;
        if (lane4.g == 3) v[0].g = vg[sz4.g].a;
        if (lane4.b == 0) v[0].b = vb[sz4.b].r;
        if (lane4.b == 1) v[0].b = vb[sz4.b].g;
        if (lane4.b == 2) v[0].b = vb[sz4.b].b;
        if (lane4.b == 3) v[0].b = vb[sz4.b].a;
        if (lane4.a == 0) v[0].a = va[sz4.a].r;
        if (lane4.a == 1) v[0].a = va[sz4.a].g;
        if (lane4.a == 2) v[0].a = va[sz4.a].b;
        if (lane4.a == 3) v[0].a = va[sz4.a].a;
        if (lane8.r == 0) v[1].r = vvr[szz4.r].r;
        if (lane8.r == 1) v[1].r = vvr[szz4.r].g;
        if (lane8.r == 2) v[1].r = vvr[szz4.r].b;
        if (lane8.r == 3) v[1].r = vvr[szz4.r].a;
        if (lane8.g == 0) v[1].g = vvg[szz4.g].r;
        if (lane8.g == 1) v[1].g = vvg[szz4.g].g;
        if (lane8.g == 2) v[1].g = vvg[szz4.g].b;
        if (lane8.g == 3) v[1].g = vvg[szz4.g].a;
        if (lane8.b == 0) v[1].b = vvb[szz4.b].r;
        if (lane8.b == 1) v[1].b = vvb[szz4.b].g;
        if (lane8.b == 2) v[1].b = vvb[szz4.b].b;
        if (lane8.b == 3) v[1].b = vvb[szz4.b].a;
        if (lane8.a == 0) v[1].a = vva[szz4.a].r;
        if (lane8.a == 1) v[1].a = vva[szz4.a].g;
        if (lane8.a == 2) v[1].a = vva[szz4.a].b;
        if (lane8.a == 3) v[1].a = vva[szz4.a].a;
    }
    else
#endif
    {
        // Generic path: pick half and lane with dynamic indexing.
        v = afpvec8(vr[sz4.r][lane4.r], vg[sz4.g][lane4.g], vb[sz4.b][lane4.b], va[sz4.a][lane4.a], vvr[szz4.r][lane8.r], vvg[szz4.g][lane8.g], vvb[szz4.b][lane8.b], vva[szz4.a][lane8.a]);
    }

#if NCNN_image_shader
    image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
    // Element offset of the output pack at (gx, gy, gz).
    int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

    buffer_st8(top_blob_data, gi, v);
#endif
}
170

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.