ncnn

Форк
0
/
interp_bicubic_pack8.comp 
205 строк · 8.1 Кб
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// When 16-bit storage is enabled, the storage type for a packed group of
// eight values is declared here as two f16vec4 halves.
// NOTE(review): no sfpvec8 is declared in this file for the other case;
// presumably it is supplied from outside the file - confirm.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
// 16-bit arithmetic additionally requires the explicit float16 types extension.
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constants for the shapes of the input blob (dims, w, h, c,
// cstep) and of the output blob (outdims, outw, outh, outc, outcstep).
// Their ids are consecutive, starting at shape_constant_id_offset, and every
// one of them defaults to 0.
// NOTE(review): presumably a value of 0 means "not fixed at pipeline creation"
// and psc() then reads the push-constant field of the same name - confirm
// against the definition of psc(), which is not in this file.
#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

// Bindings 0 and 1: the input (read) and the output (write-only) blob, either
// as a 3D sampler / 3D image or as storage buffers of sfpvec8 elements.
// unfp and imfmtc4 are not defined in this file.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif
// Bindings 2 and 3: per output column (indexed by gx in main) the four
// horizontal weights and the source column offset.
layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; };
layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; };
// Bindings 4 and 5: per output row (indexed by gy in main) the four vertical
// weights and the source row offset.
layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; };
layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; };

// Push-constant block carrying the same ten shape fields as the
// specialization constants declared in this file, in the same order:
// first the input blob, then the output blob.
layout (push_constant) uniform parameter
{
    // input blob
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // output blob
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
} p;

// Compute entry point: one invocation produces the output element at
// (gx, gy, gz) of top_blob as a weighted sum of neighbouring input elements.
//
// - dims == 2: the four horizontally adjacent elements of row gy at columns
//   sx-1 .. sx+2 are combined with the four alpha weights of column gx.
// - otherwise: the same horizontal combination is done on the four rows
//   sy-1 .. sy+2, and the four row results are then combined with the four
//   beta weights of row gy.
//
// psc(), afpvec4/afpvec8 and the buffer_*/image3d_* accessors are not defined
// in this file; presumably they come from a shared shader preamble - confirm.
// NOTE(review): the neighbour loads are not range-checked here, so the
// xofs/yofs tables presumably keep sx-1..sx+2 and sy-1..sy+2 inside the input
// - confirm against the code that fills them.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent have nothing to write.
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    // The source column and the horizontal weights are used by both paths.
    int sx = xofs_blob_data[gx];
    afpvec4 alpha = buffer_ld4(alpha_blob_data, gx);

    afpvec8 v;

    if (psc(dims) == 2)
    {
        // Horizontal pass only: the source row is the output row gy.
#if NCNN_image_shader
        afpvec8 b0 = image3d_ld8(bottom_blob, ivec3(sx - 1, gy, gz));
        afpvec8 b1 = image3d_ld8(bottom_blob, ivec3(sx + 0, gy, gz));
        afpvec8 b2 = image3d_ld8(bottom_blob, ivec3(sx + 1, gy, gz));
        afpvec8 b3 = image3d_ld8(bottom_blob, ivec3(sx + 2, gy, gz));
#else
        int v_offset_1 = gz * psc(cstep) + gy * psc(w) + sx;

        afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1 - 1);
        afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 0);
        afpvec8 b2 = buffer_ld8(bottom_blob_data, v_offset_1 + 1);
        afpvec8 b3 = buffer_ld8(bottom_blob_data, v_offset_1 + 2);
#endif

        v[0] = b0[0] * alpha.r + b1[0] * alpha.g + b2[0] * alpha.b + b3[0] * alpha.a;
        v[1] = b0[1] * alpha.r + b1[1] * alpha.g + b2[1] * alpha.b + b3[1] * alpha.a;
    }
    else
    {
        int sy = yofs_blob_data[gy];

        // Load the 4x4 neighbourhood: rows sy-1 .. sy+2 (a, b, c, d),
        // columns sx-1 .. sx+2 (0 .. 3).
#if NCNN_image_shader
        afpvec8 a0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy - 1, gz));
        afpvec8 a1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy - 1, gz));
        afpvec8 a2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy - 1, gz));
        afpvec8 a3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy - 1, gz));

        afpvec8 b0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 0, gz));
        afpvec8 b1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 0, gz));
        afpvec8 b2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 0, gz));
        afpvec8 b3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 0, gz));

        afpvec8 c0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 1, gz));
        afpvec8 c1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 1, gz));
        afpvec8 c2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz));
        afpvec8 c3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 1, gz));

        afpvec8 d0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 2, gz));
        afpvec8 d1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 2, gz));
        afpvec8 d2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 2, gz));
        afpvec8 d3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 2, gz));
#else
        int v_offset_0 = gz * psc(cstep) + (sy - 1) * psc(w) + sx;
        int v_offset_1 = gz * psc(cstep) + (sy + 0) * psc(w) + sx;
        int v_offset_2 = gz * psc(cstep) + (sy + 1) * psc(w) + sx;
        int v_offset_3 = gz * psc(cstep) + (sy + 2) * psc(w) + sx;

        afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0 - 1);
        afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 0);
        afpvec8 a2 = buffer_ld8(bottom_blob_data, v_offset_0 + 1);
        afpvec8 a3 = buffer_ld8(bottom_blob_data, v_offset_0 + 2);

        afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1 - 1);
        afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 0);
        afpvec8 b2 = buffer_ld8(bottom_blob_data, v_offset_1 + 1);
        afpvec8 b3 = buffer_ld8(bottom_blob_data, v_offset_1 + 2);

        afpvec8 c0 = buffer_ld8(bottom_blob_data, v_offset_2 - 1);
        afpvec8 c1 = buffer_ld8(bottom_blob_data, v_offset_2 + 0);
        afpvec8 c2 = buffer_ld8(bottom_blob_data, v_offset_2 + 1);
        afpvec8 c3 = buffer_ld8(bottom_blob_data, v_offset_2 + 2);

        afpvec8 d0 = buffer_ld8(bottom_blob_data, v_offset_3 - 1);
        afpvec8 d1 = buffer_ld8(bottom_blob_data, v_offset_3 + 0);
        afpvec8 d2 = buffer_ld8(bottom_blob_data, v_offset_3 + 1);
        afpvec8 d3 = buffer_ld8(bottom_blob_data, v_offset_3 + 2);
#endif

        // Horizontal pass, identical for both storage modes. The row results
        // are named row_* so that they do not hide the specialization
        // constant `c`.
        afpvec8 row_a;
        row_a[0] = a0[0] * alpha.r + a1[0] * alpha.g + a2[0] * alpha.b + a3[0] * alpha.a;
        row_a[1] = a0[1] * alpha.r + a1[1] * alpha.g + a2[1] * alpha.b + a3[1] * alpha.a;

        afpvec8 row_b;
        row_b[0] = b0[0] * alpha.r + b1[0] * alpha.g + b2[0] * alpha.b + b3[0] * alpha.a;
        row_b[1] = b0[1] * alpha.r + b1[1] * alpha.g + b2[1] * alpha.b + b3[1] * alpha.a;

        afpvec8 row_c;
        row_c[0] = c0[0] * alpha.r + c1[0] * alpha.g + c2[0] * alpha.b + c3[0] * alpha.a;
        row_c[1] = c0[1] * alpha.r + c1[1] * alpha.g + c2[1] * alpha.b + c3[1] * alpha.a;

        afpvec8 row_d;
        row_d[0] = d0[0] * alpha.r + d1[0] * alpha.g + d2[0] * alpha.b + d3[0] * alpha.a;
        row_d[1] = d0[1] * alpha.r + d1[1] * alpha.g + d2[1] * alpha.b + d3[1] * alpha.a;

        // Vertical pass over the four row results.
        afpvec4 beta = buffer_ld4(beta_blob_data, gy);

        v[0] = row_a[0] * beta.r + row_b[0] * beta.g + row_c[0] * beta.b + row_d[0] * beta.a;
        v[1] = row_a[1] * beta.r + row_b[1] * beta.g + row_c[1] * beta.b + row_d[1] * beta.a;
    }

    // Single store shared by both paths.
#if NCNN_image_shader
    image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
    const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

    buffer_st8(top_blob_data, gi, v);
#endif
}

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.