ncnn

Форк
0
/
convolution_3x3s1d1_winograd23_transform_output.comp 
178 строк · 6.7 Кб
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#version 450
16

17
#if NCNN_fp16_storage
18
#extension GL_EXT_shader_16bit_storage: require
19
#endif
20
#if NCNN_fp16_arithmetic
21
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
22
#endif
23

24
#extension GL_GOOGLE_include_directive: enable
25
#include "vulkan_activation.comp"
26

27
// Specialization constants, set at pipeline creation time.
// bias_term: main() loads and adds a per-channel bias only when this equals 1.
layout (constant_id = 0) const int bias_term = 0;
28
// activation_type / activation_param_0 / activation_param_1 are forwarded
// unchanged to activation_afp() for each of the four outputs.
layout (constant_id = 1) const int activation_type = 0;
29
layout (constant_id = 2) const float activation_param_0 = 0;
30
layout (constant_id = 3) const float activation_param_1 = 0;
31

32
// Shape constants are numbered after the four constants above.
#define shape_constant_id_offset 4
33
// Each shape constant below has a same-named field in the push-constant block
// and is only ever read through psc(). NOTE(review): psc() is defined outside
// this file; presumably it falls back to the push constant when the
// specialization constant is left at 0 - confirm.
// c bounds the z invocation index; cstep is the element stride between the
// 16 consecutive transformed values in the buffer path.
layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
34
layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;
35

36
// block_x / block_y bound the x and y invocation indices (tile grid size).
layout (constant_id = shape_constant_id_offset + 2) const int block_x = 0;
37
layout (constant_id = shape_constant_id_offset + 3) const int block_y = 0;
38

39
// outw / outh guard the edge stores; outcstep is the per-channel element
// stride of the output buffer.
layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
40
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
41
layout (constant_id = shape_constant_id_offset + 6) const int outcstep = 0;
42

43
// Resource bindings. Binding 0 is the transformed input read by main(),
// binding 1 is the output it writes, binding 2 is the bias (read only when
// bias_term == 1). The image variant uses 3D samplers/images; the fallback
// uses storage buffers of sfp elements.
#if NCNN_image_shader
44
layout (binding = 0) uniform unfp sampler3D top_tm_blob;
45
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
46
layout (binding = 2) uniform unfp sampler3D bias_blob;
47
#else
48
layout (binding = 0) readonly buffer top_tm_blob { sfp top_tm_blob_data[]; };
49
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
50
layout (binding = 2) readonly buffer bias_blob { sfp bias_data[]; };
51
#endif
52

53
// Push-constant block with the runtime shape values. Every field has a
// same-named specialization constant; main() reads them only through psc().
// NOTE(review): psc() is defined outside this file - confirm how it chooses
// between the two sources.
layout (push_constant) uniform parameter
54
{
55
    // Channel count and element stride between transformed values.
    int c;
56
    int cstep;
57

58
    // Tile grid dimensions.
    int block_x;
59
    int block_y;
60

61
    // Output width, height and per-channel element stride.
    int outw;
62
    int outh;
63
    int outcstep;
64
} p;
65

66
// Output transform entry point.
//
// Each invocation handles one tile position (x, y) of one channel (z):
//   1. read the 16 transformed values of that tile from top_tm_blob,
//   2. reduce them to a 2x2 result with the 2x4 matrix quoted below,
//      applied in two passes,
//   3. add the channel bias when bias_term == 1,
//   4. apply the configured activation,
//   5. store the 2x2 result into top_blob at (2 * x, 2 * y).
void main()
{
    const int tile_x = int(gl_GlobalInvocationID.x);
    const int tile_y = int(gl_GlobalInvocationID.y);
    const int channel = int(gl_GlobalInvocationID.z);

    // Invocations outside the tile grid or the channel range do nothing.
    if (tile_x >= psc(block_x)) return;
    if (tile_y >= psc(block_y)) return;
    if (channel >= psc(c)) return;

    // Load the 16 transformed values of this tile.
#if NCNN_image_shader
    // Image layout: x = flattened tile index, y = channel, z = value index 0..15.
    const int tile_index = tile_y * psc(block_x) + tile_x;

    afp t00 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 0));
    afp t01 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 1));
    afp t02 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 2));
    afp t03 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 3));
    afp t10 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 4));
    afp t11 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 5));
    afp t12 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 6));
    afp t13 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 7));
    afp t20 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 8));
    afp t21 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 9));
    afp t22 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 10));
    afp t23 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 11));
    afp t30 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 12));
    afp t31 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 13));
    afp t32 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 14));
    afp t33 = image3d_ld1(top_tm_blob, ivec3(tile_index, channel, 15));
#else
    // Buffer layout: consecutive values of one tile are cstep elements apart.
    const int tm_base = channel * psc(block_x) * psc(block_y) + tile_y * psc(block_x) + tile_x;
    const int tm_stride = psc(cstep);

    afp t00 = buffer_ld1(top_tm_blob_data, tm_base);
    afp t01 = buffer_ld1(top_tm_blob_data, tm_base + tm_stride);
    afp t02 = buffer_ld1(top_tm_blob_data, tm_base + 2 * tm_stride);
    afp t03 = buffer_ld1(top_tm_blob_data, tm_base + 3 * tm_stride);
    afp t10 = buffer_ld1(top_tm_blob_data, tm_base + 4 * tm_stride);
    afp t11 = buffer_ld1(top_tm_blob_data, tm_base + 5 * tm_stride);
    afp t12 = buffer_ld1(top_tm_blob_data, tm_base + 6 * tm_stride);
    afp t13 = buffer_ld1(top_tm_blob_data, tm_base + 7 * tm_stride);
    afp t20 = buffer_ld1(top_tm_blob_data, tm_base + 8 * tm_stride);
    afp t21 = buffer_ld1(top_tm_blob_data, tm_base + 9 * tm_stride);
    afp t22 = buffer_ld1(top_tm_blob_data, tm_base + 10 * tm_stride);
    afp t23 = buffer_ld1(top_tm_blob_data, tm_base + 11 * tm_stride);
    afp t30 = buffer_ld1(top_tm_blob_data, tm_base + 12 * tm_stride);
    afp t31 = buffer_ld1(top_tm_blob_data, tm_base + 13 * tm_stride);
    afp t32 = buffer_ld1(top_tm_blob_data, tm_base + 14 * tm_stride);
    afp t33 = buffer_ld1(top_tm_blob_data, tm_base + 15 * tm_stride);
#endif

    // Transform matrix applied in both passes:
    //     { 1,  1,  1,  0 }
    //     { 0,  1, -1,  1 }

    // First pass: combine the four values of each input group. The result
    // is indexed [matrix row][input group], i.e. already transposed.
    afp r00 = t00 + t01 + t02;
    afp r01 = t10 + t11 + t12;
    afp r02 = t20 + t21 + t22;
    afp r03 = t30 + t31 + t32;

    afp r10 = t01 - t02 + t03;
    afp r11 = t11 - t12 + t13;
    afp r12 = t21 - t22 + t23;
    afp r13 = t31 - t32 + t33;

    // Second pass: the same matrix across the first-pass results gives the
    // 2x2 output. Operand order is kept fixed so rounding is unchanged.
    afp o00;
    afp o01;
    afp o10;
    afp o11;

    if (bias_term == 1)
    {
#if NCNN_image_shader
        const afp bias_value = image3d_ld1(bias_blob, ivec3(channel, 0, 0));
#else
        const afp bias_value = buffer_ld1(bias_data, channel);
#endif

        o00 = bias_value + r00 + r01 + r02;
        o10 = bias_value + r10 + r11 + r12;

        o01 = bias_value + r01 - r02 + r03;
        o11 = bias_value + r11 - r12 + r13;
    }
    else
    {
        o00 = r00 + r01 + r02;
        o10 = r10 + r11 + r12;

        o01 = r01 - r02 + r03;
        o11 = r11 - r12 + r13;
    }

    o00 = activation_afp(o00, activation_type, activation_param_0, activation_param_1);
    o10 = activation_afp(o10, activation_type, activation_param_0, activation_param_1);
    o01 = activation_afp(o01, activation_type, activation_param_0, activation_param_1);
    o11 = activation_afp(o11, activation_type, activation_param_0, activation_param_1);

    // Store the 2x2 patch; its top-left corner is at (2 * tile_x, 2 * tile_y).
    const int out_x = tile_x * 2;
    const int out_y = tile_y * 2;

#if NCNN_image_shader
    image3d_st1(top_blob, ivec3(out_x, out_y, channel), o00);
    image3d_st1(top_blob, ivec3(out_x + 1, out_y, channel), o01);
    image3d_st1(top_blob, ivec3(out_x, out_y + 1, channel), o10);
    image3d_st1(top_blob, ivec3(out_x + 1, out_y + 1, channel), o11);
#else
    const int row0 = channel * psc(outcstep) + out_y * psc(outw) + out_x;
    const int row1 = row0 + psc(outw);

    // The second column / second row may fall outside the output when
    // outw / outh is odd, so those stores are guarded.
    const bool has_right = out_x + 1 < psc(outw);
    const bool has_below = out_y + 1 < psc(outh);

    buffer_st1(top_blob_data, row0, o00);
    if (has_right) buffer_st1(top_blob_data, row0 + 1, o01);
    if (has_below)
    {
        buffer_st1(top_blob_data, row1, o10);
        if (has_right) buffer_st1(top_blob_data, row1 + 1, o11);
    }
#endif
}
179

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.