ncnn

normalize_coeffs_pack4.comp
125 строк · 3.2 Кб
Перенос по словам
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// 16-bit storage / arithmetic extensions are only required when the
// corresponding NCNN_* build macro is non-zero.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constants.
// across_spatial: declared for the pipeline interface; not read by the code in this file.
layout (constant_id = 0) const int across_spatial = 0;
// across_channel: 0 = one coefficient per lane, 1 = the four lanes are summed first.
layout (constant_id = 1) const int across_channel = 0;
// eps: regularisation term applied to the squared sum as selected by eps_mode.
layout (constant_id = 2) const float eps = 0.f;
// eps_mode: 0 = caffe/mxnet, 1 = pytorch, 2 = tensorflow.
layout (constant_id = 3) const int eps_mode = 0;

// Resources: binding 0 is the squared-sum input, binding 1 receives the coefficients.
// NOTE(review): imfmtc4, unfp and sfpvec4 are not defined in this file — presumably
// supplied by the shader preamble; confirm.
#if NCNN_image_shader
layout (binding = 0) uniform highp sampler3D sqsum_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D coeffs_blob;
#else
layout (binding = 0) readonly buffer sqsum_blob { vec4 sqsum_blob_data[]; };
layout (binding = 1) writeonly buffer coeffs_blob { sfpvec4 coeffs_blob_data[]; };
#endif

// Push constants: w/h/c bound the invocation grid; cstep is the per-gz stride
// used when indexing sqsum_blob_data in the buffer path.
layout (push_constant) uniform parameter
{
    int w;
    int h;
    int c;
    int cstep;
} p;

// Returns, per lane, the reciprocal of the eps-regularised square root of sqsum.
// The regularisation form is chosen by the eps_mode specialization constant:
//   0 (caffe/mxnet): 1 / sqrt(sqsum + eps)
//   1 (pytorch):     1 / max(sqrt(sqsum), eps)
//   otherwise:       1 / sqrt(max(sqsum, eps))   (tensorflow, eps_mode == 2)
// The last form doubles as the fallback so the result is never left uninitialized
// when eps_mode holds an unexpected value.
afpvec4 inv_norm_4(afpvec4 sqsum)
{
    afpvec4 a;

    if (eps_mode == 0) // caffe/mxnet
    {
        a = afp(1.f) / sqrt(sqsum + afp(eps));
    }
    else if (eps_mode == 1) // pytorch
    {
        a = afp(1.f) / max(sqrt(sqsum), afp(eps));
    }
    else // tensorflow (eps_mode == 2) and fallback
    {
        a = afp(1.f) / sqrt(max(sqsum, afp(eps)));
    }

    return a;
}

// One invocation per (gx, gy, gz): reads one vec4 of squared sums from sqsum_blob,
// converts it to normalization coefficients and stores them into coeffs_blob.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Discard invocations outside the w x h x c grid given by the push constants.
    if (gx >= p.w || gy >= p.h || gz >= p.c)
        return;

#if NCNN_image_shader
    vec4 sqsum_4 = texelFetch(sqsum_blob, ivec3(gx, gy, gz), 0);
#else
    // NOTE(review): the buffer path does not use gy for addressing — presumably
    // h == 1 for these blobs; confirm against the dispatching code.
    int v_offset = gz * p.cstep + gx;

    vec4 sqsum_4 = sqsum_blob_data[v_offset];
#endif

    afpvec4 sqsum;

    if (across_channel == 0)
    {
        // Each of the four lanes keeps its own squared sum.
        sqsum = afpvec4(sqsum_4);
    }
    else if (across_channel == 1)
    {
        // The four lanes are summed in float, then the single value is
        // broadcast so every lane yields the same coefficient.
        sqsum = afpvec4(afp(sqsum_4.r + sqsum_4.g + sqsum_4.b + sqsum_4.a));
    }
    else
    {
        // No store is performed for any other across_channel value.
        return;
    }

    afpvec4 a = inv_norm_4(sqsum);

#if NCNN_image_shader
    // The coefficient image is addressed along x only, using a flattened index.
    int gi = gz * p.w * p.h + gy * p.w + gx;

    image3d_st4(coeffs_blob, ivec3(gi, 0, 0), a);
#else
    int gi = gz * p.w + gx;

    buffer_st4(coeffs_blob_data, gi, a);
#endif
}
ncnn

Использование cookies