// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "innerproduct.h"

#include "layer_type.h"

#include "fused_activation.h"

namespace ncnn {

InnerProduct::InnerProduct()
{
    one_blob_only = true;
    support_inplace = false;
}

int InnerProduct::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    bias_term = pd.get(1, 0);
    weight_data_size = pd.get(2, 0);
    int8_scale_term = pd.get(8, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    if (int8_scale_term)
    {
#if NCNN_INT8
        support_int8_storage = true;
#else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
#endif
    }

    return 0;
}

int InnerProduct::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        weight_data_int8_scales = mb.load(num_output, 1);
        bottom_blob_int8_scales = mb.load(1, 1);
    }
#endif // NCNN_INT8

#if NCNN_INT8
    // runtime quantize the weight data
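    // fp32 weights loaded from the model are quantized to int8 once at load time,
    // using the per-output-channel scales in weight_data_int8_scales, so that
    // forward_int8() can consume weight_data directly as signed char.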
    if (weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        const int num_input = weight_data_size / num_output;

        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        Mat weight_data_int8;

        Option opt_q;
        opt_q.num_threads = 1;
        opt_q.use_packing_layout = false;
        quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
        if (weight_data_int8.empty())
            return -100;

        weight_data = weight_data_int8.reshape(weight_data_size);
    }
#endif // NCNN_INT8

    return 0;
}

int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    const int num_input = weight_data_size / num_output;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;
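
    // Two execution paths: a 2D bottom blob whose width equals num_input is treated
    // as a batch of h row vectors (one matrix product per row); any other shape is
    // flattened and reduced to a single num_output-length vector.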
    if (bottom_blob.dims == 2 && w == num_input)
    {
        // gemm
        top_blob.create(num_output, h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            const float* m = bottom_blob.row(j);
            float* outptr = top_blob.row(j);

            for (int p = 0; p < num_output; p++)
            {
                // weights for output channel p are stored as a contiguous row of w values
                const float* kptr = (const float*)weight_data + w * p;

                float sum = 0.f;

                if (bias_term)
                    sum = bias_data[p];

                for (int i = 0; i < w; i++)
                {
                    sum += m[i] * kptr[i];
                }

                outptr[p] = activation_ss(sum, activation_type, activation_params);
            }
        }

        return 0;
    }

    top_blob.create(num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        float sum = 0.f;

        if (bias_term)
            sum = bias_data[p];

        // channels
        for (int q = 0; q < channels; q++)
        {
            const float* w = (const float*)weight_data + size * channels * p + size * q;
            const float* m = bottom_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                sum += m[i] * w[i];
            }
        }

        top_blob[p] = activation_ss(sum, activation_type, activation_params);
    }

    return 0;
}

#if NCNN_INT8
int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;
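
    // If the input arrives as fp32 (elemsize != 1), quantize it to int8 on the fly
    // with the blob scale bottom_blob_int8_scales[0] before running the integer kernel.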
    Mat bottom_blob_int8 = bottom_blob;
    if (elemsize != 1)
    {
        Option opt_g = opt;
        opt_g.blob_allocator = opt.workspace_allocator;
        opt_g.use_packing_layout = false;

        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_g);
    }

    if (bottom_blob.dims == 2 && w == num_input)
    {
        // gemm
        top_blob.create(num_output, h, 4u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            const signed char* m = bottom_blob_int8.row<signed char>(j);
            float* outptr = top_blob.row(j);

            for (int p = 0; p < num_output; p++)
            {
                const signed char* kptr = (const signed char*)weight_data + w * p;

                int sum = 0;

                for (int i = 0; i < w; i++)
                {
                    sum += m[i] * kptr[i];
                }

                // dequantize and relu
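                // Map the int32 accumulator back to fp32 with
                //   scale_in = 1 / (input_scale * per-output-channel weight_scale).
                // A zero weight scale would divide by zero, so the result is forced to 0 instead.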
                float scale_in;
                if (weight_data_int8_scales[p] == 0)
                    scale_in = 0;
                else
                    scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                float sumfp32 = sum * scale_in;

                if (bias_term)
                    sumfp32 += bias_data[p];

                outptr[p] = activation_ss(sumfp32, activation_type, activation_params);
            }
        }

        return 0;
    }

    top_blob.create(num_output, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        float* outptr = top_blob;

        int sum = 0;

        int offset = size * channels * p;

        // channels
        for (int q = 0; q < channels; q++)
        {
            const signed char* w = (const signed char*)weight_data + offset + size * q;
            const signed char* m = bottom_blob_int8.channel(q);

            for (int i = 0; i < size; i++)
            {
                sum += m[i] * w[i];
            }
        }

        // dequantize and relu, same scheme as the gemm path above
        float scale_in;
        if (weight_data_int8_scales[p] == 0)
            scale_in = 0;
        else
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

        float sumfp32 = sum * scale_in;

        if (bias_term)
            sumfp32 += bias_data[p];

        outptr[p] = activation_ss(sumfp32, activation_type, activation_params);
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn