// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "deconvolution_riscv.h"
18
#include "layer_type.h"
21
#include <riscv_vector.h>
22
#endif // __riscv_vector
24
#include "riscv_activation.h"
25
#include "riscv_usability.h"
30
#include "deconvolution_packn.h"
31
#include "deconvolution_pack1ton.h"
32
#include "deconvolution_packnto1.h"
35
#include "deconvolution_fp16s.h"
36
#include "deconvolution_packn_fp16s.h"
37
#include "deconvolution_pack1ton_fp16s.h"
38
#include "deconvolution_packnto1_fp16s.h"
40
#endif // __riscv_vector
42

Deconvolution_riscv::Deconvolution_riscv()
{
#if __riscv_vector
    support_packing = true;
#if __riscv_zfh
    support_fp16_storage = true;
#endif
#endif // __riscv_vector
}

int Deconvolution_riscv::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

#if __riscv_vector && __riscv_zfh
    if (opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if __riscv_vector
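    // packn = number of fp32 lanes in one RVV vector register. csrr_vlenb()
    // reads the vlenb CSR, the vector register width in bytes, so dividing by
    // sizeof(float) == 4 gives the lane count (e.g. packn = 4 when VLEN = 128).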
    const int packn = csrr_vlenb() / 4;
#endif

    const int maxk = kernel_w * kernel_h;
    int num_input = weight_data_size / maxk / num_output;

    Mat weight_data_transposed(weight_data.w);
    {
        float* pt = weight_data_transposed;
        const float* p = weight_data;

        for (int i = 0; i < num_input * num_output; i++)
        {
            for (int k = 0; k < maxk; k++)
            {
                pt[maxk - 1 - k] = p[k];
            }

            p += maxk;
            pt += maxk;
        }
    }
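
    // Writing pt[maxk - 1 - k] = p[k] flips each kw x kh window, i.e. rotates
    // the kernel by 180 degrees. With flipped weights the transposed
    // convolution can be computed as a plain correlation over the
    // (implicitly zero-inserted) input, which is the gather loop in forward().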

    int elempack = 1;
    int out_elempack = 1;
#if __riscv_vector
    if (opt.use_packing_layout)
    {
        elempack = num_input % packn == 0 ? packn : 1;
        out_elempack = num_output % packn == 0 ? packn : 1;
    }
#endif

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            float* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }
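
    // Example with packn = 4, elempack = 4, out_elempack = 4: for every kernel
    // position k, a 4x4 tile of weights w(q+j, p+i, k) is stored with the
    // output lane j innermost, then the input lane i, so a packed kernel can
    // broadcast one input lane and accumulate into packn output lanes at once.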

#if __riscv_vector
    // all elempack combinations share the packed weight layout prepared above
    if (elempack == packn && out_elempack == packn)
    {
    }

    if (elempack == 1 && out_elempack == packn)
    {
    }

    if (elempack == packn && out_elempack == 1)
    {
    }
#endif // __riscv_vector

    if (elempack == 1 && out_elempack == 1)
    {
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Deconvolution_riscv::destroy_pipeline(const Option& opt)
{
    return 0;
}

int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if __riscv_vector && __riscv_zfh
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if __riscv_vector
    const int packn = csrr_vlenb() / 4;
#endif

    // deconvolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
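
    // Output size is the inverse of the convolution formula
    // w_out = (w_in - kernel_extent) / stride + 1, hence
    // outw = (w - 1) * stride_w + kernel_extent_w; output_pad_right/bottom
    // disambiguate sizes that the integer division would otherwise alias.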
    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = 1;
#if __riscv_vector
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % packn == 0 ? packn : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;
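
    // The kernels always produce the full, uncropped deconvolution output.
    // If padding or an explicit output_w/output_h is set, render into a
    // scratch blob first and let cut_padding() crop it into top_blob below.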
    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

#if __riscv_vector
    if (elempack == packn && out_elempack == packn)
    {
        deconvolution_packn_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == packn)
    {
        deconvolution_pack1ton_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == packn && out_elempack == 1)
    {
        deconvolution_packnto1_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }
#endif // __riscv_vector
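
    // Naive scalar fallback, formulated as a gather: output pixel (i, j) and
    // kernel tap (y, x) read input sample sy = (i + y * dilation_h -
    // (kernel_extent_h - 1)) / stride_h (same for sx), and the tap only
    // contributes when that numerator is non-negative and divisible by the
    // stride, i.e. it lands on a real sample of the zero-inserted input.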
    if (elempack == 1 && out_elempack == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output; p++)
        {
            float* outptr = top_blob_bordered.channel(p);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const float* kptr = (const float*)weight_data_tm.channel(p);

                    for (int q = 0; q < channels; q++)
                    {
                        const Mat m = bottom_blob.channel(q);

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const float* sptr = m.row(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = sptr[sx];

                                int k = y * kernel_w + x;

                                sum += val * kptr[k];
                            }
                        }

                        kptr += maxk;
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }

                outptr += outw;
            }
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int Deconvolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _num_input = bottom_blob.c * bottom_blob.elempack;
    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.d * 1;
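
    // Dynamic weights arrive as extra inputs: bottom_blobs[1] is the weight
    // blob with w = kernel_w, h = kernel_h, d = num_output, c = num_input,
    // and bottom_blobs[2] (if bias_term) is the bias.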

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

#if NCNN_RVV
    if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_RVV

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
    // group is always 1 for Deconvolution (DeconvolutionDepthWise handles groups)
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / 1;
        const int inch_g = _num_input / 1;
        const int maxk = _kernel_h * _kernel_w;

        for (int g = 0; g < 1; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

#if NCNN_RVV
        if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_RVV

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }
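
    // With weights only known at runtime, fall back to the reference CPU
    // Deconvolution layer: rebuild its ParamDict from this layer's parameters,
    // hand it the transposed weights, and run it for this single call.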
    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, output_pad_right);
    pd.set(19, output_pad_bottom);
    pd.set(20, output_w);
    pd.set(21, output_h);
    pd.set(5, bias_term);
    pd.set(6, weight_data_transposed.w);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_transposed;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if __riscv_vector && __riscv_zfh
int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)
{
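    // For fp16 storage one vector register holds vlenb / 2 half-precision
    // lanes, so packn here is twice the fp32 packn used in create_pipeline().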
    const int packn = csrr_vlenb() / 2;

    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;

    if (opt.use_packing_layout)
    {
        elempack = num_input % packn == 0 ? packn : 1;
        out_elempack = num_output % packn == 0 ? packn : 1;
    }

    Mat weight_data_transposed(weight_data.w);
    {
        float* pt = weight_data_transposed;
        const float* p = weight_data;

        for (int i = 0; i < num_input * num_output; i++)
        {
            for (int k = 0; k < maxk; k++)
            {
                pt[maxk - 1 - k] = p[k];
            }

            p += maxk;
            pt += maxk;
        }
    }

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            __fp16* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = (__fp16)k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }

    // as in create_pipeline(), every elempack pair uses the same packed layout
    if (elempack == packn && out_elempack == packn)
    {
    }

    if (elempack == 1 && out_elempack == packn)
    {
    }

    if (elempack == packn && out_elempack == 1)
    {
    }

    if (elempack == 1 && out_elempack == 1)
    {
    }
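
    // Keep a half-precision copy of the bias: the fp16sa kernels consume
    // bias_data_fp16, while forward_fp16s keeps the fp32 bias_data.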
    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int packn = csrr_vlenb() / 2;

    // deconvolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    if (elempack == packn && out_elempack == packn)
    {
        deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == packn)
    {
        deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == packn && out_elempack == 1)
    {
        deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == 1)
    {
        deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
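
// forward_fp16s stores activations in fp16 but accumulates in fp32, while
// forward_fp16sa (chosen when opt.use_fp16_arithmetic is set) also computes
// in fp16, trading a little precision for higher throughput.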
int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int packn = csrr_vlenb() / 2;

    // deconvolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    if (elempack == packn && out_elempack == packn)
    {
        deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == packn)
    {
        deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == packn && out_elempack == 1)
    {
        deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == 1)
    {
        // no dedicated fp16sa scalar path; fall back to the fp16s kernel with the fp32 bias
        deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // __riscv_vector && __riscv_zfh

} // namespace ncnn