16
#define _CRT_SECURE_NO_DEPRECATE
25
#include "datareader.h"
27
#include "layer_type.h"
31
#include "modelwriter.h"
33
// DataReader subclass derived from ncnn::DataReader.
// NOTE(review): judging by the name it presumably stands in for an absent weight
// source; the method bodies are not visible in this excerpt, so confirm what
// scan() and read() actually return before relying on that.
class DataReaderFromEmpty : public ncnn::DataReader
36
// Formatted-scan hook: receives a format string and an output pointer.
virtual int scan(const char* format, void* p) const
40
// Raw-read hook: receives a destination buffer and a byte count.
virtual size_t read(void* buf, size_t size) const
47
// Graph optimizer built on top of ModelWriter. It declares a set of passes
// grouped by prefix: fuse_* (merge a layer into its producer), eliminate_*
// (drop layers) and replace_* (swap a layer for another one).
// NOTE(review): every pass returns int; the meaning of the return value is not
// visible in this excerpt.
class NetOptimize : public ModelWriter
53
// --- fusion passes: fold a following layer into the preceding one ---
int fuse_batchnorm_scale();
54
int fuse_convolution_batchnorm();
55
int fuse_convolution_mul();
56
int fuse_convolution_add();
57
int fuse_convolutiondepthwise_batchnorm();
58
int fuse_convolutiondepthwise_mul();
59
int fuse_convolutiondepthwise_add();
60
int fuse_deconvolution_batchnorm();
61
int fuse_deconvolution_mul();
62
int fuse_deconvolution_add();
63
int fuse_deconvolutiondepthwise_batchnorm();
64
int fuse_innerproduct_batchnorm();
65
int fuse_innerproduct_add();
66
int fuse_innerproduct_dropout();
67
int fuse_convolution_activation();
68
int fuse_convolutiondepthwise_activation();
69
int fuse_deconvolution_activation();
70
int fuse_deconvolutiondepthwise_activation();
71
int fuse_innerproduct_activation();
72
int fuse_memorydata_binaryop();
73
int fuse_binaryop_eltwise();
75
// --- elimination passes ---
int eliminate_dropout();
76
int eliminate_pooling1x1();
78
int eliminate_split();
79
int eliminate_orphaned_memorydata();
80
int eliminate_flatten_after_global_pooling();
81
int eliminate_reshape_after_global_pooling();
82
int eliminate_flatten_after_innerproduct();
83
int eliminate_reshape_before_binaryop();
85
// --- replacement passes ---
int replace_reduction_with_global_pooling();
86
int replace_prelu_with_leaky_relu();
87
int replace_convolution_with_innerproduct_after_global_pooling();
88
int replace_convolution_with_innerproduct_after_innerproduct();
91
// Default constructor of NetOptimize.
// NOTE(review): the initializer list and body are not visible in this excerpt.
NetOptimize::NetOptimize()
96
// Folds a Scale layer into the BatchNorm layer that feeds it.
// For each BatchNorm layer i, a Scale layer j with exactly one bottom equal to the
// BatchNorm's first top blob is searched for. On a match the Scale coefficients
// are multiplied into the BatchNorm slope/bias, the BatchNorm takes over the
// Scale's top blob, and the Scale layer is retyped to "ncnnfused".
// NOTE(review): the initialisation of j and the continue/break statements of the
// search are not visible in this excerpt.
int NetOptimize::fuse_batchnorm_scale()
98
const size_t layer_count = layers.size();
99
for (size_t i = 0; i < layer_count; i++)
101
if (layers[i]->type != "BatchNorm")
105
int top_blob_index = layers[i]->tops[0];
108
for (; j < layer_count; j++)
110
if (layers[j]->type != "Scale")
113
if (layers[j]->bottoms.size() != 1)
116
if (layers[j]->bottoms[0] == top_blob_index)
120
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
124
ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[i];
125
ncnn::Scale* scale = (ncnn::Scale*)layers[j];
127
fprintf(stderr, "fuse_batchnorm_scale %s %s\n", batchnorm->name.c_str(), scale->name.c_str());
133
int channels = batchnorm->channels;
135
float* slope = batchnorm->slope_data;
136
float* bias = batchnorm->bias_data;
138
// per channel: slope' = slope * s; bias' = bias * s (+ Scale bias when it has one)
for (int q = 0; q < channels; q++)
140
slope[q] = slope[q] * scale->scale_data[q];
141
if (scale->bias_term)
142
bias[q] = bias[q] * scale->scale_data[q] + scale->bias_data[q];
144
bias[q] = bias[q] * scale->scale_data[q];
148
// rewire: the BatchNorm now produces the blob the Scale layer used to produce
int top_blob_index_final = scale->tops[0];
149
batchnorm->tops[0] = top_blob_index_final;
150
blobs[top_blob_index_final].producer = i;
151
scale->type = "ncnnfused";
157
// Folds a BatchNorm layer into the Convolution layer that feeds it.
// For each Convolution layer i, a BatchNorm layer j with exactly one bottom equal
// to the convolution's first top blob is searched for. On a match the BatchNorm is
// expressed as a per-channel pair (a, b), the weights are scaled by b, the bias
// becomes bias * b + a, the convolution takes over the BatchNorm's top blob, and
// the BatchNorm layer is retyped to "ncnnfused".
// NOTE(review): the initialisation of j and the continue/break statements of the
// search are not visible in this excerpt.
int NetOptimize::fuse_convolution_batchnorm()
159
const size_t layer_count = layers.size();
160
for (size_t i = 0; i < layer_count; i++)
162
if (layers[i]->type != "Convolution")
166
int top_blob_index = layers[i]->tops[0];
169
for (; j < layer_count; j++)
171
if (layers[j]->type != "BatchNorm")
174
if (layers[j]->bottoms.size() != 1)
177
if (layers[j]->bottoms[0] == top_blob_index)
181
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
185
ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
186
ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
188
fprintf(stderr, "fuse_convolution_batchnorm %s %s\n", convolution->name.c_str(), batchnorm->name.c_str());
191
int channels = batchnorm->channels;
192
float eps = batchnorm->eps;
198
// a = bias - slope * mean / sqrt(var + eps), b = slope / sqrt(var + eps)
std::vector<float> a(channels);
199
std::vector<float> b(channels);
200
for (int i = 0; i < channels; i++)
202
float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
203
a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
204
b[i] = batchnorm->slope_data[i] / sqrt_var;
207
// no bias yet: create a zero-filled one so the shift a[] has somewhere to go
if (convolution->bias_term == 0)
210
convolution->bias_term = 1;
211
convolution->bias_data = ncnn::Mat(channels);
212
convolution->bias_data.fill(0.f);
215
// assumes weight_data_size is a multiple of channels — TODO confirm
const int weight_per_outch = convolution->weight_data_size / channels;
217
float* weight = convolution->weight_data;
218
float* bias = convolution->bias_data;
219
for (int i = 0; i < channels; i++)
221
float* conv_weight_outch = weight + weight_per_outch * i;
222
for (int j = 0; j < weight_per_outch; j++)
224
conv_weight_outch[j] *= b[i];
227
bias[i] = bias[i] * b[i] + a[i];
231
// rewire: the convolution now produces the blob the BatchNorm used to produce
int top_blob_index_final = batchnorm->tops[0];
232
convolution->tops[0] = top_blob_index_final;
233
blobs[top_blob_index_final].producer = i;
234
batchnorm->type = "ncnnfused";
240
// Folds a BinaryOp that combines a Convolution output with a MemoryData constant
// into the Convolution itself by scaling weights and bias per output channel.
// The BinaryOp must have two bottoms, the first being the convolution's top blob,
// op_type 2 and no scalar operand; the second bottom must come from a MemoryData
// layer holding a 1-D constant of length num_output.
// NOTE(review): op_type 2 is presumably multiplication — confirm against the
// BinaryOp operation enum. The loop over k and the continue/break statements are
// not visible in this excerpt.
int NetOptimize::fuse_convolution_mul()
242
const size_t layer_count = layers.size();
243
for (size_t i = 0; i < layer_count; i++)
245
if (layers[i]->type != "Convolution")
249
int top_blob_index = layers[i]->tops[0];
252
for (; j < layer_count; j++)
254
if (layers[j]->type != "BinaryOp")
257
if (layers[j]->bottoms.size() != 2)
260
if (layers[j]->bottoms[0] == top_blob_index)
264
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
268
ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
269
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
271
if (binaryop->op_type != 2 || binaryop->with_scalar)
278
// locate the MemoryData layer whose first top is the BinaryOp's second bottom
if (layers[k]->type != "MemoryData")
281
if (layers[k]->tops[0] == binaryop->bottoms[1])
288
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
290
int channels = convolution->num_output;
292
// only a constant with w == num_output and h == c == 0 is accepted
if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
298
fprintf(stderr, "fuse_convolution_mul %s %s\n", convolution->name.c_str(), binaryop->name.c_str());
301
const int weight_per_outch = convolution->weight_data_size / channels;
303
float* weight = convolution->weight_data;
304
float* bias = convolution->bias_data;
305
for (int i = 0; i < channels; i++)
307
float* conv_weight_outch = weight + weight_per_outch * i;
308
for (int j = 0; j < weight_per_outch; j++)
310
conv_weight_outch[j] *= memorydata->data[i];
315
// NOTE(review): any guard around this bias update is not visible in this excerpt
bias[i] = bias[i] * memorydata->data[i];
320
// rewire: the convolution now produces the blob the BinaryOp used to produce
int top_blob_index_final = binaryop->tops[0];
321
convolution->tops[0] = top_blob_index_final;
322
blobs[top_blob_index_final].producer = i;
323
binaryop->type = "ncnnfused";
329
// Folds a BinaryOp that combines a Convolution output with a MemoryData constant
// into the Convolution's bias.
// The BinaryOp must have two bottoms, the first being the convolution's top blob,
// op_type 0 and no scalar operand; the second bottom must come from a MemoryData
// layer whose shape is either (w == num_output, h == 0, c == 0) or
// (w == 1, h == 1, c == num_output).
// NOTE(review): op_type 0 is presumably addition — confirm against the BinaryOp
// operation enum. The loop over k and the continue/break/else statements are not
// visible in this excerpt.
int NetOptimize::fuse_convolution_add()
331
const size_t layer_count = layers.size();
332
for (size_t i = 0; i < layer_count; i++)
334
if (layers[i]->type != "Convolution")
338
int top_blob_index = layers[i]->tops[0];
341
for (; j < layer_count; j++)
343
if (layers[j]->type != "BinaryOp")
346
if (layers[j]->bottoms.size() != 2)
349
if (layers[j]->bottoms[0] == top_blob_index)
353
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
357
ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
358
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
360
if (binaryop->op_type != 0 || binaryop->with_scalar)
367
// locate the MemoryData layer whose first top is the BinaryOp's second bottom
if (layers[k]->type != "MemoryData")
370
if (layers[k]->tops[0] == binaryop->bottoms[1])
377
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
379
int channels = convolution->num_output;
381
// accept only the two constant shapes that carry one value per output channel
bool broadcasting_type_ok = false;
382
if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
383
broadcasting_type_ok = true;
384
if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
385
broadcasting_type_ok = true;
387
if (!broadcasting_type_ok)
393
fprintf(stderr, "fuse_convolution_add %s %s\n", convolution->name.c_str(), binaryop->name.c_str());
395
// view the constant as a flat vector of `channels` values
ncnn::Mat bias_data = memorydata->data.reshape(channels);
397
// no bias yet: the constant becomes the bias
if (convolution->bias_term == 0)
400
convolution->bias_term = 1;
401
convolution->bias_data = bias_data;
405
// existing bias: add the constant element by element
// NOTE(review): the else separating this path from the one above is not visible here
float* bias = convolution->bias_data;
406
for (int i = 0; i < channels; i++)
408
bias[i] = bias[i] + bias_data[i];
413
// rewire: the convolution now produces the blob the BinaryOp used to produce
int top_blob_index_final = binaryop->tops[0];
414
convolution->tops[0] = top_blob_index_final;
415
blobs[top_blob_index_final].producer = i;
416
binaryop->type = "ncnnfused";
422
// Folds a BatchNorm layer into the ConvolutionDepthWise layer that feeds it.
// For each ConvolutionDepthWise layer i, a BatchNorm layer j with exactly one
// bottom equal to the layer's first top blob is searched for. On a match the
// BatchNorm is expressed as a per-channel pair (a, b), the weights are scaled by
// b, the bias becomes bias * b + a, the layer takes over the BatchNorm's top blob,
// and the BatchNorm layer is retyped to "ncnnfused".
// NOTE(review): the initialisation of j and the continue/break statements of the
// search are not visible in this excerpt.
int NetOptimize::fuse_convolutiondepthwise_batchnorm()
424
const size_t layer_count = layers.size();
425
for (size_t i = 0; i < layer_count; i++)
427
if (layers[i]->type != "ConvolutionDepthWise")
431
int top_blob_index = layers[i]->tops[0];
434
for (; j < layer_count; j++)
436
if (layers[j]->type != "BatchNorm")
439
if (layers[j]->bottoms.size() != 1)
442
if (layers[j]->bottoms[0] == top_blob_index)
446
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
450
ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
451
ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
453
fprintf(stderr, "fuse_convolutiondepthwise_batchnorm %s %s\n", convolutiondepthwise->name.c_str(), batchnorm->name.c_str());
456
int channels = batchnorm->channels;
457
float eps = batchnorm->eps;
463
// a = bias - slope * mean / sqrt(var + eps), b = slope / sqrt(var + eps)
std::vector<float> a(channels);
464
std::vector<float> b(channels);
465
for (int i = 0; i < channels; i++)
467
float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
468
a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
469
b[i] = batchnorm->slope_data[i] / sqrt_var;
472
// no bias yet: create a zero-filled one so the shift a[] has somewhere to go
if (convolutiondepthwise->bias_term == 0)
475
convolutiondepthwise->bias_term = 1;
476
convolutiondepthwise->bias_data = ncnn::Mat(channels);
477
convolutiondepthwise->bias_data.fill(0.f);
480
// assumes weight_data_size is a multiple of channels — TODO confirm
const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;
482
float* weight = convolutiondepthwise->weight_data;
483
float* bias = convolutiondepthwise->bias_data;
484
for (int i = 0; i < channels; i++)
486
float* conv_weight_outch = weight + weight_per_outch * i;
487
for (int j = 0; j < weight_per_outch; j++)
489
conv_weight_outch[j] *= b[i];
492
bias[i] = bias[i] * b[i] + a[i];
496
// rewire: this layer now produces the blob the BatchNorm used to produce
int top_blob_index_final = batchnorm->tops[0];
497
convolutiondepthwise->tops[0] = top_blob_index_final;
498
blobs[top_blob_index_final].producer = i;
499
batchnorm->type = "ncnnfused";
505
// Folds a BinaryOp that combines a ConvolutionDepthWise output with a MemoryData
// constant into the layer itself by scaling weights and bias per output channel.
// The BinaryOp must have two bottoms, the first being the layer's top blob,
// op_type 2 and no scalar operand; the second bottom must come from a MemoryData
// layer holding a 1-D constant of length num_output.
// NOTE(review): op_type 2 is presumably multiplication — confirm against the
// BinaryOp operation enum. The loop over k and the continue/break statements are
// not visible in this excerpt.
int NetOptimize::fuse_convolutiondepthwise_mul()
507
const size_t layer_count = layers.size();
508
for (size_t i = 0; i < layer_count; i++)
510
if (layers[i]->type != "ConvolutionDepthWise")
514
int top_blob_index = layers[i]->tops[0];
517
for (; j < layer_count; j++)
519
if (layers[j]->type != "BinaryOp")
522
if (layers[j]->bottoms.size() != 2)
525
if (layers[j]->bottoms[0] == top_blob_index)
529
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
533
ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
534
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
536
if (binaryop->op_type != 2 || binaryop->with_scalar)
543
// locate the MemoryData layer whose first top is the BinaryOp's second bottom
if (layers[k]->type != "MemoryData")
546
if (layers[k]->tops[0] == binaryop->bottoms[1])
553
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
555
int channels = convolutiondepthwise->num_output;
557
// only a constant with w == num_output and h == c == 0 is accepted
if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
563
fprintf(stderr, "fuse_convolutiondepthwise_mul %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());
566
const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;
568
float* weight = convolutiondepthwise->weight_data;
569
float* bias = convolutiondepthwise->bias_data;
570
for (int i = 0; i < channels; i++)
572
float* conv_weight_outch = weight + weight_per_outch * i;
573
for (int j = 0; j < weight_per_outch; j++)
575
conv_weight_outch[j] *= memorydata->data[i];
580
// NOTE(review): any guard around this bias update is not visible in this excerpt
bias[i] = bias[i] * memorydata->data[i];
585
// rewire: this layer now produces the blob the BinaryOp used to produce
int top_blob_index_final = binaryop->tops[0];
586
convolutiondepthwise->tops[0] = top_blob_index_final;
587
blobs[top_blob_index_final].producer = i;
588
binaryop->type = "ncnnfused";
594
// Folds a BinaryOp that combines a ConvolutionDepthWise output with a MemoryData
// constant into the layer's bias.
// The BinaryOp must have two bottoms, the first being the layer's top blob,
// op_type 0 and no scalar operand; the second bottom must come from a MemoryData
// layer whose shape is either (w == num_output, h == 0, c == 0) or
// (w == 1, h == 1, c == num_output).
// NOTE(review): op_type 0 is presumably addition — confirm against the BinaryOp
// operation enum. The loop over k and the continue/break/else statements are not
// visible in this excerpt.
int NetOptimize::fuse_convolutiondepthwise_add()
596
const size_t layer_count = layers.size();
597
for (size_t i = 0; i < layer_count; i++)
599
if (layers[i]->type != "ConvolutionDepthWise")
603
int top_blob_index = layers[i]->tops[0];
606
for (; j < layer_count; j++)
608
if (layers[j]->type != "BinaryOp")
611
if (layers[j]->bottoms.size() != 2)
614
if (layers[j]->bottoms[0] == top_blob_index)
618
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
622
ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
623
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
625
if (binaryop->op_type != 0 || binaryop->with_scalar)
632
// locate the MemoryData layer whose first top is the BinaryOp's second bottom
if (layers[k]->type != "MemoryData")
635
if (layers[k]->tops[0] == binaryop->bottoms[1])
642
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
644
int channels = convolutiondepthwise->num_output;
646
// accept only the two constant shapes that carry one value per output channel
bool broadcasting_type_ok = false;
647
if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
648
broadcasting_type_ok = true;
649
if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
650
broadcasting_type_ok = true;
652
if (!broadcasting_type_ok)
658
fprintf(stderr, "fuse_convolutiondepthwise_add %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());
660
// view the constant as a flat vector of `channels` values
ncnn::Mat bias_data = memorydata->data.reshape(channels);
662
// no bias yet: the constant becomes the bias
if (convolutiondepthwise->bias_term == 0)
665
convolutiondepthwise->bias_term = 1;
666
convolutiondepthwise->bias_data = bias_data;
670
// existing bias: add the constant element by element
// NOTE(review): the else separating this path from the one above is not visible here
float* bias = convolutiondepthwise->bias_data;
671
for (int i = 0; i < channels; i++)
673
bias[i] = bias[i] + bias_data[i];
678
// rewire: this layer now produces the blob the BinaryOp used to produce
int top_blob_index_final = binaryop->tops[0];
679
convolutiondepthwise->tops[0] = top_blob_index_final;
680
blobs[top_blob_index_final].producer = i;
681
binaryop->type = "ncnnfused";
687
// Folds a BatchNorm layer into the Deconvolution layer that feeds it.
// For each Deconvolution layer i, a BatchNorm layer j with exactly one bottom
// equal to the layer's first top blob is searched for. On a match the BatchNorm
// is expressed as a per-channel pair (a, b), the weights are scaled by b, the
// bias becomes bias * b + a, the layer takes over the BatchNorm's top blob, and
// the BatchNorm layer is retyped to "ncnnfused".
// NOTE(review): the initialisation of j and the continue/break statements of the
// search are not visible in this excerpt.
int NetOptimize::fuse_deconvolution_batchnorm()
689
const size_t layer_count = layers.size();
690
for (size_t i = 0; i < layer_count; i++)
692
if (layers[i]->type != "Deconvolution")
696
int top_blob_index = layers[i]->tops[0];
699
for (; j < layer_count; j++)
701
if (layers[j]->type != "BatchNorm")
704
if (layers[j]->bottoms.size() != 1)
707
if (layers[j]->bottoms[0] == top_blob_index)
711
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
715
ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
716
ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
718
fprintf(stderr, "fuse_deconvolution_batchnorm %s %s\n", deconvolution->name.c_str(), batchnorm->name.c_str());
721
int channels = batchnorm->channels;
722
float eps = batchnorm->eps;
728
// a = bias - slope * mean / sqrt(var + eps), b = slope / sqrt(var + eps)
std::vector<float> a(channels);
729
std::vector<float> b(channels);
730
for (int i = 0; i < channels; i++)
732
float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
733
a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
734
b[i] = batchnorm->slope_data[i] / sqrt_var;
737
// no bias yet: create a zero-filled one so the shift a[] has somewhere to go
if (deconvolution->bias_term == 0)
740
deconvolution->bias_term = 1;
741
deconvolution->bias_data = ncnn::Mat(channels);
742
deconvolution->bias_data.fill(0.f);
745
// treats the weights as `channels` consecutive runs of equal length
// NOTE(review): confirm this matches the Deconvolution weight layout
const int weight_per_outch = deconvolution->weight_data_size / channels;
747
float* weight = deconvolution->weight_data;
748
float* bias = deconvolution->bias_data;
749
for (int i = 0; i < channels; i++)
751
float* conv_weight_outch = weight + weight_per_outch * i;
752
for (int j = 0; j < weight_per_outch; j++)
754
conv_weight_outch[j] *= b[i];
757
bias[i] = bias[i] * b[i] + a[i];
761
// rewire: this layer now produces the blob the BatchNorm used to produce
int top_blob_index_final = batchnorm->tops[0];
762
deconvolution->tops[0] = top_blob_index_final;
763
blobs[top_blob_index_final].producer = i;
764
batchnorm->type = "ncnnfused";
770
// Folds a BinaryOp that combines a Deconvolution output with a MemoryData
// constant into the layer itself by scaling weights and bias per output channel.
// The BinaryOp must have two bottoms, the first being the layer's top blob,
// op_type 2 and no scalar operand; the second bottom must come from a MemoryData
// layer holding a 1-D constant of length num_output.
// NOTE(review): op_type 2 is presumably multiplication — confirm against the
// BinaryOp operation enum. The loop over k and the continue/break statements are
// not visible in this excerpt.
int NetOptimize::fuse_deconvolution_mul()
772
const size_t layer_count = layers.size();
773
for (size_t i = 0; i < layer_count; i++)
775
if (layers[i]->type != "Deconvolution")
779
int top_blob_index = layers[i]->tops[0];
782
for (; j < layer_count; j++)
784
if (layers[j]->type != "BinaryOp")
787
if (layers[j]->bottoms.size() != 2)
790
if (layers[j]->bottoms[0] == top_blob_index)
794
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
798
ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
799
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
801
if (binaryop->op_type != 2 || binaryop->with_scalar)
808
// locate the MemoryData layer whose first top is the BinaryOp's second bottom
if (layers[k]->type != "MemoryData")
811
if (layers[k]->tops[0] == binaryop->bottoms[1])
818
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
820
int channels = deconvolution->num_output;
822
// only a constant with w == num_output and h == c == 0 is accepted
if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
828
fprintf(stderr, "fuse_deconvolution_mul %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());
831
const int weight_per_outch = deconvolution->weight_data_size / channels;
833
float* weight = deconvolution->weight_data;
834
float* bias = deconvolution->bias_data;
835
for (int i = 0; i < channels; i++)
837
float* conv_weight_outch = weight + weight_per_outch * i;
838
for (int j = 0; j < weight_per_outch; j++)
840
conv_weight_outch[j] *= memorydata->data[i];
845
// NOTE(review): any guard around this bias update is not visible in this excerpt
bias[i] = bias[i] * memorydata->data[i];
850
// rewire: this layer now produces the blob the BinaryOp used to produce
int top_blob_index_final = binaryop->tops[0];
851
deconvolution->tops[0] = top_blob_index_final;
852
blobs[top_blob_index_final].producer = i;
853
binaryop->type = "ncnnfused";
859
// Folds a BinaryOp that combines a Deconvolution output with a MemoryData
// constant into the layer's bias.
// The BinaryOp must have two bottoms, the first being the layer's top blob,
// op_type 0 and no scalar operand; the second bottom must come from a MemoryData
// layer whose shape is either (w == num_output, h == 0, c == 0) or
// (w == 1, h == 1, c == num_output).
// NOTE(review): op_type 0 is presumably addition — confirm against the BinaryOp
// operation enum. The loop over k and the continue/break/else statements are not
// visible in this excerpt.
int NetOptimize::fuse_deconvolution_add()
861
const size_t layer_count = layers.size();
862
for (size_t i = 0; i < layer_count; i++)
864
if (layers[i]->type != "Deconvolution")
868
int top_blob_index = layers[i]->tops[0];
871
for (; j < layer_count; j++)
873
if (layers[j]->type != "BinaryOp")
876
if (layers[j]->bottoms.size() != 2)
879
if (layers[j]->bottoms[0] == top_blob_index)
883
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
887
ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
888
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
890
if (binaryop->op_type != 0 || binaryop->with_scalar)
897
// locate the MemoryData layer whose first top is the BinaryOp's second bottom
if (layers[k]->type != "MemoryData")
900
if (layers[k]->tops[0] == binaryop->bottoms[1])
907
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
909
int channels = deconvolution->num_output;
911
// accept only the two constant shapes that carry one value per output channel
bool broadcasting_type_ok = false;
912
if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
913
broadcasting_type_ok = true;
914
if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
915
broadcasting_type_ok = true;
917
if (!broadcasting_type_ok)
923
fprintf(stderr, "fuse_deconvolution_add %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());
925
// view the constant as a flat vector of `channels` values
ncnn::Mat bias_data = memorydata->data.reshape(channels);
927
// no bias yet: the constant becomes the bias
if (deconvolution->bias_term == 0)
930
deconvolution->bias_term = 1;
931
deconvolution->bias_data = bias_data;
935
// existing bias: add the constant element by element
// NOTE(review): the else separating this path from the one above is not visible here
float* bias = deconvolution->bias_data;
936
for (int i = 0; i < channels; i++)
938
bias[i] = bias[i] + bias_data[i];
943
// rewire: this layer now produces the blob the BinaryOp used to produce
int top_blob_index_final = binaryop->tops[0];
944
deconvolution->tops[0] = top_blob_index_final;
945
blobs[top_blob_index_final].producer = i;
946
binaryop->type = "ncnnfused";
952
// Folds a BatchNorm layer into the DeconvolutionDepthWise layer that feeds it.
// For each DeconvolutionDepthWise layer i, a BatchNorm layer j with exactly one
// bottom equal to the layer's first top blob is searched for. On a match the
// BatchNorm is expressed as a per-channel pair (a, b), the weights are scaled by
// b, the bias becomes bias * b + a, the layer takes over the BatchNorm's top blob,
// and the BatchNorm layer is retyped to "ncnnfused".
// NOTE(review): the initialisation of j and the continue/break statements of the
// search are not visible in this excerpt.
int NetOptimize::fuse_deconvolutiondepthwise_batchnorm()
954
const size_t layer_count = layers.size();
955
for (size_t i = 0; i < layer_count; i++)
957
if (layers[i]->type != "DeconvolutionDepthWise")
961
int top_blob_index = layers[i]->tops[0];
964
for (; j < layer_count; j++)
966
if (layers[j]->type != "BatchNorm")
969
if (layers[j]->bottoms.size() != 1)
972
if (layers[j]->bottoms[0] == top_blob_index)
976
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
980
ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
981
ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
983
fprintf(stderr, "fuse_deconvolutiondepthwise_batchnorm %s %s\n", deconvolutiondepthwise->name.c_str(), batchnorm->name.c_str());
986
int channels = batchnorm->channels;
987
float eps = batchnorm->eps;
993
// a = bias - slope * mean / sqrt(var + eps), b = slope / sqrt(var + eps)
std::vector<float> a(channels);
994
std::vector<float> b(channels);
995
for (int i = 0; i < channels; i++)
997
float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
998
a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
999
b[i] = batchnorm->slope_data[i] / sqrt_var;
1002
// no bias yet: create a zero-filled one so the shift a[] has somewhere to go
if (deconvolutiondepthwise->bias_term == 0)
1005
deconvolutiondepthwise->bias_term = 1;
1006
deconvolutiondepthwise->bias_data = ncnn::Mat(channels);
1007
deconvolutiondepthwise->bias_data.fill(0.f);
1010
// assumes weight_data_size is a multiple of channels — TODO confirm
const int weight_per_outch = deconvolutiondepthwise->weight_data_size / channels;
1012
float* weight = deconvolutiondepthwise->weight_data;
1013
float* bias = deconvolutiondepthwise->bias_data;
1014
for (int i = 0; i < channels; i++)
1016
float* conv_weight_outch = weight + weight_per_outch * i;
1017
for (int j = 0; j < weight_per_outch; j++)
1019
conv_weight_outch[j] *= b[i];
1022
bias[i] = bias[i] * b[i] + a[i];
1026
// rewire: this layer now produces the blob the BatchNorm used to produce
int top_blob_index_final = batchnorm->tops[0];
1027
deconvolutiondepthwise->tops[0] = top_blob_index_final;
1028
blobs[top_blob_index_final].producer = i;
1029
batchnorm->type = "ncnnfused";
1035
// Folds a BatchNorm layer into the InnerProduct layer that feeds it.
// For each InnerProduct layer i, a BatchNorm layer j with exactly one bottom equal
// to the layer's first top blob is searched for. On a match the BatchNorm is
// expressed as a per-channel pair (a, b), the weights are scaled by b, the bias
// becomes bias * b + a, the layer takes over the BatchNorm's top blob, and the
// BatchNorm layer is retyped to "ncnnfused".
// NOTE(review): the initialisation of j and the continue/break statements of the
// search are not visible in this excerpt.
int NetOptimize::fuse_innerproduct_batchnorm()
1037
const size_t layer_count = layers.size();
1038
for (size_t i = 0; i < layer_count; i++)
1040
if (layers[i]->type != "InnerProduct")
1044
int top_blob_index = layers[i]->tops[0];
1047
for (; j < layer_count; j++)
1049
if (layers[j]->type != "BatchNorm")
1052
if (layers[j]->bottoms.size() != 1)
1055
if (layers[j]->bottoms[0] == top_blob_index)
1059
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
1063
ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1064
ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
1066
fprintf(stderr, "fuse_innerproduct_batchnorm %s %s\n", innerproduct->name.c_str(), batchnorm->name.c_str());
1069
int channels = batchnorm->channels;
1070
float eps = batchnorm->eps;
1076
// a = bias - slope * mean / sqrt(var + eps), b = slope / sqrt(var + eps)
std::vector<float> a(channels);
1077
std::vector<float> b(channels);
1078
for (int i = 0; i < channels; i++)
1080
float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
1081
a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
1082
b[i] = batchnorm->slope_data[i] / sqrt_var;
1085
// no bias yet: create a zero-filled one so the shift a[] has somewhere to go
if (innerproduct->bias_term == 0)
1088
innerproduct->bias_term = 1;
1089
innerproduct->bias_data = ncnn::Mat(channels);
1090
innerproduct->bias_data.fill(0.f);
1093
// assumes weight_data_size is a multiple of channels — TODO confirm
const int weight_per_outch = innerproduct->weight_data_size / channels;
1095
float* weight = innerproduct->weight_data;
1096
float* bias = innerproduct->bias_data;
1097
for (int i = 0; i < channels; i++)
1099
float* conv_weight_outch = weight + weight_per_outch * i;
1100
for (int j = 0; j < weight_per_outch; j++)
1102
conv_weight_outch[j] *= b[i];
1105
bias[i] = bias[i] * b[i] + a[i];
1109
// rewire: this layer now produces the blob the BatchNorm used to produce
int top_blob_index_final = batchnorm->tops[0];
1110
innerproduct->tops[0] = top_blob_index_final;
1111
blobs[top_blob_index_final].producer = i;
1112
batchnorm->type = "ncnnfused";
1118
// Folds a BinaryOp that combines an InnerProduct output with a MemoryData
// constant into the layer's bias.
// The BinaryOp must have two bottoms, the first being the layer's top blob,
// op_type 0 and no scalar operand; the second bottom must come from a MemoryData
// layer whose shape is either (w == num_output, h == 0, c == 0) or
// (w == 1, h == 1, c == num_output).
// NOTE(review): op_type 0 is presumably addition — confirm against the BinaryOp
// operation enum. The loop over k and the continue/break/else statements are not
// visible in this excerpt.
int NetOptimize::fuse_innerproduct_add()
1120
const size_t layer_count = layers.size();
1121
for (size_t i = 0; i < layer_count; i++)
1123
if (layers[i]->type != "InnerProduct")
1127
int top_blob_index = layers[i]->tops[0];
1130
for (; j < layer_count; j++)
1132
if (layers[j]->type != "BinaryOp")
1135
if (layers[j]->bottoms.size() != 2)
1138
if (layers[j]->bottoms[0] == top_blob_index)
1142
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
1146
ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1147
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
1149
if (binaryop->op_type != 0 || binaryop->with_scalar)
1156
// locate the MemoryData layer whose first top is the BinaryOp's second bottom
if (layers[k]->type != "MemoryData")
1159
if (layers[k]->tops[0] == binaryop->bottoms[1])
1166
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
1168
int channels = innerproduct->num_output;
1170
// accept only the two constant shapes that carry one value per output channel
bool broadcasting_type_ok = false;
1171
if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
1172
broadcasting_type_ok = true;
1173
if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
1174
broadcasting_type_ok = true;
1176
if (!broadcasting_type_ok)
1182
fprintf(stderr, "fuse_innerproduct_add %s %s\n", innerproduct->name.c_str(), binaryop->name.c_str());
1184
// view the constant as a flat vector of `channels` values
ncnn::Mat bias_data = memorydata->data.reshape(channels);
1186
// no bias yet: the constant becomes the bias
if (innerproduct->bias_term == 0)
1189
innerproduct->bias_term = 1;
1190
innerproduct->bias_data = bias_data;
1194
// existing bias: add the constant element by element
// NOTE(review): the else separating this path from the one above is not visible here
float* bias = innerproduct->bias_data;
1195
for (int i = 0; i < channels; i++)
1197
bias[i] = bias[i] + bias_data[i];
1202
// rewire: this layer now produces the blob the BinaryOp used to produce
int top_blob_index_final = binaryop->tops[0];
1203
innerproduct->tops[0] = top_blob_index_final;
1204
blobs[top_blob_index_final].producer = i;
1205
binaryop->type = "ncnnfused";
1211
// Folds a Dropout layer into the InnerProduct layer that feeds it.
// For each InnerProduct layer i, a Dropout layer j with exactly one bottom equal
// to the layer's first top blob is searched for. On a match every weight is
// multiplied by the Dropout's scale, the bias is visited when bias_term is set,
// the InnerProduct takes over the Dropout's top blob, and the Dropout layer is
// retyped to "ncnnfused".
// NOTE(review): the initialisation of j, the continue/break statements, any guard
// on the scale value and the body of the bias loop are not visible in this excerpt.
int NetOptimize::fuse_innerproduct_dropout()
1213
const size_t layer_count = layers.size();
1214
for (size_t i = 0; i < layer_count; i++)
1216
if (layers[i]->type != "InnerProduct")
1220
int top_blob_index = layers[i]->tops[0];
1223
for (; j < layer_count; j++)
1225
if (layers[j]->type != "Dropout")
1228
if (layers[j]->bottoms.size() != 1)
1231
if (layers[j]->bottoms[0] == top_blob_index)
1235
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
1239
ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1240
ncnn::Dropout* dropout = (ncnn::Dropout*)layers[j];
1242
fprintf(stderr, "fuse_innerproduct_dropout %s %s\n", innerproduct->name.c_str(), dropout->name.c_str());
1244
float scale = dropout->scale;
1247
const int num_output = innerproduct->num_output;
1248
// assumes weight_data_size is a multiple of num_output — TODO confirm
const int weight_per_outch = innerproduct->weight_data_size / num_output;
1250
// multiply every weight of every output row by the dropout scale
float* weight = innerproduct->weight_data;
1251
for (int i = 0; i < num_output; i++)
1253
float* conv_weight_outch = weight + weight_per_outch * i;
1254
for (int j = 0; j < weight_per_outch; j++)
1256
conv_weight_outch[j] *= scale;
1260
if (innerproduct->bias_term)
1262
float* bias = innerproduct->bias_data;
1263
// presumably scales each bias entry as well; the loop body is not visible here
for (int i = 0; i < num_output; i++)
1270
// rewire: the InnerProduct now produces the blob the Dropout used to produce
int top_blob_index_final = dropout->tops[0];
1271
innerproduct->tops[0] = top_blob_index_final;
1272
blobs[top_blob_index_final].producer = i;
1273
dropout->type = "ncnnfused";
1279
// Folds an activation layer into the Convolution (first loop) or Convolution1D
// (second loop) layer that feeds it.
// The activation must have exactly one bottom equal to the convolution's first
// top blob. On a match the convolution's activation_type / activation_params are
// set as follows, the convolution takes over the activation's top blob, and the
// activation layer is retyped to "ncnnfused":
//   ReLU with slope == 0 -> 1
//   ReLU otherwise       -> 2, params = [slope]
//   Clip                 -> 3, params = [min, max]
//   Sigmoid              -> 4
//   Mish                 -> 5
//   HardSwish            -> 6, params = [alpha, beta]  (Convolution loop only)
// NOTE(review): the initialisation of j and the continue/break/else statements are
// not visible in this excerpt.
int NetOptimize::fuse_convolution_activation()
1281
const size_t layer_count = layers.size();
1282
// pass 1: Convolution followed by ReLU / Clip / Sigmoid / Mish / HardSwish
for (size_t i = 0; i < layer_count; i++)
1284
if (layers[i]->type != "Convolution")
1288
int top_blob_index = layers[i]->tops[0];
1291
for (; j < layer_count; j++)
1293
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
1296
if (layers[j]->bottoms.size() != 1)
1299
if (layers[j]->bottoms[0] == top_blob_index)
1303
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
1307
ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
1308
ncnn::Layer* activation = layers[j];
1310
fprintf(stderr, "fuse_convolution_activation %s %s\n", convolution->name.c_str(), activation->name.c_str());
1312
if (activation->type == "ReLU")
1314
ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1316
if (relu->slope == 0.f)
1318
convolution->activation_type = 1;
1322
// non-zero slope: record it as the single activation parameter
convolution->activation_type = 2;
1323
convolution->activation_params = ncnn::Mat(1);
1324
convolution->activation_params[0] = relu->slope;
1327
else if (activation->type == "Clip")
1329
ncnn::Clip* clip = (ncnn::Clip*)activation;
1331
convolution->activation_type = 3;
1332
convolution->activation_params = ncnn::Mat(2);
1333
convolution->activation_params[0] = clip->min;
1334
convolution->activation_params[1] = clip->max;
1336
else if (activation->type == "Sigmoid")
1338
convolution->activation_type = 4;
1340
else if (activation->type == "Mish")
1342
convolution->activation_type = 5;
1344
else if (activation->type == "HardSwish")
1346
ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;
1348
convolution->activation_type = 6;
1349
convolution->activation_params = ncnn::Mat(2);
1350
convolution->activation_params[0] = hardswish->alpha;
1351
convolution->activation_params[1] = hardswish->beta;
1354
// rewire: the convolution now produces the blob the activation used to produce
int top_blob_index_final = activation->tops[0];
1355
convolution->tops[0] = top_blob_index_final;
1356
blobs[top_blob_index_final].producer = i;
1357
activation->type = "ncnnfused";
1360
// pass 2: Convolution1D followed by ReLU / Clip / Sigmoid / Mish (no HardSwish here)
for (size_t i = 0; i < layer_count; i++)
1362
if (layers[i]->type != "Convolution1D")
1366
int top_blob_index = layers[i]->tops[0];
1369
for (; j < layer_count; j++)
1371
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish")
1374
if (layers[j]->bottoms.size() != 1)
1377
if (layers[j]->bottoms[0] == top_blob_index)
1381
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
1385
ncnn::Convolution1D* convolution = (ncnn::Convolution1D*)layers[i];
1386
ncnn::Layer* activation = layers[j];
1388
fprintf(stderr, "fuse_convolution1d_activation %s %s\n", convolution->name.c_str(), activation->name.c_str());
1390
if (activation->type == "ReLU")
1392
ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1394
if (relu->slope == 0.f)
1396
convolution->activation_type = 1;
1400
// non-zero slope: record it as the single activation parameter
convolution->activation_type = 2;
1401
convolution->activation_params = ncnn::Mat(1);
1402
convolution->activation_params[0] = relu->slope;
1405
else if (activation->type == "Clip")
1407
ncnn::Clip* clip = (ncnn::Clip*)activation;
1409
convolution->activation_type = 3;
1410
convolution->activation_params = ncnn::Mat(2);
1411
convolution->activation_params[0] = clip->min;
1412
convolution->activation_params[1] = clip->max;
1414
else if (activation->type == "Sigmoid")
1416
convolution->activation_type = 4;
1418
else if (activation->type == "Mish")
1420
convolution->activation_type = 5;
1423
// rewire: the convolution now produces the blob the activation used to produce
int top_blob_index_final = activation->tops[0];
1424
convolution->tops[0] = top_blob_index_final;
1425
blobs[top_blob_index_final].producer = i;
1426
activation->type = "ncnnfused";
1432
// Folds an activation layer into the ConvolutionDepthWise layer that feeds it.
// The activation must have exactly one bottom equal to the layer's first top
// blob. On a match activation_type / activation_params are set as follows, the
// layer takes over the activation's top blob, and the activation layer is
// retyped to "ncnnfused":
//   ReLU with slope == 0 -> 1
//   ReLU otherwise       -> 2, params = [slope]
//   Clip                 -> 3, params = [min, max]
//   Sigmoid              -> 4
//   Mish                 -> 5
//   HardSwish            -> 6, params = [alpha, beta]
// NOTE(review): the initialisation of j and the continue/break/else statements are
// not visible in this excerpt.
int NetOptimize::fuse_convolutiondepthwise_activation()
1434
const size_t layer_count = layers.size();
1435
for (size_t i = 0; i < layer_count; i++)
1437
if (layers[i]->type != "ConvolutionDepthWise")
1441
int top_blob_index = layers[i]->tops[0];
1444
for (; j < layer_count; j++)
1446
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
1449
if (layers[j]->bottoms.size() != 1)
1452
if (layers[j]->bottoms[0] == top_blob_index)
1456
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
1460
ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
1461
ncnn::Layer* activation = layers[j];
1463
fprintf(stderr, "fuse_convolutiondepthwise_activation %s %s\n", convolutiondepthwise->name.c_str(), activation->name.c_str());
1465
if (activation->type == "ReLU")
1467
ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1469
if (relu->slope == 0.f)
1471
convolutiondepthwise->activation_type = 1;
1475
// non-zero slope: record it as the single activation parameter
convolutiondepthwise->activation_type = 2;
1476
convolutiondepthwise->activation_params = ncnn::Mat(1);
1477
convolutiondepthwise->activation_params[0] = relu->slope;
1480
else if (activation->type == "Clip")
1482
ncnn::Clip* clip = (ncnn::Clip*)activation;
1484
convolutiondepthwise->activation_type = 3;
1485
convolutiondepthwise->activation_params = ncnn::Mat(2);
1486
convolutiondepthwise->activation_params[0] = clip->min;
1487
convolutiondepthwise->activation_params[1] = clip->max;
1489
else if (activation->type == "Sigmoid")
1491
convolutiondepthwise->activation_type = 4;
1493
else if (activation->type == "Mish")
1495
convolutiondepthwise->activation_type = 5;
1497
else if (activation->type == "HardSwish")
1499
ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;
1501
convolutiondepthwise->activation_type = 6;
1502
convolutiondepthwise->activation_params = ncnn::Mat(2);
1503
convolutiondepthwise->activation_params[0] = hardswish->alpha;
1504
convolutiondepthwise->activation_params[1] = hardswish->beta;
1507
// rewire: this layer now produces the blob the activation used to produce
int top_blob_index_final = activation->tops[0];
1508
convolutiondepthwise->tops[0] = top_blob_index_final;
1509
blobs[top_blob_index_final].producer = i;
1510
activation->type = "ncnnfused";
1516
// Folds an activation layer into the Deconvolution layer that feeds it.
// Only ReLU, Clip and Sigmoid are considered. The activation must have exactly
// one bottom equal to the layer's first top blob. On a match activation_type /
// activation_params are set as follows, the layer takes over the activation's top
// blob, and the activation layer is retyped to "ncnnfused":
//   ReLU with slope == 0 -> 1
//   ReLU otherwise       -> 2, params = [slope]
//   Clip                 -> 3, params = [min, max]
//   Sigmoid              -> 4
// NOTE(review): the initialisation of j and the continue/break/else statements are
// not visible in this excerpt.
int NetOptimize::fuse_deconvolution_activation()
1518
const size_t layer_count = layers.size();
1519
for (size_t i = 0; i < layer_count; i++)
1521
if (layers[i]->type != "Deconvolution")
1525
int top_blob_index = layers[i]->tops[0];
1528
for (; j < layer_count; j++)
1530
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
1533
if (layers[j]->bottoms.size() != 1)
1536
if (layers[j]->bottoms[0] == top_blob_index)
1540
// j reaching layer_count means the search loop ran to its end without a match
if (j == layer_count)
1544
ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
1545
ncnn::Layer* activation = layers[j];
1547
fprintf(stderr, "fuse_deconvolution_activation %s %s\n", deconvolution->name.c_str(), activation->name.c_str());
1549
if (activation->type == "ReLU")
1551
ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1553
if (relu->slope == 0.f)
1555
deconvolution->activation_type = 1;
1559
// non-zero slope: record it as the single activation parameter
deconvolution->activation_type = 2;
1560
deconvolution->activation_params = ncnn::Mat(1);
1561
deconvolution->activation_params[0] = relu->slope;
1564
else if (activation->type == "Clip")
1566
ncnn::Clip* clip = (ncnn::Clip*)activation;
1568
deconvolution->activation_type = 3;
1569
deconvolution->activation_params = ncnn::Mat(2);
1570
deconvolution->activation_params[0] = clip->min;
1571
deconvolution->activation_params[1] = clip->max;
1573
else if (activation->type == "Sigmoid")
1575
deconvolution->activation_type = 4;
1578
// rewire: this layer now produces the blob the activation used to produce
int top_blob_index_final = activation->tops[0];
1579
deconvolution->tops[0] = top_blob_index_final;
1580
blobs[top_blob_index_final].producer = i;
1581
activation->type = "ncnnfused";
1587
// Folds an activation layer into the DeconvolutionDepthWise layer that feeds
// it. A ReLU / Clip / Sigmoid with exactly one bottom that reads the layer's
// first top blob is encoded as activation_type / activation_params. The
// DeconvolutionDepthWise then takes over the activation's top blob and the
// activation layer is marked "ncnnfused".
int NetOptimize::fuse_deconvolutiondepthwise_activation()
1589
const size_t layer_count = layers.size();
1590
for (size_t i = 0; i < layer_count; i++)
1592
if (layers[i]->type != "DeconvolutionDepthWise")
1596
// DeconvolutionDepthWise - Activation: the blob this layer produces.
int top_blob_index = layers[i]->tops[0];
1599
// Scan for a supported single-input activation that consumes that blob.
for (; j < layer_count; j++)
1601
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
1604
if (layers[j]->bottoms.size() != 1)
1607
if (layers[j]->bottoms[0] == top_blob_index)
1611
// j reaching layer_count presumably means the scan found no such layer.
if (j == layer_count)
1615
ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
1616
ncnn::Layer* activation = layers[j];
1618
fprintf(stderr, "fuse_deconvolutiondepthwise_activation %s %s\n", deconvolutiondepthwise->name.c_str(), activation->name.c_str());
1620
// ReLU: activation_type 1 is tied to the slope == 0 test; activation_type 2
// stores the slope as its single parameter.
if (activation->type == "ReLU")
1622
ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1624
if (relu->slope == 0.f)
1626
deconvolutiondepthwise->activation_type = 1;
1630
deconvolutiondepthwise->activation_type = 2;
1631
deconvolutiondepthwise->activation_params = ncnn::Mat(1);
1632
deconvolutiondepthwise->activation_params[0] = relu->slope;
1635
// Clip: activation_type 3 with parameters [min, max].
else if (activation->type == "Clip")
1637
ncnn::Clip* clip = (ncnn::Clip*)activation;
1639
deconvolutiondepthwise->activation_type = 3;
1640
deconvolutiondepthwise->activation_params = ncnn::Mat(2);
1641
deconvolutiondepthwise->activation_params[0] = clip->min;
1642
deconvolutiondepthwise->activation_params[1] = clip->max;
1644
// Sigmoid: activation_type 4, no parameters.
else if (activation->type == "Sigmoid")
1646
deconvolutiondepthwise->activation_type = 4;
1649
// Rewire: this layer now produces the activation's output blob.
int top_blob_index_final = activation->tops[0];
1650
deconvolutiondepthwise->tops[0] = top_blob_index_final;
1651
blobs[top_blob_index_final].producer = i;
1652
activation->type = "ncnnfused";
1658
// Folds an activation layer into the InnerProduct layer that feeds it.
// Supported activations are ReLU, Clip, Sigmoid, Mish and HardSwish, each
// with exactly one bottom reading the InnerProduct's first top blob. The
// activation is encoded as activation_type / activation_params, the
// InnerProduct takes over the activation's top blob, and the activation layer
// is marked "ncnnfused".
int NetOptimize::fuse_innerproduct_activation()
1660
const size_t layer_count = layers.size();
1661
for (size_t i = 0; i < layer_count; i++)
1663
if (layers[i]->type != "InnerProduct")
1667
// InnerProduct - Activation: the blob this layer produces.
int top_blob_index = layers[i]->tops[0];
1670
// Scan for a supported single-input activation that consumes that blob.
for (; j < layer_count; j++)
1672
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
1675
if (layers[j]->bottoms.size() != 1)
1678
if (layers[j]->bottoms[0] == top_blob_index)
1682
// j reaching layer_count presumably means the scan found no such layer.
if (j == layer_count)
1686
ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1687
ncnn::Layer* activation = layers[j];
1689
fprintf(stderr, "fuse_innerproduct_activation %s %s\n", innerproduct->name.c_str(), activation->name.c_str());
1691
// ReLU: activation_type 1 is tied to the slope == 0 test; activation_type 2
// stores the slope as its single parameter.
if (activation->type == "ReLU")
1693
ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1695
if (relu->slope == 0.f)
1697
innerproduct->activation_type = 1;
1701
innerproduct->activation_type = 2;
1702
innerproduct->activation_params = ncnn::Mat(1);
1703
innerproduct->activation_params[0] = relu->slope;
1706
// Clip: activation_type 3 with parameters [min, max].
else if (activation->type == "Clip")
1708
ncnn::Clip* clip = (ncnn::Clip*)activation;
1710
innerproduct->activation_type = 3;
1711
innerproduct->activation_params = ncnn::Mat(2);
1712
innerproduct->activation_params[0] = clip->min;
1713
innerproduct->activation_params[1] = clip->max;
1715
// Sigmoid: activation_type 4, no parameters.
else if (activation->type == "Sigmoid")
1717
innerproduct->activation_type = 4;
1719
// Mish: activation_type 5, no parameters.
else if (activation->type == "Mish")
1721
innerproduct->activation_type = 5;
1723
// HardSwish: activation_type 6 with parameters [alpha, beta].
else if (activation->type == "HardSwish")
1725
ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;
1727
innerproduct->activation_type = 6;
1728
innerproduct->activation_params = ncnn::Mat(2);
1729
innerproduct->activation_params[0] = hardswish->alpha;
1730
innerproduct->activation_params[1] = hardswish->beta;
1733
// Rewire: the InnerProduct now produces the activation's output blob.
int top_blob_index_final = activation->tops[0];
1734
innerproduct->tops[0] = top_blob_index_final;
1735
blobs[top_blob_index_final].producer = i;
1736
activation->type = "ncnnfused";
1742
// Folds a MemoryData operand of a two-input BinaryOp into the BinaryOp's
// built-in scalar operand (with_scalar = 1, b = data[0]).
// The first loop handles MemoryData feeding the BinaryOp directly; the second
// loop handles MemoryData -> Split -> BinaryOp. In both cases the MemoryData
// bottom is erased from the BinaryOp.
int NetOptimize::fuse_memorydata_binaryop()
1744
const size_t layer_count = layers.size();
1745
for (size_t i = 0; i < layer_count; i++)
1747
if (layers[i]->type != "MemoryData")
1751
// MemoryData - BinaryOp: the blob this MemoryData produces.
int top_blob_index = layers[i]->tops[0];
1754
// Scan for a two-input BinaryOp that reads that blob on either side.
for (; j < layer_count; j++)
1756
if (layers[j]->type != "BinaryOp")
1759
if (layers[j]->bottoms.size() != 2)
1762
if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
1766
if (j == layer_count)
1770
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
1771
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
1773
// Shape gate: w == 1 with h == 0 and c == 0 (presumably a single scalar
// value - TODO confirm against MemoryData's shape convention).
if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
1779
// Which BinaryOp bottom holds the MemoryData blob; 1 unless found at 0 below.
int memorydata_index = 1;
1781
// MemoryData is the left operand: commutative ops keep their type, while
// SUB / DIV are switched to RSUB / RDIV before the operand is dropped.
if (binaryop->bottoms[0] == top_blob_index)
1783
int op_type = binaryop->op_type;
1785
if (op_type == ncnn::BinaryOp::Operation_ADD
1786
|| op_type == ncnn::BinaryOp::Operation_MUL
1787
|| op_type == ncnn::BinaryOp::Operation_MAX
1788
|| op_type == ncnn::BinaryOp::Operation_MIN)
1790
memorydata_index = 0;
1792
else if (op_type == ncnn::BinaryOp::Operation_SUB)
1794
binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
1795
memorydata_index = 0;
1797
else if (op_type == ncnn::BinaryOp::Operation_DIV)
1799
binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
1800
memorydata_index = 0;
1809
float scalar = memorydata->data[0];
1811
binaryop->with_scalar = 1;
1812
binaryop->b = scalar;
1814
fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());
1816
// Drop the MemoryData input from the BinaryOp and retire the MemoryData.
binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
1817
memorydata->type = "ncnnfused";
1820
// Second pass: MemoryData - Split - BinaryOp.
for (size_t i = 0; i < layer_count; i++)
1822
if (layers[i]->type != "MemoryData")
1826
int top_blob_index = layers[i]->tops[0];
1829
// Find the single-input Split (index j0) that reads the MemoryData blob.
for (; j0 < layer_count; j0++)
1831
if (layers[j0]->type != "Split")
1834
if (layers[j0]->bottoms.size() != 1)
1837
if (layers[j0]->bottoms[0] == top_blob_index)
1841
if (j0 == layer_count)
1844
// Index k of the Split top consumed by the BinaryOp; -1 until one is found.
int split_top_blob_index = -1;
1847
// Find a two-input BinaryOp (index j1) that reads any of the Split's tops.
for (; j1 < layer_count; j1++)
1849
if (layers[j1]->type != "BinaryOp")
1852
if (layers[j1]->bottoms.size() != 2)
1855
for (int k = 0; k < (int)layers[j0]->tops.size(); k++)
1857
if (layers[j1]->bottoms[0] == layers[j0]->tops[k] || layers[j1]->bottoms[1] == layers[j0]->tops[k])
1859
split_top_blob_index = k;
1864
if (split_top_blob_index != -1)
1868
if (j1 == layer_count)
1872
ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
1873
ncnn::Split* split = (ncnn::Split*)layers[j0];
1874
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j1];
1876
// Same shape gate as in the first pass.
if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
1882
int memorydata_index = 1;
1884
// Same left-operand handling as in the first pass, keyed on the Split top.
if (binaryop->bottoms[0] == split->tops[split_top_blob_index])
1886
int op_type = binaryop->op_type;
1888
if (op_type == ncnn::BinaryOp::Operation_ADD
1889
|| op_type == ncnn::BinaryOp::Operation_MUL
1890
|| op_type == ncnn::BinaryOp::Operation_MAX
1891
|| op_type == ncnn::BinaryOp::Operation_MIN)
1893
memorydata_index = 0;
1895
else if (op_type == ncnn::BinaryOp::Operation_SUB)
1897
binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
1898
memorydata_index = 0;
1900
else if (op_type == ncnn::BinaryOp::Operation_DIV)
1902
binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
1903
memorydata_index = 0;
1912
float scalar = memorydata->data[0];
1914
binaryop->with_scalar = 1;
1915
binaryop->b = scalar;
1917
fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());
1919
// Detach the BinaryOp from the Split: remove the operand and the Split top.
binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
1920
split->tops.erase(split->tops.begin() + split_top_blob_index);
1921
// Once the Split has no tops left, the "ncnnfused" markings below follow.
if (split->tops.empty())
1923
split->type = "ncnnfused";
1924
memorydata->type = "ncnnfused";
1933
// Rewrites "scalar MUL, then ADD" BinaryOp patterns as one Eltwise SUM layer.
// For a two-input ADD BinaryOp, earlier single-input MUL BinaryOps producing
// either input are looked up. Their scalar b becomes the matching Eltwise
// coefficient (1.f for an input with no such producer), the Eltwise reads the
// MUL's own input instead, and the MUL layers are marked "ncnnfused".
int NetOptimize::fuse_binaryop_eltwise()
1935
const size_t layer_count = layers.size();
1936
for (size_t i = 0; i < layer_count; i++)
1938
if (layers[i]->type != "BinaryOp")
1941
if (layers[i]->bottoms.size() != 2)
1944
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[i];
1946
if (binaryop->op_type != ncnn::BinaryOp::Operation_ADD)
1949
if (binaryop->with_scalar)
1953
// BinaryOp - BinaryOp - BinaryOp: the two blobs summed by this layer.
int bottom_blob_index_0 = binaryop->bottoms[0];
1954
int bottom_blob_index_1 = binaryop->bottoms[1];
1957
// Look among earlier layers for a single-input MUL producing input 0.
for (; j0 < i; j0++)
1959
if (layers[j0]->type != "BinaryOp")
1962
if (layers[j0]->bottoms.size() != 1)
1965
if (((ncnn::BinaryOp*)layers[j0])->op_type != ncnn::BinaryOp::Operation_MUL)
1968
if (layers[j0]->tops[0] == bottom_blob_index_0)
1973
// Same lookup for input 1.
for (; j1 < i; j1++)
1975
if (layers[j1]->type != "BinaryOp")
1978
if (layers[j1]->bottoms.size() != 1)
1981
if (((ncnn::BinaryOp*)layers[j1])->op_type != ncnn::BinaryOp::Operation_MUL)
1984
if (layers[j1]->tops[0] == bottom_blob_index_1)
1988
// An index equal to i presumably means no MUL producer was found for
// that input; here neither input has one.
if (j0 == i && j1 == i)
1991
ncnn::BinaryOp* binaryop0 = (ncnn::BinaryOp*)layers[j0];
1992
ncnn::BinaryOp* binaryop1 = (ncnn::BinaryOp*)layers[j1];
1994
fprintf(stderr, "fuse_binaryop_eltwise %s %s %s\n", binaryop0->name.c_str(), binaryop1->name.c_str(), binaryop->name.c_str());
1996
ncnn::Eltwise* eltwise = (ncnn::Eltwise*)ncnn::create_layer_cpu("Eltwise");
1998
// The Eltwise inherits the ADD layer's identity and blob wiring.
eltwise->type = "Eltwise";
1999
eltwise->name = binaryop->name;
2000
eltwise->bottoms = binaryop->bottoms;
2001
eltwise->tops = binaryop->tops;
2004
eltwise->load_param(pd);
2006
eltwise->op_type = ncnn::Eltwise::Operation_SUM;
2008
eltwise->coeffs = ncnn::Mat(2);
2010
// Both inputs come from a MUL: fuse both.
if (j0 != i && j1 != i)
2013
eltwise->coeffs[0] = binaryop0->b;
2014
eltwise->coeffs[1] = binaryop1->b;
2016
eltwise->bottoms[0] = binaryop0->bottoms[0];
2017
eltwise->bottoms[1] = binaryop1->bottoms[0];
2019
binaryop0->type = "ncnnfused";
2020
binaryop1->type = "ncnnfused";
2022
// Only input 0 comes from a MUL: input 1 keeps coefficient 1.
if (j0 != i && j1 == i)
2025
eltwise->coeffs[0] = binaryop0->b;
2026
eltwise->coeffs[1] = 1.f;
2028
eltwise->bottoms[0] = binaryop0->bottoms[0];
2030
binaryop0->type = "ncnnfused";
2032
// Only input 1 comes from a MUL: input 0 keeps coefficient 1.
if (j0 == i && j1 != i)
2035
eltwise->coeffs[0] = 1.f;
2036
eltwise->coeffs[1] = binaryop1->b;
2038
eltwise->bottoms[1] = binaryop1->bottoms[0];
2040
binaryop1->type = "ncnnfused";
2043
// The Eltwise takes the ADD layer's slot in the layer list.
layers[i] = eltwise;
2050
// Bypasses Dropout layers: the layer producing the Dropout's input blob takes
// over the Dropout's top blob, and the Dropout is marked "ncnnfused".
// The scale != 1.f test gates which Dropout layers are handled.
int NetOptimize::eliminate_dropout()
2052
const size_t layer_count = layers.size();
2053
for (size_t i = 0; i < layer_count; i++)
2055
if (layers[i]->type != "Dropout")
2058
ncnn::Dropout* dropout = (ncnn::Dropout*)layers[i];
2059
if (dropout->scale != 1.f)
2063
// Any - Dropout: the blob this Dropout reads.
int bottom_blob_index = layers[i]->bottoms[0];
2068
// Producer lookup: a layer that is not already fused, has exactly one top,
// and whose top is the Dropout's input.
if (layers[j]->type == "ncnnfused")
2071
if (layers[j]->tops.size() != 1)
2074
if (layers[j]->tops[0] == bottom_blob_index)
2081
ncnn::Layer* any = layers[j];
2083
fprintf(stderr, "eliminate_dropout %s %s\n", any->name.c_str(), dropout->name.c_str());
2085
// Rewire: the producer now writes directly to the Dropout's output blob.
int top_blob_index_final = dropout->tops[0];
2086
any->tops[0] = top_blob_index_final;
2087
blobs[top_blob_index_final].producer = j;
2088
dropout->type = "ncnnfused";
2094
// Bypasses Pooling layers that pass the padding, kernel/stride and
// global_pooling tests below (zero padding, 1x1 kernel, stride 1, not global).
// The layer producing the Pooling's input blob takes over the Pooling's top
// blob and the Pooling is marked "ncnnfused".
int NetOptimize::eliminate_pooling1x1()
2096
const size_t layer_count = layers.size();
2097
for (size_t i = 0; i < layer_count; i++)
2099
if (layers[i]->type != "Pooling")
2102
ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2103
if (pooling->pad_left != 0 || pooling->pad_right != 0 || pooling->pad_top != 0 || pooling->pad_bottom != 0)
2106
if (pooling->kernel_w != 1 || pooling->kernel_h != 1 || pooling->stride_w != 1 || pooling->stride_h != 1)
2109
if (pooling->global_pooling != 0)
2113
// Any - Pooling: the blob this Pooling reads.
int bottom_blob_index = layers[i]->bottoms[0];
2119
// Producer lookup: a non-fused layer with any top equal to that blob.
if (layers[j]->type == "ncnnfused")
2122
for (size_t k = 0; k < layers[j]->tops.size(); k++)
2124
if (layers[j]->tops[k] == bottom_blob_index)
2138
ncnn::Layer* any = layers[j];
2140
fprintf(stderr, "eliminate_pooling1x1 %s %s\n", any->name.c_str(), pooling->name.c_str());
2142
// Rewire the producer's matching top (top_i, presumably the k found above)
// to the Pooling's output blob.
int top_blob_index_final = pooling->tops[0];
2143
any->tops[top_i] = top_blob_index_final;
2144
blobs[top_blob_index_final].producer = j;
2145
pooling->type = "ncnnfused";
2151
// Removes Noop layers.
// A Noop without bottoms has the producer of each of its top blobs reset to
// -1 and is marked "ncnnfused". Otherwise the layer producing the Noop's
// first bottom is rewired to write the Noop's first top blob, and the Noop is
// marked "ncnnfused".
int NetOptimize::eliminate_noop()
2153
const size_t layer_count = layers.size();
2154
for (size_t i = 0; i < layer_count; i++)
2156
if (layers[i]->type != "Noop")
2159
ncnn::Layer* noop = layers[i];
2161
// Case 1: Noop with no inputs.
if (noop->bottoms.empty())
2164
fprintf(stderr, "eliminate_noop %s\n", noop->name.c_str());
2166
size_t top_blob_count = noop->tops.size();
2167
for (size_t j = 0; j < top_blob_count; j++)
2169
int top_blob_index_final = noop->tops[j];
2170
blobs[top_blob_index_final].producer = -1;
2172
noop->type = "ncnnfused";
2178
// Case 2: Any - Noop. The blob this Noop reads.
int bottom_blob_index = noop->bottoms[0];
2184
// Producer lookup: a non-fused layer with any top equal to that blob.
if (layers[j]->type == "ncnnfused")
2187
bool link_noop = false;
2188
size_t top_blob_count = layers[j]->tops.size();
2189
for (size_t k = 0; k < top_blob_count; k++)
2191
if (layers[j]->tops[k] == bottom_blob_index)
2203
// -1 in either index presumably means no producer / matching top was found.
if (j == -1 || any_k == -1)
2206
ncnn::Layer* any = layers[j];
2208
fprintf(stderr, "eliminate_noop %s %s\n", any->name.c_str(), noop->name.c_str());
2210
// Rewire the producer's matching top to the Noop's first output blob.
int top_blob_index_final = noop->tops[0];
2211
any->tops[any_k] = top_blob_index_final;
2212
blobs[top_blob_index_final].producer = j;
2214
noop->type = "ncnnfused";
2220
// Bypasses Split layers subject to the real_split_output_count > 1 test
// (a count of top blobs whose consumer is not -1). The layer producing the
// Split's input is rewired to write the Split's consumed top blob directly
// and the Split is marked "ncnnfused".
int NetOptimize::eliminate_split()
2222
const size_t layer_count = layers.size();
2223
for (size_t i = 0; i < layer_count; i++)
2225
if (layers[i]->type != "Split")
2228
ncnn::Layer* split = layers[i];
2230
// Count tops with a consumer and remember the index of the last one seen.
int real_split_output_count = 0;
2231
int real_split_top_blob_index = -1;
2232
size_t top_blob_count = split->tops.size();
2233
for (size_t j = 0; j < top_blob_count; j++)
2235
int top_blob_index_final = split->tops[j];
2236
if (blobs[top_blob_index_final].consumer != -1)
2238
real_split_output_count += 1;
2239
real_split_top_blob_index = j;
2243
if (real_split_output_count > 1)
2247
// Any - Split: the blob this Split reads.
int bottom_blob_index = split->bottoms[0];
2253
// Producer lookup: a non-fused layer with any top equal to that blob.
if (layers[j]->type == "ncnnfused")
2256
for (size_t k = 0; k < layers[j]->tops.size(); k++)
2258
if (layers[j]->tops[k] == bottom_blob_index)
2272
ncnn::Layer* any = layers[j];
2274
fprintf(stderr, "eliminate_split %s %s\n", any->name.c_str(), split->name.c_str());
2276
// NOTE(review): if no top blob has a consumer, real_split_top_blob_index is
// still -1 here and is used as an index into split->tops - confirm that a
// guard exists for the zero-consumer case.
int top_blob_index_final = split->tops[real_split_top_blob_index];
2277
any->tops[top_i] = top_blob_index_final;
2278
blobs[top_blob_index_final].producer = j;
2279
split->type = "ncnnfused";
2285
// Marks MemoryData layers "ncnnfused" based on a scan of later, non-fused
// layers for any bottom that reads the MemoryData's first top blob
// (presumably only when no such reader exists - the branch statements are
// not visible here).
int NetOptimize::eliminate_orphaned_memorydata()
2287
const size_t layer_count = layers.size();
2288
for (size_t i = 0; i < layer_count; i++)
2290
if (layers[i]->type != "MemoryData")
2294
// MemoryData - X: the blob this MemoryData produces.
int top_blob_index = layers[i]->tops[0];
2297
for (; j < layer_count; j++)
2299
// Layers already marked fused are tested separately from live ones.
if (layers[j]->type == "ncnnfused")
2302
// Check every input of layer j against the MemoryData blob.
bool orphaned = true;
2303
for (size_t k = 0; k < layers[j]->bottoms.size(); k++)
2305
if (layers[j]->bottoms[k] == top_blob_index)
2316
// j < layer_count: the scan stopped before the end of the layer list.
if (j < layer_count)
2320
fprintf(stderr, "eliminate_orphaned_memorydata %s\n", layers[i]->name.c_str());
2322
layers[i]->type = "ncnnfused";
2328
// Bypasses a single-input Reshape that consumes the first top blob of a
// Pooling layer, gated on the global_pooling test and on the Reshape's
// h / c / permute test below. The Pooling takes over the Reshape's top blob
// and the Reshape is marked "ncnnfused".
int NetOptimize::eliminate_reshape_after_global_pooling()
2330
const size_t layer_count = layers.size();
2331
for (size_t i = 0; i < layer_count; i++)
2333
if (layers[i]->type != "Pooling")
2336
ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2337
if (pooling->global_pooling == 0)
2341
// Pooling - Reshape: the blob this Pooling produces.
int top_blob_index = layers[i]->tops[0];
2344
// Scan for a single-input Reshape that consumes that blob.
for (; j < layer_count; j++)
2346
if (layers[j]->type != "Reshape")
2349
if (layers[j]->bottoms.size() != 1)
2352
if (layers[j]->bottoms[0] == top_blob_index)
2356
if (j == layer_count)
2359
ncnn::Reshape* reshape = (ncnn::Reshape*)layers[j];
2360
// -233 is presumably the "dimension not set" sentinel - TODO confirm.
if (reshape->h != -233 || reshape->c != -233 || reshape->permute != 0)
2363
fprintf(stderr, "eliminate_reshape_after_global_pooling %s %s\n", pooling->name.c_str(), reshape->name.c_str());
2365
// Rewire: the Pooling now produces the Reshape's output blob.
int top_blob_index_final = reshape->tops[0];
2366
pooling->tops[0] = top_blob_index_final;
2367
blobs[top_blob_index_final].producer = i;
2368
reshape->type = "ncnnfused";
2374
// Bypasses a single-input Flatten that consumes the first top blob of a
// Pooling layer, gated on the global_pooling test below. The Pooling takes
// over the Flatten's top blob and the Flatten is marked "ncnnfused".
int NetOptimize::eliminate_flatten_after_global_pooling()
2376
const size_t layer_count = layers.size();
2377
for (size_t i = 0; i < layer_count; i++)
2379
if (layers[i]->type != "Pooling")
2382
ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2383
if (pooling->global_pooling == 0)
2387
// Pooling - Flatten: the blob this Pooling produces.
int top_blob_index = layers[i]->tops[0];
2390
// Scan for a single-input Flatten that consumes that blob.
for (; j < layer_count; j++)
2392
if (layers[j]->type != "Flatten")
2395
if (layers[j]->bottoms.size() != 1)
2398
if (layers[j]->bottoms[0] == top_blob_index)
2402
if (j == layer_count)
2405
ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];
2407
fprintf(stderr, "eliminate_flatten_after_global_pooling %s %s\n", pooling->name.c_str(), flatten->name.c_str());
2409
// Rewire: the Pooling now produces the Flatten's output blob.
int top_blob_index_final = flatten->tops[0];
2410
pooling->tops[0] = top_blob_index_final;
2411
blobs[top_blob_index_final].producer = i;
2412
flatten->type = "ncnnfused";
2418
// Bypasses a single-input Flatten that consumes the first top blob of an
// InnerProduct layer. The InnerProduct takes over the Flatten's top blob and
// the Flatten is marked "ncnnfused".
int NetOptimize::eliminate_flatten_after_innerproduct()
2420
const size_t layer_count = layers.size();
2421
for (size_t i = 0; i < layer_count; i++)
2423
if (layers[i]->type != "InnerProduct")
2427
// InnerProduct - Flatten: the blob this InnerProduct produces.
int top_blob_index = layers[i]->tops[0];
2430
// Scan for a single-input Flatten that consumes that blob.
for (; j < layer_count; j++)
2432
if (layers[j]->type != "Flatten")
2435
if (layers[j]->bottoms.size() != 1)
2438
if (layers[j]->bottoms[0] == top_blob_index)
2442
if (j == layer_count)
2445
ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
2446
ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];
2448
fprintf(stderr, "eliminate_flatten_after_innerproduct %s %s\n", innerproduct->name.c_str(), flatten->name.c_str());
2450
// Rewire: the InnerProduct now produces the Flatten's output blob.
int top_blob_index_final = flatten->tops[0];
2451
innerproduct->tops[0] = top_blob_index_final;
2452
blobs[top_blob_index_final].producer = i;
2453
flatten->type = "ncnnfused";
2459
// Bypasses a Reshape (gated on the w / h / permute test below) whose top blob
// feeds a two-input BinaryOp. Every BinaryOp bottom that read the Reshape's
// output is pointed at the Reshape's input instead, and the Reshape is marked
// "ncnnfused".
int NetOptimize::eliminate_reshape_before_binaryop()
2461
const size_t layer_count = layers.size();
2462
for (size_t i = 0; i < layer_count; i++)
2464
if (layers[i]->type != "Reshape")
2467
ncnn::Reshape* reshape = (ncnn::Reshape*)layers[i];
2468
if (reshape->w != 1 || reshape->h != 1 || reshape->permute != 0)
2472
// Reshape - BinaryOp: the blob this Reshape produces.
int top_blob_index = layers[i]->tops[0];
2475
// Scan for a two-input BinaryOp that reads that blob on either side.
for (; j < layer_count; j++)
2477
if (layers[j]->type != "BinaryOp")
2480
if (layers[j]->bottoms.size() != 2)
2483
if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
2487
if (j == layer_count)
2490
ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
2492
fprintf(stderr, "eliminate_reshape_before_binaryop %s %s\n", reshape->name.c_str(), binaryop->name.c_str());
2494
// Rewire on the input side: the BinaryOp reads the Reshape's input directly.
// Both bottoms are checked, so a BinaryOp using the blob twice is covered.
int bottom_blob_index_final = reshape->bottoms[0];
2495
if (layers[j]->bottoms[0] == top_blob_index)
2496
binaryop->bottoms[0] = bottom_blob_index_final;
2497
if (layers[j]->bottoms[1] == top_blob_index)
2498
binaryop->bottoms[1] = bottom_blob_index_final;
2499
blobs[bottom_blob_index_final].consumer = j;
2500
reshape->type = "ncnnfused";
2506
// Replaces two chained Reduction layers with one Pooling layer configured
// with pooling_type = 1 and global_pooling = 1.
// Both Reductions are tested for operation == 3, reduce_all == 0,
// coeff == 1.f and a single axis (axes.w == 1). The first axis is tested
// against 2 and 3, the second against 2. The new Pooling takes the second
// Reduction's slot and reads the first Reduction's input; the first Reduction
// is marked "ncnnfused".
int NetOptimize::replace_reduction_with_global_pooling()
2508
const size_t layer_count = layers.size();
2509
for (size_t i = 0; i < layer_count; i++)
2511
if (layers[i]->type != "Reduction")
2514
ncnn::Reduction* reduction1 = (ncnn::Reduction*)layers[i];
2515
// operation 3 is presumably the mean reduction - TODO confirm.
if (reduction1->operation != 3 || reduction1->reduce_all != 0 || reduction1->coeff != 1.f)
2518
if (reduction1->axes.w != 1)
2521
const int* axes_ptr = reduction1->axes;
2522
if (axes_ptr[0] != 2 && axes_ptr[0] != 3)
2526
// Reduction(2/3) - Reduction(2): the blob the first Reduction produces.
int top_blob_index = layers[i]->tops[0];
2529
// Scan for a single-input Reduction that consumes that blob.
for (; j < layer_count; j++)
2531
if (layers[j]->type != "Reduction")
2534
if (layers[j]->bottoms.size() != 1)
2537
if (layers[j]->bottoms[0] == top_blob_index)
2541
if (j == layer_count)
2544
ncnn::Reduction* reduction2 = (ncnn::Reduction*)layers[j];
2545
if (reduction2->operation != 3 || reduction2->reduce_all != 0 || reduction2->coeff != 1.f)
2548
if (reduction2->axes.w != 1)
2551
const int* axes2_ptr = reduction2->axes;
2552
if (axes2_ptr[0] != 2)
2555
fprintf(stderr, "replace_reduction_with_global_pooling %s %s\n", reduction1->name.c_str(), reduction2->name.c_str());
2557
ncnn::Pooling* pooling = (ncnn::Pooling*)ncnn::create_layer_cpu("Pooling");
2559
// The Pooling inherits the second Reduction's identity and blob wiring.
pooling->type = "Pooling";
2560
pooling->name = reduction2->name;
2561
pooling->bottoms = reduction2->bottoms;
2562
pooling->tops = reduction2->tops;
2565
pooling->load_param(pd);
2567
pooling->pooling_type = 1;
2568
pooling->global_pooling = 1;
2570
layers[j] = pooling;
2573
// Skip over the first Reduction: read its input blob directly.
int bottom_blob_index_final = reduction1->bottoms[0];
2574
pooling->bottoms[0] = bottom_blob_index_final;
2575
blobs[bottom_blob_index_final].consumer = j;
2576
reduction1->type = "ncnnfused";
2582
// Builds a ReLU layer for each PReLU layer, gated on the num_slope != 1 test
// below. The ReLU copies the PReLU's name and blob wiring and uses
// slope_data[0] as its slope. (The statement installing the ReLU into the
// layer list is not visible in this view.)
int NetOptimize::replace_prelu_with_leaky_relu()
2584
const size_t layer_count = layers.size();
2585
for (size_t i = 0; i < layer_count; i++)
2587
if (layers[i]->type != "PReLU")
2590
ncnn::PReLU* prelu = (ncnn::PReLU*)layers[i];
2591
if (prelu->num_slope != 1)
2594
fprintf(stderr, "replace_prelu_with_leaky_relu %s\n", prelu->name.c_str());
2596
ncnn::ReLU* relu = (ncnn::ReLU*)ncnn::create_layer_cpu("ReLU");
2598
// The ReLU inherits the PReLU's identity and blob wiring.
relu->type = "ReLU";
2599
relu->name = prelu->name;
2600
relu->bottoms = prelu->bottoms;
2601
relu->tops = prelu->tops;
2604
relu->load_param(pd);
2606
// The single shared PReLU slope becomes the ReLU slope.
relu->slope = prelu->slope_data[0];
2615
// Replaces a single-input Convolution that consumes the first top blob of a
// Pooling layer (gated on the global_pooling test below) with an InnerProduct
// layer. The InnerProduct copies the Convolution's name, blob wiring,
// weights, bias, int8 scale data and fused activation, then takes the
// Convolution's slot in the layer list.
int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()
2617
const size_t layer_count = layers.size();
2618
for (size_t i = 0; i < layer_count; i++)
2620
if (layers[i]->type != "Pooling")
2623
ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2624
if (pooling->global_pooling == 0)
2628
// Pooling - Convolution: the blob this Pooling produces.
int top_blob_index = layers[i]->tops[0];
2631
// Scan for a single-input Convolution that consumes that blob.
for (; j < layer_count; j++)
2633
if (layers[j]->type != "Convolution")
2636
if (layers[j]->bottoms.size() != 1)
2639
if (layers[j]->bottoms[0] == top_blob_index)
2643
if (j == layer_count)
2646
ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];
2648
fprintf(stderr, "replace_convolution_with_innerproduct_after_global_pooling %s %s\n", pooling->name.c_str(), convolution->name.c_str());
2650
ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)ncnn::create_layer_cpu("InnerProduct");
2652
// The InnerProduct inherits the Convolution's identity and blob wiring.
innerproduct->type = "InnerProduct";
2653
innerproduct->name = convolution->name;
2654
innerproduct->bottoms = convolution->bottoms;
2655
innerproduct->tops = convolution->tops;
2658
innerproduct->load_param(pd);
2660
// Scalar parameters.
innerproduct->num_output = convolution->num_output;
2661
innerproduct->bias_term = convolution->bias_term;
2662
innerproduct->weight_data_size = convolution->weight_data_size;
2663
innerproduct->int8_scale_term = convolution->int8_scale_term;
2665
// Weights and bias.
innerproduct->weight_data = convolution->weight_data;
2666
innerproduct->bias_data = convolution->bias_data;
2668
// Int8 quantization scales.
innerproduct->weight_data_int8_scales = convolution->weight_data_int8_scales;
2669
innerproduct->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
2672
// Any activation already fused into the Convolution carries over.
innerproduct->activation_type = convolution->activation_type;
2673
innerproduct->activation_params = convolution->activation_params;
2675
layers[j] = innerproduct;
2682
// Replaces a single-input Convolution that consumes the first top blob of an
// InnerProduct layer with a new InnerProduct layer (innerproduct2).
// innerproduct2 copies the Convolution's name, blob wiring, weights, bias,
// int8 scale data and fused activation, then takes the Convolution's slot in
// the layer list. The upstream InnerProduct (innerproduct) is only read for
// its name in the log message and must not be modified.
int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct()
2684
const size_t layer_count = layers.size();
2687
bool replaced = false;
2689
for (size_t i = 0; i < layer_count; i++)
2691
if (layers[i]->type != "InnerProduct")
2695
// InnerProduct - Convolution: the blob this InnerProduct produces.
int top_blob_index = layers[i]->tops[0];
2698
// Scan for a single-input Convolution that consumes that blob.
for (; j < layer_count; j++)
2700
if (layers[j]->type != "Convolution")
2703
if (layers[j]->bottoms.size() != 1)
2706
if (layers[j]->bottoms[0] == top_blob_index)
2710
if (j == layer_count)
2713
ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
2714
ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];
2716
fprintf(stderr, "replace_convolution_with_innerproduct_after_innerproduct %s %s\n", innerproduct->name.c_str(), convolution->name.c_str());
2718
ncnn::InnerProduct* innerproduct2 = (ncnn::InnerProduct*)ncnn::create_layer_cpu("InnerProduct");
2720
// innerproduct2 inherits the Convolution's identity and blob wiring.
innerproduct2->type = "InnerProduct";
2721
innerproduct2->name = convolution->name;
2722
innerproduct2->bottoms = convolution->bottoms;
2723
innerproduct2->tops = convolution->tops;
2726
innerproduct2->load_param(pd);
2728
// Scalar parameters.
innerproduct2->num_output = convolution->num_output;
2729
innerproduct2->bias_term = convolution->bias_term;
2730
innerproduct2->weight_data_size = convolution->weight_data_size;
2731
// The int8 settings belong to the replacement layer. Writing them to the
// upstream `innerproduct` would overwrite that layer's own quantization data
// and leave the replacement without any.
innerproduct2->int8_scale_term = convolution->int8_scale_term;
2733
// Weights and bias.
innerproduct2->weight_data = convolution->weight_data;
2734
innerproduct2->bias_data = convolution->bias_data;
2736
// Int8 quantization scales, again stored on the replacement layer.
innerproduct2->weight_data_int8_scales = convolution->weight_data_int8_scales;
2737
innerproduct2->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
2740
// Any activation already fused into the Convolution carries over.
innerproduct2->activation_type = convolution->activation_type;
2741
innerproduct2->activation_params = convolution->activation_params;
2743
layers[j] = innerproduct2;
2756
// Command-line entry point of the optimizer tool.
// Positional arguments: inparam, inbin, outparam, outbin, flag, and optionally
// cutstart and cutend layer names. Loads the network, runs the fuse /
// replace / eliminate passes in a fixed order, then runs shape inference and
// memory footprint estimation and saves the result to outparam / outbin.
int main(int argc, char** argv)
2760
fprintf(stderr, "usage: %s [inparam] [inbin] [outparam] [outbin] [flag] [cutstart] [cutend]\n", argv[0]);
2764
const char* inparam = argv[1];
2765
const char* inbin = argv[2];
2766
const char* outparam = argv[3];
2767
const char* outbin = argv[4];
2768
int flag = atoi(argv[5]);
2769
// Optional cut points; they stay null unless argv[6] / argv[7] are assigned.
const char* cutstartname = nullptr;
2770
const char* cutendname = nullptr;
2774
cutstartname = argv[6];
2779
cutendname = argv[7];
2782
NetOptimize optimizer;
2784
// flag 65536 or 1 is tied to storage_type 1; storage_type 0 is the other
// assignment (presumably fp16 vs fp32 weight storage - TODO confirm).
if (flag == 65536 || flag == 1)
2786
optimizer.storage_type = 1;
2790
optimizer.storage_type = 0;
2793
optimizer.load_param(inparam);
2795
// An inbin of "null" loads the model from an empty data reader and turns on
// gen_random_weight instead of reading a weight file.
if (strcmp(inbin, "null") == 0)
2797
DataReaderFromEmpty dr;
2798
optimizer.load_model(dr);
2799
optimizer.gen_random_weight = true;
2802
optimizer.load_model(inbin);
2804
if (optimizer.set_cutparam(cutstartname, cutendname) < 0)
2809
// Weight-folding fusions (batchnorm / mul / add / dropout into the
// preceding compute layer).
optimizer.fuse_batchnorm_scale();
2810
optimizer.fuse_convolution_batchnorm();
2811
optimizer.fuse_convolution_mul();
2812
optimizer.fuse_convolution_add();
2813
optimizer.fuse_convolutiondepthwise_batchnorm();
2814
optimizer.fuse_convolutiondepthwise_mul();
2815
optimizer.fuse_convolutiondepthwise_add();
2816
optimizer.fuse_deconvolution_batchnorm();
2817
optimizer.fuse_deconvolution_mul();
2818
optimizer.fuse_deconvolution_add();
2819
optimizer.fuse_deconvolutiondepthwise_batchnorm();
2820
optimizer.fuse_innerproduct_batchnorm();
2821
optimizer.fuse_innerproduct_add();
2822
optimizer.fuse_innerproduct_dropout();
2824
// Layer replacements that run before the activation fusions below.
optimizer.replace_reduction_with_global_pooling();
2825
optimizer.replace_prelu_with_leaky_relu();
2827
// Activation fusions and BinaryOp simplifications.
optimizer.fuse_convolution_activation();
2828
optimizer.fuse_convolutiondepthwise_activation();
2829
optimizer.fuse_deconvolution_activation();
2830
optimizer.fuse_deconvolutiondepthwise_activation();
2831
optimizer.fuse_innerproduct_activation();
2832
optimizer.fuse_memorydata_binaryop();
2833
optimizer.fuse_binaryop_eltwise();
2835
// Removal of pass-through layers.
optimizer.eliminate_dropout();
2836
optimizer.eliminate_pooling1x1();
2837
optimizer.eliminate_noop();
2838
optimizer.eliminate_split();
2839
optimizer.eliminate_flatten_after_global_pooling();
2840
optimizer.eliminate_reshape_after_global_pooling();
2841
optimizer.eliminate_reshape_before_binaryop();
2843
// Convolution -> InnerProduct replacements.
optimizer.replace_convolution_with_innerproduct_after_global_pooling();
2844
optimizer.replace_convolution_with_innerproduct_after_innerproduct();
2846
// Final clean-up passes.
optimizer.eliminate_flatten_after_innerproduct();
2847
optimizer.eliminate_orphaned_memorydata();
2849
optimizer.shape_inference();
2851
optimizer.estimate_memory_footprint();
2853
optimizer.save(outparam, outbin);