// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif

#include <float.h>
#include <math.h>
#include <stdio.h>
#include <vector>
// A single detection result: bounding box, class label and confidence.
struct Object
{
    cv::Rect_<float> rect; // box in image coordinates (x, y, width, height)
    int label;             // class index (used to index class_names in draw_objects)
    float prob;            // confidence score
};
static inline float intersection_area(const Object& a, const Object& b)
38
cv::Rect_<float> inter = a.rect & b.rect;
42
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
46
float p = faceobjects[(left + right) / 2].prob;
50
while (faceobjects[i].prob > p)
53
while (faceobjects[j].prob < p)
59
std::swap(faceobjects[i], faceobjects[j]);
66
#pragma omp parallel sections
70
if (left < j) qsort_descent_inplace(faceobjects, left, j);
74
if (i < right) qsort_descent_inplace(faceobjects, i, right);
79
static void qsort_descent_inplace(std::vector<Object>& faceobjects)
81
if (faceobjects.empty())
84
qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
87
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
91
const int n = faceobjects.size();
93
std::vector<float> areas(n);
94
for (int i = 0; i < n; i++)
96
areas[i] = faceobjects[i].rect.area();
99
for (int i = 0; i < n; i++)
101
const Object& a = faceobjects[i];
104
for (int j = 0; j < (int)picked.size(); j++)
106
const Object& b = faceobjects[picked[j]];
108
if (!agnostic && a.label != b.label)
111
// intersection over union
112
float inter_area = intersection_area(a, b);
113
float union_area = areas[i] + areas[picked[j]] - inter_area;
114
// float IoU = inter_area / union_area
115
if (inter_area / union_area > nms_threshold)
124
// Logistic sigmoid: maps any real x into (0, 1) via 1 / (1 + e^-x).
static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (1.f + exp(-x)));
}
// Decode one YOLOv5 detection-head output into box proposals.
//
// anchors        - 2*num_anchors values, (w, h) pairs for this head
// stride         - feature map stride in pixels (8, 16 or 32)
// in_pad         - letterboxed network input; unused here, parameter kept for
//                  interface compatibility with callers
// feat_blob      - raw head output; channels hold num_anchors blocks of
//                  (x, y, w, h, objectness, num_class class scores)
// prob_threshold - minimum sigmoid(obj) * sigmoid(best class) to keep a proposal
// objects        - output; proposals appended in coordinates of the padded image
static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
    (void)in_pad; // not needed: grid size is taken from feat_blob itself

    const int num_grid_x = feat_blob.w;
    const int num_grid_y = feat_blob.h;

    const int num_anchors = anchors.w / 2;

    // channels = num_anchors * (4 box + 1 objectness + num_class scores)
    const int num_class = feat_blob.c / num_anchors - 5;

    const int feat_offset = num_class + 5;

    for (int q = 0; q < num_anchors; q++)
    {
        const float anchor_w = anchors[q * 2];
        const float anchor_h = anchors[q * 2 + 1];

        for (int i = 0; i < num_grid_y; i++)
        {
            for (int j = 0; j < num_grid_x; j++)
            {
                // find class index with max class score
                int class_index = 0;
                float class_score = -FLT_MAX;
                for (int k = 0; k < num_class; k++)
                {
                    float score = feat_blob.channel(q * feat_offset + 5 + k).row(i)[j];
                    if (score > class_score)
                    {
                        class_index = k;
                        class_score = score;
                    }
                }

                float box_score = feat_blob.channel(q * feat_offset + 4).row(i)[j];

                // confidence = objectness * best class probability (both through sigmoid)
                float confidence = sigmoid(box_score) * sigmoid(class_score);

                if (confidence >= prob_threshold)
                {
                    // yolov5/models/yolo.py Detect forward
                    // y = x[i].sigmoid()
                    // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy
                    // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
                    float dx = sigmoid(feat_blob.channel(q * feat_offset + 0).row(i)[j]);
                    float dy = sigmoid(feat_blob.channel(q * feat_offset + 1).row(i)[j]);
                    float dw = sigmoid(feat_blob.channel(q * feat_offset + 2).row(i)[j]);
                    float dh = sigmoid(feat_blob.channel(q * feat_offset + 3).row(i)[j]);

                    float pb_cx = (dx * 2.f - 0.5f + j) * stride;
                    float pb_cy = (dy * 2.f - 0.5f + i) * stride;

                    float pb_w = pow(dw * 2.f, 2) * anchor_w;
                    float pb_h = pow(dh * 2.f, 2) * anchor_h;

                    // center/size -> top-left / bottom-right corners
                    float x0 = pb_cx - pb_w * 0.5f;
                    float y0 = pb_cy - pb_h * 0.5f;
                    float x1 = pb_cx + pb_w * 0.5f;
                    float y1 = pb_cy + pb_h * 0.5f;

                    Object obj;
                    obj.rect.x = x0;
                    obj.rect.y = y0;
                    obj.rect.width = x1 - x0;
                    obj.rect.height = y1 - y0;
                    obj.label = class_index;
                    obj.prob = confidence;

                    objects.push_back(obj);
                }
            }
        }
    }
}
// Run the yolov5s ncnn model on a BGR image and fill `objects` with the final
// NMS-filtered detections in original-image coordinates.
// Returns 0 on success; exits the process if the model files cannot be loaded.
static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolov5;

    yolov5.opt.use_vulkan_compute = true;
    // yolov5.opt.use_bf16_storage = true;

    // original pretrained model from https://github.com/ultralytics/yolov5
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (yolov5.load_param("yolov5s.ncnn.param"))
        exit(-1);
    if (yolov5.load_model("yolov5s.ncnn.bin"))
        exit(-1);

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // yolov5/models/common.py DetectMultiBackend
    const int max_stride = 64;

    // letterbox pad to multiple of max_stride:
    // scale the longer side to target_size, keep aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // pad to target_size rectangle
    // yolov5/utils/datasets.py letterbox
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    // 114 is the gray letterbox fill used by ultralytics
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // normalize pixel values to [0, 1]
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov5.create_extractor();

    ex.input("in0", in_pad);

    std::vector<Object> proposals;

    // anchor setting from yolov5/models/yolov5s.yaml

    // stride 8
    {
        ncnn::Mat out;
        ex.extract("out0", out);

        ncnn::Mat anchors(6);
        anchors[0] = 10.f;
        anchors[1] = 13.f;
        anchors[2] = 16.f;
        anchors[3] = 30.f;
        anchors[4] = 33.f;
        anchors[5] = 23.f;

        std::vector<Object> objects8;
        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);

        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
    }

    // stride 16
    {
        ncnn::Mat out;
        ex.extract("out1", out);

        ncnn::Mat anchors(6);
        anchors[0] = 30.f;
        anchors[1] = 61.f;
        anchors[2] = 62.f;
        anchors[3] = 45.f;
        anchors[4] = 59.f;
        anchors[5] = 119.f;

        std::vector<Object> objects16;
        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);

        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
    }

    // stride 32
    {
        ncnn::Mat out;
        ex.extract("out2", out);

        ncnn::Mat anchors(6);
        anchors[0] = 116.f;
        anchors[1] = 90.f;
        anchors[2] = 156.f;
        anchors[3] = 198.f;
        anchors[4] = 373.f;
        anchors[5] = 326.f;

        std::vector<Object> objects32;
        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);

        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded image, undoing letterbox + scale
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip to image bounds
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}
// Print each detection to stderr and display them drawn on a copy of the image.
// Blocks until a key is pressed in the display window.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    // COCO class names, indexed by Object::label
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        char text[256];
        // snprintf rather than sprintf: bounds the write to the buffer
        snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // keep the label box inside the image
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}
int main(int argc, char** argv)
410
fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
414
const char* imagepath = argv[1];
416
cv::Mat m = cv::imread(imagepath, 1);
419
fprintf(stderr, "cv::imread %s failed\n", imagepath);
423
std::vector<Object> objects;
424
detect_yolov5(m, objects);
426
draw_objects(m, objects);