ncnn

concat_mips.cpp
406 строк · 12.8 Кб
Перенос по словам
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

15
#include "concat_mips.h"
16

17
namespace ncnn {
18

19
// Constructs the MIPS concat layer.
// The packed-layout flag is only raised when the translation unit is compiled
// with MSA enabled; otherwise the member keeps whatever value it already had.
Concat_mips::Concat_mips()
{
#if __mips_msa
    this->support_packing = true;
#endif // __mips_msa
}
25

26
int Concat_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
27
{
28
    int dims = bottom_blobs[0].dims;
29
    int positive_axis = axis < 0 ? dims + axis : axis;
30

31
    if (dims == 1) // positive_axis == 0
32
    {
33
        // concat vector
34
        // total length
35
        size_t elemsize = bottom_blobs[0].elemsize;
36
        int elempack = bottom_blobs[0].elempack;
37
        int top_w = 0;
38
        for (size_t b = 0; b < bottom_blobs.size(); b++)
39
        {
40
            const Mat& bottom_blob = bottom_blobs[b];
41
            top_w += bottom_blob.w * bottom_blob.elempack;
42
        }
43

44
        int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1;
45
        size_t out_elemsize = elemsize / elempack * out_elempack;
46

47
        Mat& top_blob = top_blobs[0];
48
        top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
49
        if (top_blob.empty())
50
            return -100;
51

52
        float* outptr = top_blob;
53
        for (size_t b = 0; b < bottom_blobs.size(); b++)
54
        {
55
            const Mat& bottom_blob = bottom_blobs[b];
56

57
            const float* ptr = bottom_blob;
58
            memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
59

60
            outptr += bottom_blob.w * bottom_blob.elempack;
61
        }
62
    }
63

64
    if (dims == 2 && positive_axis == 0)
65
    {
66
        // concat image
67
        int w = bottom_blobs[0].w;
68

69
        // total height
70
        size_t elemsize = bottom_blobs[0].elemsize;
71
        int elempack = bottom_blobs[0].elempack;
72
        int top_h = 0;
73
        for (size_t b = 0; b < bottom_blobs.size(); b++)
74
        {
75
            const Mat& bottom_blob = bottom_blobs[b];
76
            elemsize = std::min(elemsize, bottom_blob.elemsize);
77
            elempack = std::min(elempack, bottom_blob.elempack);
78
            top_h += bottom_blob.h * bottom_blob.elempack;
79
        }
80

81
        int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 4 : 1;
82
        size_t out_elemsize = elemsize / elempack * out_elempack;
83

84
        Mat& top_blob = top_blobs[0];
85
        top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
86
        if (top_blob.empty())
87
            return -100;
88

89
        Mat top_blob_unpacked = top_blob;
90
        if (elempack < out_elempack)
91
        {
92
            top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
93
            if (top_blob_unpacked.empty())
94
                return -100;
95
        }
96

97
        float* outptr = top_blob_unpacked;
98
        for (size_t b = 0; b < bottom_blobs.size(); b++)
99
        {
100
            const Mat& bottom_blob = bottom_blobs[b];
101

102
            if (bottom_blob.elempack == 4 && elempack == 1)
103
            {
104
                for (int i = 0; i < bottom_blob.h; i++)
105
                {
106
                    const float* r0 = bottom_blob.row(i);
107

108
                    float* outptr0 = outptr;
109
                    float* outptr1 = outptr + w;
110
                    float* outptr2 = outptr + w * 2;
111
                    float* outptr3 = outptr + w * 3;
112

113
                    for (int j = 0; j < w; j++)
114
                    {
115
                        *outptr0++ = r0[0];
116
                        *outptr1++ = r0[1];
117
                        *outptr2++ = r0[2];
118
                        *outptr3++ = r0[3];
119

120
                        r0 += 4;
121
                    }
122

123
                    outptr += w * 4;
124
                }
125
            }
126
            else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
127
            {
128
                int size = w * bottom_blob.h;
129

130
                const float* ptr = bottom_blob;
131
                memcpy(outptr, ptr, size * bottom_blob.elemsize);
132

133
                outptr += size * bottom_blob.elempack;
134
            }
135
        }
136

137
        // packing
138
        if (elempack < out_elempack)
139
        {
140
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
141
        }
142
    }
143

144
    if (dims == 2 && positive_axis == 1)
145
    {
146
        // interleave image row
147
        int h = bottom_blobs[0].h;
148
        size_t elemsize = bottom_blobs[0].elemsize;
149
        int elempack = bottom_blobs[0].elempack;
150

151
        // total width
152
        int top_w = 0;
153
        for (size_t b = 0; b < bottom_blobs.size(); b++)
154
        {
155
            const Mat& bottom_blob = bottom_blobs[b];
156
            top_w += bottom_blob.w;
157
        }
158

159
        Mat& top_blob = top_blobs[0];
160
        top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
161
        if (top_blob.empty())
162
            return -100;
163

164
        #pragma omp parallel for num_threads(opt.num_threads)
165
        for (int i = 0; i < h; i++)
166
        {
167
            float* outptr = top_blob.row(i);
168
            for (size_t b = 0; b < bottom_blobs.size(); b++)
169
            {
170
                const Mat& bottom_blob = bottom_blobs[b];
171

172
                const float* ptr = bottom_blob.row(i);
173
                memcpy(outptr, ptr, bottom_blob.w * elemsize);
174

175
                outptr += bottom_blob.w * elempack;
176
            }
177
        }
178
    }
179

180
    if ((dims == 3 || dims == 4) && positive_axis == 0)
181
    {
182
        // concat dim
183
        int w = bottom_blobs[0].w;
184
        int h = bottom_blobs[0].h;
185
        int d = bottom_blobs[0].d;
186

187
        // total channels
188
        size_t elemsize = bottom_blobs[0].elemsize;
189
        int elempack = bottom_blobs[0].elempack;
190
        int top_channels = 0;
191
        for (size_t b = 0; b < bottom_blobs.size(); b++)
192
        {
193
            const Mat& bottom_blob = bottom_blobs[b];
194
            elemsize = std::min(elemsize, bottom_blob.elemsize);
195
            elempack = std::min(elempack, bottom_blob.elempack);
196
            top_channels += bottom_blob.c * bottom_blob.elempack;
197
        }
198

199
        int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 4 : 1;
200
        size_t out_elemsize = elemsize / elempack * out_elempack;
201

202
        Mat& top_blob = top_blobs[0];
203
        top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
204
        if (top_blob.empty())
205
            return -100;
206

207
        top_blob.dims = dims;
208

209
        Mat top_blob_unpacked = top_blob;
210
        if (elempack < out_elempack)
211
        {
212
            top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
213
            if (top_blob_unpacked.empty())
214
                return -100;
215

216
            top_blob_unpacked.dims = dims;
217
        }
218

219
        int p = 0;
220
        for (size_t b = 0; b < bottom_blobs.size(); b++)
221
        {
222
            const Mat& bottom_blob = bottom_blobs[b];
223

224
            if (bottom_blob.elempack == 4 && elempack == 1)
225
            {
226
                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
227

228
                for (int q = 0; q < bottom_blob.c; q++)
229
                {
230
                    const float* r0 = bottom_blob.channel(q);
231

232
                    float* outptr0 = top_blob_unpacked.channel(p);
233
                    float* outptr1 = top_blob_unpacked.channel(p + 1);
234
                    float* outptr2 = top_blob_unpacked.channel(p + 2);
235
                    float* outptr3 = top_blob_unpacked.channel(p + 3);
236

237
                    for (int i = 0; i < size; i++)
238
                    {
239
                        *outptr0++ = r0[0];
240
                        *outptr1++ = r0[1];
241
                        *outptr2++ = r0[2];
242
                        *outptr3++ = r0[3];
243

244
                        r0 += 4;
245
                    }
246

247
                    p += 4;
248
                }
249
            }
250
            else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
251
            {
252
                int size = bottom_blob.total();
253

254
                const float* ptr = bottom_blob;
255
                float* outptr = top_blob_unpacked.channel(p);
256
                memcpy(outptr, ptr, size * bottom_blob.elemsize);
257

258
                p += bottom_blob.c;
259
            }
260
        }
261

262
        // packing
263
        if (elempack < out_elempack)
264
        {
265
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
266
        }
267
    }
268

269
    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
270
    {
271
        // interleave dim height
272
        int w = bottom_blobs[0].w;
273
        int d = bottom_blobs[0].d;
274
        int channels = bottom_blobs[0].c;
275
        size_t elemsize = bottom_blobs[0].elemsize;
276
        int elempack = bottom_blobs[0].elempack;
277

278
        // total height
279
        int top_h = 0;
280
        for (size_t b = 0; b < bottom_blobs.size(); b++)
281
        {
282
            const Mat& bottom_blob = bottom_blobs[b];
283
            top_h += bottom_blob.h;
284
        }
285

286
        Mat& top_blob = top_blobs[0];
287
        top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
288
        if (top_blob.empty())
289
            return -100;
290

291
        top_blob.dims = dims;
292

293
        #pragma omp parallel for num_threads(opt.num_threads)
294
        for (int q = 0; q < channels; q++)
295
        {
296
            float* outptr = top_blob.channel(q);
297

298
            for (int i = 0; i < d; i++)
299
            {
300
                for (size_t b = 0; b < bottom_blobs.size(); b++)
301
                {
302
                    const Mat& bottom_blob = bottom_blobs[b];
303

304
                    int size = bottom_blob.w * bottom_blob.h;
305

306
                    const float* ptr = bottom_blob.channel(q).depth(i);
307
                    memcpy(outptr, ptr, size * elemsize);
308

309
                    outptr += size * elempack;
310
                }
311
            }
312
        }
313
    }
314

315
    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
316
    {
317
        // interleave dim width
318
        int h = bottom_blobs[0].h;
319
        int d = bottom_blobs[0].d;
320
        int channels = bottom_blobs[0].c;
321
        size_t elemsize = bottom_blobs[0].elemsize;
322
        int elempack = bottom_blobs[0].elempack;
323

324
        // total height
325
        int top_w = 0;
326
        for (size_t b = 0; b < bottom_blobs.size(); b++)
327
        {
328
            const Mat& bottom_blob = bottom_blobs[b];
329
            top_w += bottom_blob.w;
330
        }
331

332
        Mat& top_blob = top_blobs[0];
333
        top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
334
        if (top_blob.empty())
335
            return -100;
336

337
        top_blob.dims = dims;
338

339
        #pragma omp parallel for num_threads(opt.num_threads)
340
        for (int q = 0; q < channels; q++)
341
        {
342
            float* outptr = top_blob.channel(q);
343

344
            for (int i = 0; i < d; i++)
345
            {
346
                for (int j = 0; j < h; j++)
347
                {
348
                    for (size_t b = 0; b < bottom_blobs.size(); b++)
349
                    {
350
                        const Mat& bottom_blob = bottom_blobs[b];
351

352
                        const float* ptr = bottom_blob.channel(q).depth(i).row(j);
353
                        memcpy(outptr, ptr, bottom_blob.w * elemsize);
354

355
                        outptr += bottom_blob.w * elempack;
356
                    }
357
                }
358
            }
359
        }
360
    }
361

362
    if (dims == 4 && positive_axis == 1)
363
    {
364
        // interleave dim depth
365
        int w = bottom_blobs[0].w;
366
        int h = bottom_blobs[0].h;
367
        int channels = bottom_blobs[0].c;
368
        size_t elemsize = bottom_blobs[0].elemsize;
369
        int elempack = bottom_blobs[0].elempack;
370

371
        // total depth
372
        int top_d = 0;
373
        for (size_t b = 0; b < bottom_blobs.size(); b++)
374
        {
375
            const Mat& bottom_blob = bottom_blobs[b];
376
            top_d += bottom_blob.d;
377
        }
378

379
        Mat& top_blob = top_blobs[0];
380
        top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
381
        if (top_blob.empty())
382
            return -100;
383

384
        #pragma omp parallel for num_threads(opt.num_threads)
385
        for (int q = 0; q < channels; q++)
386
        {
387
            float* outptr = top_blob.channel(q);
388

389
            for (size_t b = 0; b < bottom_blobs.size(); b++)
390
            {
391
                const Mat& bottom_blob = bottom_blobs[b];
392

393
                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;
394

395
                const float* ptr = bottom_blob.channel(q);
396
                memcpy(outptr, ptr, size * elemsize);
397

398
                outptr += size * elempack;
399
            }
400
        }
401
    }
402

403
    return 0;
404
}
405

406
} // namespace ncnn
407
ncnn

Использование cookies