ncnn

padding_loongarch.cpp
385 строк · 13.2 Кб
Перенос по словам
1
// yala is pleased to support the open source community by making ncnn available.
2
//
3
//
4
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#include "padding_loongarch.h"
16

17
#if __loongarch_sx
18
#include <lsxintrin.h>
19
#endif // __loongarch_sx
20

21
#include "loongarch_usability.h"
22

23
namespace ncnn {
24

25
#if __loongarch_sx
26
#include "padding_pack4.h"
27
#include "padding_pack8_int8.h"
28
#endif // __loongarch_sx
29

30
// Default constructor.
// When the translation unit is built with LSX enabled, support_packing is
// switched on; otherwise the constructor leaves every member untouched.
Padding_loongarch::Padding_loongarch()
{
#if __loongarch_sx
    this->support_packing = true;
#endif // __loongarch_sx
}
36

37
int Padding_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
38
{
39
    if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0)
40
    {
41
        top_blob = bottom_blob;
42
        return 0;
43
    }
44

45
    int elembits = bottom_blob.elembits();
46

47
    if (elembits == 8)
48
        return forward_int8(bottom_blob, top_blob, opt);
49

50
    int w = bottom_blob.w;
51
    int h = bottom_blob.h;
52
    int d = bottom_blob.d;
53
    int channels = bottom_blob.c;
54
    int dims = bottom_blob.dims;
55
    size_t elemsize = bottom_blob.elemsize;
56
    int elempack = bottom_blob.elempack;
57

58
#if __loongarch_sx
59
    if (elempack == 4)
60
    {
61
        if (dims == 1)
62
        {
63
            int outw = w * elempack + left + right;
64

65
            int out_elempack = outw % 4 == 0 ? 4 : 1;
66
            size_t out_elemsize = elemsize / elempack * out_elempack;
67

68
            if (left % 4 == 0 && out_elempack == 4 && type == 0)
69
            {
70
                top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
71
                if (top_blob.empty())
72
                    return -100;
73

74
                __m128 pad_value = __lsx_vreplfr2vr_s(value);
75
                padding_constant_pack4_lsx(bottom_blob, top_blob, 0, 0, left / 4, right / 4, pad_value);
76

77
                return 0;
78
            }
79
        }
80

81
        if (dims == 2)
82
        {
83
            int outw = w + left + right;
84
            int outh = h * elempack + top + bottom;
85

86
            int out_elempack = outh % 4 == 0 ? 4 : 1;
87
            size_t out_elemsize = elemsize / elempack * out_elempack;
88

89
            if (top % 4 == 0 && out_elempack == 4 && type == 0)
90
            {
91
                top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
92
                if (top_blob.empty())
93
                    return -100;
94

95
                __m128 pad_value = __lsx_vreplfr2vr_s(value);
96
                padding_constant_pack4_lsx(bottom_blob, top_blob, top / 4, bottom / 4, left, right, pad_value);
97

98
                return 0;
99
            }
100
        }
101

102
        if (dims == 3)
103
        {
104
            int outw = w + left + right;
105
            int outh = h + top + bottom;
106
            int outc = channels * elempack + front + behind;
107

108
            int out_elempack = outc % 4 == 0 ? 4 : 1;
109
            size_t out_elemsize = elemsize / elempack * out_elempack;
110

111
            if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0))
112
            {
113
                top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
114
                if (top_blob.empty())
115
                    return -100;
116

117
                int front_ = front / elempack;
118
                #pragma omp parallel for num_threads(opt.num_threads)
119
                for (int q = 0; q < outc / out_elempack; q++)
120
                {
121
                    Mat borderm = top_blob.channel(q);
122

123
                    __m128 pad_value = per_channel_pad_data_size ? (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value);
124
                    //Channel padding
125
                    if ((q - front_) < 0 || (q - front_) >= channels)
126
                    {
127
                        borderm.fill(pad_value);
128
                    }
129
                    else
130
                    {
131
                        const Mat m = bottom_blob.channel(q - front_);
132
                        if (type == 0)
133
                            padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value);
134
                        if (type == 1)
135
                            padding_replicate_pack4_lsx(m, borderm, top, bottom, left, right);
136
                        if (type == 2)
137
                            padding_reflect_pack4_lsx(m, borderm, top, bottom, left, right);
138
                    }
139
                }
140

141
                return 0;
142
            }
143
        }
144

145
        if (dims == 4)
146
        {
147
            int outw = w + left + right;
148
            int outh = h + top + bottom;
149
            int outd = d + front + behind;
150

151
            if (type == 0)
152
            {
153
                top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
154
                if (top_blob.empty())
155
                    return -100;
156

157
                #pragma omp parallel for num_threads(opt.num_threads)
158
                for (int q = 0; q < channels; q++)
159
                {
160
                    __m128 pad_value = per_channel_pad_data_size ? (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value);
161

162
                    for (int z = 0; z < outd; z++)
163
                    {
164
                        Mat borderm = top_blob.channel(q).depth(z);
165

166
                        // depth padding
167
                        if ((z - front) < 0 || (z - front) >= d)
168
                        {
169
                            borderm.fill(pad_value);
170
                        }
171
                        else
172
                        {
173
                            const Mat m = bottom_blob.channel(q).depth(z - front);
174
                            padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value);
175
                        }
176
                    }
177
                }
178

179
                return 0;
180
            }
181
        }
182
    }
183
#endif // __loongarch_sx
184

185
    Mat bottom_blob_unpacked = bottom_blob;
186
    if (elempack != 1)
187
    {
188
        Option opt_pack1 = opt;
189
        opt_pack1.blob_allocator = opt.workspace_allocator;
190

191
        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
192
    }
193

194
    Mat top_blob_unpacked;
195
    int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt);
196
    if (ret != 0)
197
        return ret;
198

199
    int out_elempack = 1;
200
#if __loongarch_sx
201
    if (opt.use_packing_layout)
202
    {
203
        out_elempack = top_blob_unpacked.c % 4 == 0 ? 4 : 1;
204
    }
205
#endif
206

207
    convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
208

209
    return 0;
210
}
211

212
int Padding_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
213
{
214
    int w = bottom_blob.w;
215
    int h = bottom_blob.h;
216
    int d = bottom_blob.d;
217
    int channels = bottom_blob.c;
218
    int dims = bottom_blob.dims;
219
    size_t elemsize = bottom_blob.elemsize;
220
    int elempack = bottom_blob.elempack;
221

222
#if __loongarch_sx
223
    if (elempack == 8)
224
    {
225
        if (dims == 1)
226
        {
227
            int outw = w * elempack + left + right;
228

229
            int out_elempack = outw % 8 == 0 ? 8 : 1;
230
            size_t out_elemsize = elemsize / elempack * out_elempack;
231

232
            if (left % 8 == 0 && out_elempack == 8 && type == 0)
233
            {
234
                top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
235
                if (top_blob.empty())
236
                    return -100;
237

238
                int64_t v8 = (int64_t)value;
239
                int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
240
                padding_constant_pack8_int8_lsx(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value);
241

242
                return 0;
243
            }
244
        }
245

246
        if (dims == 2)
247
        {
248
            int outw = w + left + right;
249
            int outh = h * elempack + top + bottom;
250

251
            int out_elempack = outh % 8 == 0 ? 8 : 1;
252
            size_t out_elemsize = elemsize / elempack * out_elempack;
253

254
            if (top % 8 == 0 && out_elempack == 8 && type == 0)
255
            {
256
                top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
257
                if (top_blob.empty())
258
                    return -100;
259

260
                int64_t v8 = (int64_t)value;
261
                int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
262
                padding_constant_pack8_int8_lsx(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value);
263

264
                return 0;
265
            }
266
        }
267

268
        if (dims == 3)
269
        {
270
            int outw = w + left + right;
271
            int outh = h + top + bottom;
272
            int outc = channels * elempack + front + behind;
273

274
            int out_elempack = outc % 8 == 0 ? 8 : 1;
275
            size_t out_elemsize = elemsize / elempack * out_elempack;
276

277
            if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
278
            {
279
                top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
280
                if (top_blob.empty())
281
                    return -100;
282

283
                int front_ = front / elempack;
284
                #pragma omp parallel for num_threads(opt.num_threads)
285
                for (int q = 0; q < outc / out_elempack; q++)
286
                {
287
                    Mat borderm = top_blob.channel(q);
288

289
                    // TODO perchannel
290
                    //                     int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value);
291
                    int64_t v8 = (int64_t)value;
292
                    int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
293

294
                    //Channel padding
295
                    if ((q - front_) < 0 || (q - front_) >= channels)
296
                    {
297
                        borderm.fill(pad_value);
298
                    }
299
                    else
300
                    {
301
                        const Mat m = bottom_blob.channel(q - front_);
302
                        if (type == 0)
303
                            padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value);
304
                        if (type == 1)
305
                            padding_replicate_pack8_int8_lsx(m, borderm, top, bottom, left, right);
306
                        if (type == 2)
307
                            padding_reflect_pack8_int8_lsx(m, borderm, top, bottom, left, right);
308
                    }
309
                }
310

311
                return 0;
312
            }
313
        }
314

315
        if (dims == 4)
316
        {
317
            int outw = w + left + right;
318
            int outh = h + top + bottom;
319
            int outd = d + front + behind;
320

321
            if (type == 0)
322
            {
323
                top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
324
                if (top_blob.empty())
325
                    return -100;
326

327
                #pragma omp parallel for num_threads(opt.num_threads)
328
                for (int q = 0; q < channels; q++)
329
                {
330
                    // TODO perchannel
331
                    //                     int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value);
332
                    int64_t v8 = (int64_t)value;
333
                    int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
334

335
                    for (int z = 0; z < outd; z++)
336
                    {
337
                        Mat borderm = top_blob.channel(q).depth(z);
338

339
                        // depth padding
340
                        if ((z - front) < 0 || (z - front) >= d)
341
                        {
342
                            borderm.fill(pad_value);
343
                        }
344
                        else
345
                        {
346
                            const Mat m = bottom_blob.channel(q).depth(z - front);
347
                            padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value);
348
                        }
349
                    }
350
                }
351

352
                return 0;
353
            }
354
        }
355
    }
356
#endif // __loongarch_sx
357

358
    Mat bottom_blob_unpacked = bottom_blob;
359
    if (elempack != 1)
360
    {
361
        Option opt_pack1 = opt;
362
        opt_pack1.blob_allocator = opt.workspace_allocator;
363

364
        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
365
    }
366

367
    Mat top_blob_unpacked;
368
    int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt);
369
    if (ret != 0)
370
        return ret;
371

372
    int out_elempack = 1;
373
#if __loongarch_sx
374
    if (opt.use_packing_layout)
375
    {
376
        out_elempack = top_blob_unpacked.c % 8 == 0 ? 8 : 1;
377
    }
378
#endif
379

380
    convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
381

382
    return 0;
383
}
384

385
} // namespace ncnn
386
ncnn

Использование cookies