15
#include "padding_loongarch.h"
21
#include "loongarch_usability.h"
26
#include "padding_pack4.h"
27
#include "padding_pack8_int8.h"
30
// Constructor for the LoongArch padding layer.
// NOTE(review): this listing looks like an excerpt with the original file's line
// numbers interleaved and braces elided; the bare numbers are not part of the code.
Padding_loongarch::Padding_loongarch()
33
// Sets the support_packing flag (presumably advertises that this layer accepts
// packed, elempack > 1, blobs -- TODO confirm against the base Layer definition).
support_packing = true;
37
// Pads bottom_blob into top_blob according to the layer's top/bottom/left/right/
// front/behind amounts, using LSX pack-4 helpers where the layout allows and
// otherwise falling back to the base Padding::forward on unpacked data.
// NOTE(review): this listing is an excerpt -- bare numbers are interleaved original
// line numbers, and braces, returns and most branch headers are elided. The
// grouping described in the comments below is inferred from the visible statements;
// confirm the exact control flow against the full file.
int Padding_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
39
// Nothing to pad: all six pad amounts are zero, so top_blob is assigned from
// bottom_blob unchanged.
if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0)
41
top_blob = bottom_blob;
45
int elembits = bottom_blob.elembits();
48
// Hands off to the int8 implementation. The guarding condition is not visible in
// this excerpt (presumably a test on elembits -- TODO confirm).
return forward_int8(bottom_blob, top_blob, opt);
50
// Cache the input geometry and element layout.
int w = bottom_blob.w;
51
int h = bottom_blob.h;
52
int d = bottom_blob.d;
53
int channels = bottom_blob.c;
54
int dims = bottom_blob.dims;
55
size_t elemsize = bottom_blob.elemsize;
56
int elempack = bottom_blob.elempack;
63
// --- Width-packed case (presumably the dims == 1 branch -- TODO confirm) ---
// The packed width is expanded to scalar elements (w * elempack) before the
// left/right padding is added.
int outw = w * elempack + left + right;
65
// Keep 4-element packing only when the padded width is a multiple of 4.
int out_elempack = outw % 4 == 0 ? 4 : 1;
66
// Per-scalar size (elemsize / elempack) scaled by the output packing.
size_t out_elemsize = elemsize / elempack * out_elempack;
68
// Fast path: left pad is a multiple of 4, output stays packed, and type == 0
// (the only visible use under this condition is the constant-padding helper).
if (left % 4 == 0 && out_elempack == 4 && type == 0)
70
// NOTE(review): no top_blob.empty() check is visible after this create() in the
// excerpt, unlike the later branches -- confirm it exists in the full file.
top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
74
// Vector pad value built from the scalar `value` (presumably replicated across
// all lanes by __lsx_vreplfr2vr_s -- TODO confirm in loongarch_usability.h).
__m128 pad_value = __lsx_vreplfr2vr_s(value);
75
// Horizontal pads are passed in packed units (left / 4, right / 4); the two zeros
// occupy the top/bottom argument positions used by the later calls.
padding_constant_pack4_lsx(bottom_blob, top_blob, 0, 0, left / 4, right / 4, pad_value);
83
// --- Height-packed case (presumably the dims == 2 branch -- TODO confirm) ---
int outw = w + left + right;
84
// Height is the packed axis here: h * elempack scalar rows plus top/bottom.
int outh = h * elempack + top + bottom;
86
int out_elempack = outh % 4 == 0 ? 4 : 1;
87
size_t out_elemsize = elemsize / elempack * out_elempack;
89
// Fast path: top pad is a multiple of 4, output stays packed, and type == 0.
if (top % 4 == 0 && out_elempack == 4 && type == 0)
91
// NOTE(review): as above, no top_blob.empty() check is visible after this
// create() in the excerpt -- confirm in the full file.
top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
95
__m128 pad_value = __lsx_vreplfr2vr_s(value);
96
// Vertical pads are passed in packed rows (top / 4, bottom / 4); left/right are
// passed unchanged.
padding_constant_pack4_lsx(bottom_blob, top_blob, top / 4, bottom / 4, left, right, pad_value);
104
// --- Channel-packed case (presumably the dims == 3 branch -- TODO confirm) ---
int outw = w + left + right;
105
int outh = h + top + bottom;
106
// Channels are the packed axis: channels * elempack scalar channels plus
// front/behind.
int outc = channels * elempack + front + behind;
108
int out_elempack = outc % 4 == 0 ? 4 : 1;
109
size_t out_elemsize = elemsize / elempack * out_elempack;
111
// Packed path requires: front pad a multiple of 4, packed output, and -- if the
// channel count actually changes -- type == 0.
if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0))
113
top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
114
// Allocation-failure check; the statement it guards is elided in this excerpt.
if (top_blob.empty())
117
// Front padding expressed in packed input channels.
int front_ = front / elempack;
118
#pragma omp parallel for num_threads(opt.num_threads)
119
// One iteration per packed output channel.
for (int q = 0; q < outc / out_elempack; q++)
121
Mat borderm = top_blob.channel(q);
123
// When per_channel_pad_data_size is non-zero, load the pad vector from
// per_channel_pad_data at float offset q * 4; otherwise use the scalar `value`.
__m128 pad_value = per_channel_pad_data_size ? (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value);
125
// Output channel q corresponds to input channel q - front_; when that index is
// outside [0, channels) the whole output channel is filled with the pad value.
if ((q - front_) < 0 || (q - front_) >= channels)
127
borderm.fill(pad_value);
131
const Mat m = bottom_blob.channel(q - front_);
133
// Constant / replicate / reflect helpers follow. The conditions selecting between
// them are not visible in this excerpt (presumably keyed on `type` -- TODO confirm).
padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value);
135
padding_replicate_pack4_lsx(m, borderm, top, bottom, left, right);
137
padding_reflect_pack4_lsx(m, borderm, top, bottom, left, right);
147
// --- Depth case (presumably the dims == 4 branch -- TODO confirm) ---
int outw = w + left + right;
148
int outh = h + top + bottom;
149
// Here front/behind are added to the depth axis rather than to channels.
int outd = d + front + behind;
153
// Channel count, elemsize and elempack are carried over from the input; only
// w/h/d grow.
top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
154
// Allocation-failure check; the statement it guards is elided in this excerpt.
if (top_blob.empty())
157
#pragma omp parallel for num_threads(opt.num_threads)
158
for (int q = 0; q < channels; q++)
160
// Same pad-vector selection as the channel-packed case, indexed by channel q.
__m128 pad_value = per_channel_pad_data_size ? (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value);
162
for (int z = 0; z < outd; z++)
164
Mat borderm = top_blob.channel(q).depth(z);
167
// Output depth slice z corresponds to input slice z - front; slices outside
// [0, d) are filled entirely with the pad value.
if ((z - front) < 0 || (z - front) >= d)
169
borderm.fill(pad_value);
173
const Mat m = bottom_blob.channel(q).depth(z - front);
174
// Only the constant-padding helper is visible for this case.
padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value);
185
// --- Generic fallback: unpack, run the base implementation, repack ---
Mat bottom_blob_unpacked = bottom_blob;
188
// The unpacking conversion is given an Option whose blob_allocator is the
// workspace allocator.
Option opt_pack1 = opt;
189
opt_pack1.blob_allocator = opt.workspace_allocator;
191
// Convert the input to elempack 1.
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
194
Mat top_blob_unpacked;
195
// Base-class padding on the unpacked data; how `ret` is handled is not visible
// in this excerpt.
int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt);
199
int out_elempack = 1;
201
// Repack to 4 only when packing layout is enabled and the unpacked result's
// channel count is a multiple of 4.
if (opt.use_packing_layout)
203
out_elempack = top_blob_unpacked.c % 4 == 0 ? 4 : 1;
207
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
212
// Int8 counterpart of forward(): pads bottom_blob into top_blob using pack-8
// helpers where the layout allows, otherwise falling back to the base
// Padding::forward on unpacked data.
// NOTE(review): this listing is an excerpt -- bare numbers are interleaved original
// line numbers, and braces, returns and most branch headers are elided. The
// grouping described in the comments below is inferred from the visible statements;
// confirm the exact control flow against the full file.
int Padding_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
214
// Cache the input geometry and element layout.
int w = bottom_blob.w;
215
int h = bottom_blob.h;
216
int d = bottom_blob.d;
217
int channels = bottom_blob.c;
218
int dims = bottom_blob.dims;
219
size_t elemsize = bottom_blob.elemsize;
220
int elempack = bottom_blob.elempack;
227
// --- Width-packed case (presumably the dims == 1 branch -- TODO confirm) ---
// The packed width is expanded to scalar elements before left/right are added.
int outw = w * elempack + left + right;
229
// Keep 8-element packing only when the padded width is a multiple of 8.
int out_elempack = outw % 8 == 0 ? 8 : 1;
230
// Per-scalar size (elemsize / elempack) scaled by the output packing.
size_t out_elemsize = elemsize / elempack * out_elempack;
232
// Fast path: left pad is a multiple of 8, output stays packed, and type == 0.
if (left % 8 == 0 && out_elempack == 8 && type == 0)
234
top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
235
// Allocation-failure check; the statement it guards is elided in this excerpt.
if (top_blob.empty())
238
// Build a 64-bit word holding the pad value in each of its 8 bytes.
// NOTE(review): v8 is not masked to 8 bits before being shifted and OR-ed, so a
// `value` outside 0..255 (e.g. negative) would spill into neighbouring byte
// lanes, and left-shifting a negative signed integer is undefined before C++20 --
// confirm the intended range of `value`.
int64_t v8 = (int64_t)value;
239
int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
240
// Horizontal pads are passed in packed units (left / 8, right / 8); the two zeros
// occupy the top/bottom argument positions used by the later calls.
padding_constant_pack8_int8_lsx(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value);
248
// --- Height-packed case (presumably the dims == 2 branch -- TODO confirm) ---
int outw = w + left + right;
249
// Height is the packed axis here: h * elempack scalar rows plus top/bottom.
int outh = h * elempack + top + bottom;
251
int out_elempack = outh % 8 == 0 ? 8 : 1;
252
size_t out_elemsize = elemsize / elempack * out_elempack;
254
// Fast path: top pad is a multiple of 8, output stays packed, and type == 0.
if (top % 8 == 0 && out_elempack == 8 && type == 0)
256
top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
257
// Allocation-failure check; the statement it guards is elided in this excerpt.
if (top_blob.empty())
260
// Same byte-replication as above (see the NOTE(review) there regarding values
// outside 0..255).
int64_t v8 = (int64_t)value;
261
int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
262
// Vertical pads are passed in packed rows (top / 8, bottom / 8); left/right are
// passed unchanged.
padding_constant_pack8_int8_lsx(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value);
270
// --- Channel-packed case (presumably the dims == 3 branch -- TODO confirm) ---
int outw = w + left + right;
271
int outh = h + top + bottom;
272
// Channels are the packed axis: channels * elempack scalar channels plus
// front/behind.
int outc = channels * elempack + front + behind;
274
int out_elempack = outc % 8 == 0 ? 8 : 1;
275
size_t out_elemsize = elemsize / elempack * out_elempack;
277
// Packed path requires: front pad a multiple of 8, packed output, and -- if the
// channel count actually changes -- type == 0.
if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
279
top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
280
// Allocation-failure check; the statement it guards is elided in this excerpt.
if (top_blob.empty())
283
// Front padding expressed in packed input channels.
int front_ = front / elempack;
284
#pragma omp parallel for num_threads(opt.num_threads)
285
// One iteration per packed output channel.
for (int q = 0; q < outc / out_elempack; q++)
287
Mat borderm = top_blob.channel(q);
291
// Pad word derived from the scalar `value` only; unlike forward(), no use of
// per_channel_pad_data is visible here. The unmasked-byte caveat applies:
// NOTE(review): a `value` outside 0..255 would corrupt neighbouring byte lanes.
int64_t v8 = (int64_t)value;
292
int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
295
// Output channel q corresponds to input channel q - front_; when that index is
// outside [0, channels) the whole output channel is filled with the pad value.
if ((q - front_) < 0 || (q - front_) >= channels)
297
borderm.fill(pad_value);
301
const Mat m = bottom_blob.channel(q - front_);
303
// Constant / replicate / reflect helpers follow. The conditions selecting between
// them are not visible in this excerpt (presumably keyed on `type` -- TODO confirm).
padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value);
305
padding_replicate_pack8_int8_lsx(m, borderm, top, bottom, left, right);
307
padding_reflect_pack8_int8_lsx(m, borderm, top, bottom, left, right);
317
// --- Depth case (presumably the dims == 4 branch -- TODO confirm) ---
int outw = w + left + right;
318
int outh = h + top + bottom;
319
// Here front/behind are added to the depth axis rather than to channels.
int outd = d + front + behind;
323
// Channel count, elemsize and elempack are carried over from the input; only
// w/h/d grow.
top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
324
// Allocation-failure check; the statement it guards is elided in this excerpt.
if (top_blob.empty())
327
#pragma omp parallel for num_threads(opt.num_threads)
328
for (int q = 0; q < channels; q++)
332
// Same byte-replicated pad word as in the earlier branches; it does not depend
// on q in the visible code.
int64_t v8 = (int64_t)value;
333
int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
335
for (int z = 0; z < outd; z++)
337
Mat borderm = top_blob.channel(q).depth(z);
340
// Output depth slice z corresponds to input slice z - front; slices outside
// [0, d) are filled entirely with the pad value.
if ((z - front) < 0 || (z - front) >= d)
342
borderm.fill(pad_value);
346
const Mat m = bottom_blob.channel(q).depth(z - front);
347
// Only the constant-padding helper is visible for this case.
padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value);
358
// --- Generic fallback: unpack, run the base implementation, repack ---
Mat bottom_blob_unpacked = bottom_blob;
361
// The unpacking conversion is given an Option whose blob_allocator is the
// workspace allocator.
Option opt_pack1 = opt;
362
opt_pack1.blob_allocator = opt.workspace_allocator;
364
// Convert the input to elempack 1.
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
367
Mat top_blob_unpacked;
368
// Base-class padding on the unpacked data; how `ret` is handled is not visible
// in this excerpt.
int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt);
372
int out_elempack = 1;
374
// Repack to 8 only when packing layout is enabled and the unpacked result's
// channel count is a multiple of 8.
if (opt.use_packing_layout)
376
out_elempack = top_blob_unpacked.c % 8 == 0 ? 8 : 1;
380
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);