1
#include "caffe2/core/common.h"
3
#if defined(C10_MOBILE) && defined(CAFFE2_USE_MPSCNN_TEST)
5
#include "mpscnn_context.h"
6
#include "mpscnn_graph_mask.h"
8
#include "caffe2/core/logging.h"
9
#include "caffe2/core/operator_schema.h"
10
#include "caffe2/core/workspace.h"
11
#include "caffe2/utils/math.h"
12
#include "caffe2/utils/proto_utils.h"
14
#import <UIKit/UIDevice.h>
16
#define SYSTEM_VERSION_GREATER_THAN_OR_EQUAL_TO(v) \
17
([[[UIDevice currentDevice] systemVersion] \
19
options:NSNumericSearch] != NSOrderedAscending)
23
/* Utility functions for operator definition */
24
void add_arg_int(OperatorDef& op, string name, int value) {
25
auto& arg = *(op.add_arg());
30
void add_arg_str(OperatorDef& op, string name, string value) {
31
auto& arg = *(op.add_arg());
36
void add_arg_float(OperatorDef& op, string name, float value) {
37
auto& arg = *(op.add_arg());
44
std::vector<string> names,
45
std::vector<int> values) {
46
CAFFE_ENFORCE_EQ(names.size(), values.size());
47
for (auto i = 0; i < names.size(); i++) {
48
add_arg_int(op, names[i], values[i]);
54
std::vector<string> names,
55
std::vector<string> values) {
56
CAFFE_ENFORCE_EQ(names.size(), values.size());
57
for (auto i = 0; i < names.size(); i++) {
58
add_arg_str(op, names[i], values[i]);
62
void add_inputs(OperatorDef& op, std::vector<string> inputs) {
63
for (auto i = 0; i < inputs.size(); i++) {
64
op.add_input(inputs[i]);
68
void add_outputs(OperatorDef& op, std::vector<string> outputs) {
69
for (auto i = 0; i < outputs.size(); i++) {
70
op.add_output(outputs[i]);
79
for (const auto C : std::vector<size_t>{1, 2, 3, 4, 8, 11, 12}) {
80
for (const auto H : std::vector<size_t>{1, 7, 15, 39}) {
81
for (const auto W : std::vector<size_t>{1, 7, 15, 39}) {
82
for (const auto N : std::vector<size_t>{1, 2}) {
83
for (const auto BS : std::vector<size_t>{1, 2}) {
84
LOG(INFO) << "MPSCNNCopyFrom/To Test";
85
auto mtl = [&](size_t i) {
86
return std::string("X_mtl_") + std::to_string(i);
88
auto cpu = [&](size_t i) {
89
return std::string("X_cpu_") + std::to_string(i);
91
auto y_cpu = [&](size_t i) {
92
return std::string("Y_cpu_") + std::to_string(i);
96
for (auto i = 0; i < N; ++i) {
97
auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU);
98
t->Resize(BS, C, H, W);
100
math::RandGaussian<float, CPUContext>(
101
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
106
auto& op = *(netdef.add_op());
107
op.set_type("CopyToMPSCNN");
108
for (auto i = 0; i < N; ++i) {
109
op.add_input(cpu(i));
110
op.add_output(mtl(i));
114
auto& op = *(netdef.add_op());
115
op.set_type("CopyFromMPSCNN");
116
for (auto i = 0; i < N; ++i) {
117
op.add_input(mtl(i));
118
op.add_output(y_cpu(i));
122
ws.RunNetOnce(netdef);
123
for (auto i = 0; i < N; ++i) {
124
const auto& t1 = ws.GetBlob(cpu(i))->Get<TensorCPU>();
125
const auto& t2 = ws.GetBlob(y_cpu(i))->Get<TensorCPU>();
126
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
127
for (auto i = 0; i < t1.size(); ++i) {
128
// FP16 <-> FP32 round trip.
129
TORCH_CHECK_NEAR(t1.data<float>()[i], t2.data<float>()[i], 1e-2);
140
for (const auto ndim : std::vector<size_t>{1, 2, 3, 4}) {
141
for (const auto N : std::vector<size_t>{1, 2}) {
142
LOG(INFO) << "MPSCNNCopyFrom/To ndim Test";
143
auto mtl = [&](size_t i) {
144
return std::string("X_mtl_") + std::to_string(i);
146
auto cpu = [&](size_t i) {
147
return std::string("X_cpu_") + std::to_string(i);
149
auto y_cpu = [&](size_t i) {
150
return std::string("Y_cpu_") + std::to_string(i);
154
for (auto i = 0; i < N; ++i) {
155
auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU);
167
t->Resize(5, 3, 4, 2);
171
math::RandGaussian<float, CPUContext>(
172
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
177
auto& op = *(netdef.add_op());
178
op.set_type("CopyToMPSCNN");
179
for (auto i = 0; i < N; ++i) {
180
op.add_input(cpu(i));
181
op.add_output(mtl(i));
185
auto& op = *(netdef.add_op());
186
op.set_type("CopyFromMPSCNN");
187
for (auto i = 0; i < N; ++i) {
188
op.add_input(mtl(i));
189
op.add_output(y_cpu(i));
193
ws.RunNetOnce(netdef);
194
for (auto i = 0; i < N; ++i) {
195
const auto& t1 = ws.GetBlob(cpu(i))->Get<TensorCPU>();
196
const auto& t2 = ws.GetBlob(y_cpu(i))->Get<TensorCPU>();
197
CAFFE_ENFORCE_EQ(t1.size(), t2.size());
198
for (auto i = 0; i < t1.size(); ++i) {
199
// FP16 <-> FP32 round trip.
200
TORCH_CHECK_NEAR(t1.data<float>()[i], t2.data<float>()[i], 1e-2);
208
for (const auto& batch_size : std::vector<int>{{1, 2}}) {
209
for (const auto& channels : std::vector<int>{{3, 8}}) {
210
LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: ";
213
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
214
t->Resize(batch_size, channels, 8, 13);
216
math::RandGaussian<float, CPUContext>(
217
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
221
auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
222
t->Resize(1, channels);
224
math::RandGaussian<float, CPUContext>(
225
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
228
auto* t = BlobGetMutableTensor(ws.CreateBlob("stddev"), CPU);
229
t->Resize(1, channels);
231
math::RandUniform<float, CPUContext>(
232
t->size(), 0.5, 1.5, t->mutable_data<float>(), &ctx);
237
auto& op = *(netdef.add_op());
238
op.set_type("CopyToMPSCNN");
239
op.add_input("X_cpu");
240
op.add_output("X_mtl");
244
auto& op = *(netdef.add_op());
245
op.set_type("MPSCNNNormalizePlanarYUV");
246
op.add_input("X_mtl");
247
op.add_input("mean");
248
op.add_input("stddev");
249
op.add_output("Y_mtl");
253
auto& op = *(netdef.add_op());
254
op.set_type("CopyFromMPSCNN");
255
op.add_input("Y_mtl");
256
op.add_output("Y_cpu");
260
auto& op = *(netdef.add_op());
261
op.set_type("NormalizePlanarYUV");
262
op.add_input("X_cpu");
263
op.add_input("mean");
264
op.add_input("stddev");
265
op.add_output("Y_ref");
268
ws.RunNetOnce(netdef);
269
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
270
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
272
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
273
for (auto i = 0; i < t1.size(); ++i) {
274
// FP16 <-> FP32 round trip, accumulation, etc.
275
const float t1_i = t1.data<float>()[i];
276
const float t2_i = t2.data<float>()[i];
277
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
284
LOG(INFO) << "MPSCNNInstanceNorm Test";
285
enum class PreluTy { NONE, CHANNEL, SHARED };
286
for (const auto batchSize : {1, 2}) {
287
for (const auto channels : {3, 8}) {
288
for (const auto prelu :
289
{PreluTy::NONE, PreluTy::CHANNEL, PreluTy::SHARED}) {
290
for (const auto dim : {10, 40}) {
293
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
294
t->Resize(batchSize, channels, dim, dim);
297
math::RandGaussian<float, CPUContext>(
298
t->size(), 0, 3, t->mutable_data<float>(), &ctx);
302
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
305
for (auto i = 0; i < t->size(); ++i) {
306
t->mutable_data<float>()[i] = i;
309
// math::RandGaussian<float, CPUContext>(t->size(), 0, 1,
310
// t->mutable_data<float>(), &ctx);
313
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
316
for (auto i = 0; i < t->size(); ++i) {
317
t->mutable_data<float>()[i] = 8 - 2 * i;
320
// math::RandGaussian<float, CPUContext>(t->size(), 0, 1,
321
// t->mutable_data<float>(), &ctx);
324
auto* t = BlobGetMutableTensor(ws.CreateBlob("pw"), CPU);
325
t->Resize(prelu == PreluTy::SHARED ? 1 : channels);
328
math::RandGaussian<float, CPUContext>(
329
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
334
auto& op = *(netdef.add_op());
335
op.set_type("CopyToMPSCNN");
336
op.add_input("X_cpu");
337
op.add_output("X_mtl");
341
auto& op = *(netdef.add_op());
343
prelu == PreluTy::NONE ? "MPSCNNInstanceNorm"
344
: "MPSCNNInstanceNormPRelu");
345
op.add_input("X_mtl");
348
if (prelu != PreluTy::NONE) {
351
op.add_output("Y_mtl");
355
auto& op = *(netdef.add_op());
356
op.set_type("CopyFromMPSCNN");
357
op.add_input("Y_mtl");
358
op.add_output("Y_cpu");
362
auto& op = *(netdef.add_op());
363
op.set_type("InstanceNorm");
364
op.add_input("X_cpu");
367
auto& arg = *(op.add_arg());
368
arg.set_name("order");
370
op.add_output("Y_ref");
373
if (prelu != PreluTy::NONE) {
374
auto& op = *(netdef.add_op());
375
op.set_type("PRelu");
376
op.add_input("Y_ref");
378
auto& arg = *(op.add_arg());
379
arg.set_name("order");
381
op.add_output("Y_ref");
384
ws.RunNetOnce(netdef);
385
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
386
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
388
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
389
for (auto i = 0; i < t1.size(); ++i) {
390
// FP16 <-> FP32 round trip, accumulation, etc.
391
const float t1_i = t1.data<float>()[i];
392
const float t2_i = t2.data<float>()[i];
393
// Can be larger due to FP errors.
394
constexpr float tol = 5.0e-2;
395
CHECK(std::abs(t1_i - t2_i) <= (tol + tol * std::abs(t1_i)))
396
<< t1_i << ", " << t2_i;
405
for (const auto& shared : std::vector<bool>{{true, false}}) {
406
for (const auto& array : std::vector<bool>{{true, false}}) {
407
for (const auto& batch_size : std::vector<int>{{1, 2}}) {
408
LOG(INFO) << "MPSCNNPRelu Test: " << shared << array << batch_size;
410
const auto channels = array ? 12 : 3;
412
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
413
t->Resize(batch_size, channels, 8, 13);
415
math::RandGaussian<float, CPUContext>(
416
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
420
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
421
t->Resize(shared ? channels : 1);
423
math::RandGaussian<float, CPUContext>(
424
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
429
auto& op = *(netdef.add_op());
430
op.set_type("CopyToMPSCNN");
431
op.add_input("X_cpu");
432
op.add_output("X_mtl");
436
auto& op = *(netdef.add_op());
437
op.set_type("MPSCNNPRelu");
438
op.add_input("X_mtl");
440
op.add_output("Y_mtl");
444
auto& op = *(netdef.add_op());
445
op.set_type("CopyFromMPSCNN");
446
op.add_input("Y_mtl");
447
op.add_output("Y_cpu");
451
auto& op = *(netdef.add_op());
452
op.set_type("PRelu");
453
op.add_input("X_cpu");
455
auto& arg = *(op.add_arg());
456
arg.set_name("order");
458
op.add_output("Y_ref");
461
ws.RunNetOnce(netdef);
462
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
463
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
465
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
466
for (auto i = 0; i < t1.size(); ++i) {
467
// FP16 <-> FP32 round trip, accumulation, etc.
468
const float t1_i = t1.data<float>()[i];
469
const float t2_i = t2.data<float>()[i];
470
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
478
for (const auto& channels : std::vector<size_t>{3, 12, 15}) {
479
for (const auto& batch_size : std::vector<size_t>{1, 2}) {
480
LOG(INFO) << "MPSCNNSpatialBN Test: " << channels;
483
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
484
t->Resize(batch_size, channels, 8, 13);
486
math::RandGaussian<float, CPUContext>(
487
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
490
for (const std::string name : {"scale", "bias", "mean", "var"}) {
491
auto* t = BlobGetMutableTensor(ws.CreateBlob(name), CPU);
494
// High mean to avoid var division by zero.
495
math::RandGaussian<float, CPUContext>(
496
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
498
for (auto i = 0; i < t->size(); ++i) {
499
t->mutable_data<float>()[i] =
500
std::abs(t->mutable_data<float>()[i]) + 0.5;
507
auto& op = *(netdef.add_op());
508
op.set_type("CopyToMPSCNN");
509
op.add_input("X_cpu");
510
op.add_output("X_mtl");
514
auto& op = *(netdef.add_op());
515
op.set_type("MPSCNNSpatialBN");
516
op.add_input("X_mtl");
517
op.add_input("scale");
518
op.add_input("bias");
519
op.add_input("mean");
522
auto& arg = *(op.add_arg());
523
arg.set_name(OpSchema::Arg_IsTest);
527
op.add_output("Y_mtl");
531
auto& op = *(netdef.add_op());
532
op.set_type("CopyFromMPSCNN");
533
op.add_input("Y_mtl");
534
op.add_output("Y_cpu");
538
auto& op = *(netdef.add_op());
539
op.set_type("SpatialBN");
540
op.add_input("X_cpu");
541
op.add_input("scale");
542
op.add_input("bias");
543
op.add_input("mean");
546
auto& arg = *(op.add_arg());
547
arg.set_name(OpSchema::Arg_IsTest);
551
op.add_output("Y_ref");
554
ws.RunNetOnce(netdef);
555
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
556
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
558
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
559
for (auto i = 0; i < t1.size(); ++i) {
560
// FP16 <-> FP32 round trip, accumulation, etc.
561
const float t1_i = t1.data<float>()[i];
562
const float t2_i = t2.data<float>()[i];
563
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
570
for (const auto& batchSize : std::vector<size_t>{2, 1}) {
571
for (const auto& H : std::vector<size_t>{1, 8}) {
572
for (const auto& W : std::vector<size_t>{1, 8}) {
573
for (const auto& CIn : std::vector<size_t>{1, 12, 224}) {
574
for (const auto& COut : std::vector<size_t>{1, 12, 224}) {
575
LOG(INFO) << "MPSCNNFC Test";
578
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
579
t->Resize(batchSize, CIn, H, W);
581
math::RandGaussian<float, CPUContext>(
582
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
586
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
587
t->Resize(COut, CIn * H * W);
589
math::RandGaussian<float, CPUContext>(
590
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
594
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
597
math::RandGaussian<float, CPUContext>(
598
t->size(), 0, 0.0001, t->mutable_data<float>(), &ctx);
603
auto& op = *(netdef.add_op());
604
op.set_type("CopyToMPSCNN");
605
op.add_input("X_cpu");
606
op.add_output("X_mtl");
610
auto& op = *(netdef.add_op());
611
op.set_type("MPSCNNFC");
612
op.add_input("X_mtl");
615
op.add_output("Y_mtl");
619
auto& op = *(netdef.add_op());
620
op.set_type("CopyFromMPSCNN");
621
op.add_input("Y_mtl");
622
op.add_output("Y_cpu");
625
auto& op = *(netdef.add_op());
627
op.add_input("X_cpu");
630
auto& arg = *(op.add_arg());
631
arg.set_name("order");
633
op.add_output("Y_ref");
636
ws.RunNetOnce(netdef);
637
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
638
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
639
CAFFE_ENFORCE_EQ(t2.ndim(), 4);
640
CAFFE_ENFORCE_EQ(t1.ndim(), 2);
641
CAFFE_ENFORCE(t2.dim32(2) == 1 && t2.dim32(3) == 1);
642
const_cast<TensorCPU&>(t2).Reshape(
643
std::vector<int64_t>{int64_t(batchSize), int64_t(COut)});
644
// Note dims do not match, as Metal leaves a 1x1 spatial
646
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
648
for (auto i = 0; i < t1.size(); ++i) {
649
// FP16 <-> FP32 round trip, accumulation, etc.
650
const float t1_i = t1.data<float>()[i];
651
const float t2_i = t2.data<float>()[i];
652
// LOG(INFO) << "i: " << i << ", cpu: " << t1_i << ", mtl: " <<
654
TORCH_CHECK_NEAR(t1_i, t2_i, 0.7);
664
for (const auto& pool : {"MaxPool", "AveragePool"}) {
665
for (const auto& global_pooling : {true, false}) {
666
for (const auto& batchSize : std::vector<size_t>{1, 2}) {
667
for (const auto& stride_h : std::vector<int>{1, 2, 3}) {
668
for (const auto& stride_w : std::vector<int>{1, 2, 3}) {
669
for (const auto& kernel_h : std::vector<int>{1, 3, 5}) {
670
for (const auto& kernel_w : std::vector<int>{1, 3, 5}) {
671
for (const auto& pad_l : std::vector<int>{0, kernel_w / 2}) {
672
for (const auto& pad_r :
673
std::vector<int>{0, kernel_w / 2}) {
674
for (const auto& pad_t :
675
std::vector<int>{0, kernel_h / 2}) {
676
for (const auto& pad_b :
677
std::vector<int>{0, kernel_h / 2}) {
678
// Waiting response from Apple
679
if (kernel_h != kernel_w) {
682
LOG(INFO) << "MPSCNNPool Test: " << pool;
685
auto* t = BlobGetMutableTensor(
686
ws.CreateBlob("X_cpu"), CPU);
687
t->Resize(batchSize, 8, 8, 13);
689
math::RandGaussian<float, CPUContext>(
693
t->mutable_data<float>(),
698
#define ADD_ARGS(op) \
700
if (global_pooling) { \
701
add_arg_int(op, "stride", 1); \
705
std::vector<string>{"pad_l", \
713
std::vector<int>{pad_l, \
722
add_arg_int(op, "global_pooling", global_pooling); \
725
auto& op = *(netdef.add_op());
726
op.set_type("CopyToMPSCNN");
727
op.add_input("X_cpu");
728
op.add_output("X_mtl");
732
auto& op = *(netdef.add_op());
733
op.set_type(std::string("MPSCNN") + pool);
734
op.add_input("X_mtl");
736
op.add_output("Y_mtl");
740
auto& op = *(netdef.add_op());
741
op.set_type("CopyFromMPSCNN");
742
op.add_input("Y_mtl");
743
op.add_output("Y_cpu");
747
auto& op = *(netdef.add_op());
749
op.add_input("X_cpu");
751
op.add_output("Y_ref");
755
ws.RunNetOnce(netdef);
757
ws.GetBlob("Y_cpu")->Get<TensorCPU>();
759
ws.GetBlob("Y_ref")->Get<TensorCPU>();
761
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
762
for (auto i = 0; i < t1.size(); ++i) {
763
// FP16 <-> FP32 round trip, accumulation, etc.
764
const float t1_i = t1.data<float>()[i];
765
const float t2_i = t2.data<float>()[i];
766
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
782
LOG(INFO) << "MPSCNNPadImage Test";
783
for (const auto dims :
784
std::vector<std::vector<size_t>>{{1, 3, 50, 80}, {1, 12, 50, 80}}) {
787
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
790
math::RandGaussian<float, CPUContext>(
791
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
796
auto& op = *(netdef.add_op());
797
op.set_type("CopyToMPSCNN");
798
op.add_input("X_cpu");
799
op.add_output("X_mtl");
803
auto& op = *(netdef.add_op());
804
op.set_type("MPSCNNPadImage");
805
op.add_input("X_mtl");
807
auto& arg = *(op.add_arg());
812
auto& arg = *(op.add_arg());
813
arg.set_name("mode");
814
arg.set_s("reflect");
816
op.add_output("Y_mtl");
820
auto& op = *(netdef.add_op());
821
op.set_type("CopyFromMPSCNN");
822
op.add_input("Y_mtl");
823
op.add_output("Y_cpu");
827
auto& op = *(netdef.add_op());
828
op.set_type("PadImage");
829
op.add_input("X_cpu");
831
auto& arg = *(op.add_arg());
836
auto& arg = *(op.add_arg());
837
arg.set_name("mode");
838
arg.set_s("reflect");
840
op.add_output("Y_ref");
843
ws.RunNetOnce(netdef);
844
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
845
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
847
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
848
for (auto i = 0; i < t1.size(); ++i) {
849
// FP16 <-> FP32 round trip, accumulation, etc.
850
const float t1_i = t1.data<float>()[i];
851
const float t2_i = t2.data<float>()[i];
852
// LOG(INFO) << "i: " << i << ", " << "CPU: " << t1_i << ", MTL: " <<
854
TORCH_CHECK_NEAR(t1_i, t2_i, 0.01);
860
LOG(INFO) << "MPSCNNPreprocess Test";
863
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
864
t->Resize(1, 8, 13, 4);
866
for (auto i = 0; i < t->size(); ++i) {
867
t->mutable_data<uint8_t>()[i] = rand() % 255;
872
auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
875
t->mutable_data<float>()[0] = 100;
876
t->mutable_data<float>()[1] = 50;
877
t->mutable_data<float>()[2] = 150;
883
auto& op = *(netdef.add_op());
884
op.set_type("MPSCNNPackedInt8BGRANHWCToNCHWCStylizerPreprocess");
885
op.add_input("X_cpu");
886
op.add_input("mean");
888
auto& arg = *(op.add_arg());
889
arg.set_name("noise_std");
893
auto& arg = *(op.add_arg());
894
arg.set_name("noise_size");
898
op.add_output("Y_mtl");
902
auto& op = *(netdef.add_op());
903
op.set_type("CopyFromMPSCNN");
904
op.add_input("Y_mtl");
905
op.add_output("Y_cpu");
909
auto& op = *(netdef.add_op());
910
op.set_type("PackedInt8BGRANHWCToNCHWCStylizerPreprocess");
911
op.add_input("X_cpu");
912
op.add_input("mean");
914
auto& arg = *(op.add_arg());
915
arg.set_name("noise_std");
919
auto& arg = *(op.add_arg());
920
arg.set_name("noise_size");
923
op.add_output("Y_ref");
926
ws.RunNetOnce(netdef);
927
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
928
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
930
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
931
for (auto i = 0; i < t1.size(); ++i) {
932
// FP16 <-> FP32 round trip, accumulation, etc.
933
const float t1_i = t1.data<float>()[i];
934
const float t2_i = t2.data<float>()[i];
935
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
940
LOG(INFO) << "MPSCNNDeprocess Test";
943
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
944
t->Resize(1, 3, 8, 24);
946
for (auto i = 0; i < t->size(); ++i) {
947
t->mutable_data<float>()[i] = rand() % 255;
952
auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
955
t->mutable_data<float>()[0] = 100;
956
t->mutable_data<float>()[1] = 50;
957
t->mutable_data<float>()[2] = 150;
963
auto& op = *(netdef.add_op());
964
op.set_type("CopyToMPSCNN");
965
op.add_input("X_cpu");
966
op.add_output("X_mtl");
970
auto& op = *(netdef.add_op());
971
op.set_type("MPSCNNBRGNCHWCToPackedInt8BGRAStylizerDeprocess");
972
op.add_input("X_mtl");
973
op.add_input("mean");
974
op.add_output("Y_cpu");
978
auto& op = *(netdef.add_op());
979
op.set_type("BRGNCHWCToPackedInt8BGRAStylizerDeprocess");
980
op.add_input("X_cpu");
981
op.add_input("mean");
982
op.add_output("Y_ref");
985
ws.RunNetOnce(netdef);
986
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
987
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
989
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
990
for (auto i = 0; i < t1.size(); ++i) {
991
// FP16 <-> FP32 round trip, accumulation, etc.
992
const float t1_i = t1.data<uint8_t>()[i];
993
const float t2_i = t2.data<uint8_t>()[i];
994
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
999
LOG(INFO) << "MPSCNNDeprocess Test";
1002
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
1003
t->Resize(1, 3, 1280, 720);
1005
for (auto i = 0; i < t->size(); ++i) {
1006
t->mutable_data<float>()[i] = rand() % 1000 - 500;
1011
auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
1014
t->mutable_data<float>()[0] = 30;
1015
t->mutable_data<float>()[1] = 40;
1016
t->mutable_data<float>()[2] = 50;
1022
auto& op = *(netdef.add_op());
1023
op.set_type("CopyToMPSCNN");
1024
op.add_input("X_cpu");
1025
op.add_output("X_mtl");
1029
auto& op = *(netdef.add_op());
1030
op.set_type("MPSCNNBRGNCHWCToPackedInt8BGRAStylizerDeprocess");
1031
op.add_input("X_mtl");
1032
op.add_input("mean");
1033
op.add_output("Y_cpu");
1037
auto& op = *(netdef.add_op());
1038
op.set_type("BRGNCHWCToPackedInt8BGRAStylizerDeprocess");
1039
op.add_input("X_cpu");
1040
op.add_input("mean");
1041
op.add_output("Y_ref");
1044
ws.RunNetOnce(netdef);
1045
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1046
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1048
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1049
for (auto i = 0; i < t1.size(); ++i) {
1050
// FP16 <-> FP32 round trip, accumulation, etc.
1051
const float t1_i = t1.data<uint8_t>()[i];
1052
const float t2_i = t2.data<uint8_t>()[i];
1053
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
1058
for (const auto& batchSize : std::vector<int>{1, 2}) {
1059
for (const auto& stride_h : std::vector<int>{1, 2, 3}) {
1060
for (const auto& stride_w : std::vector<int>{1, 2, 3}) {
1061
for (const auto& kernel_h : std::vector<int>{1, 3, 8}) {
1062
for (const auto& kernel_w : std::vector<int>{1, 3, 8}) {
1063
for (const auto& pad_l : std::vector<int>{0, kernel_w / 2}) {
1064
for (const auto& pad_r : std::vector<int>{0, kernel_w / 2}) {
1065
for (const auto& pad_t : std::vector<int>{0, kernel_h / 2}) {
1066
for (const auto& pad_b :
1067
std::vector<int>{0, kernel_h / 2}) {
1068
// Waiting response from Apple
1069
if (kernel_h != kernel_w) {
1072
LOG(INFO) << "MPSCNNConv Test";
1076
BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
1077
t->Resize(batchSize, 12, 57, 72);
1079
math::RandGaussian<float, CPUContext>(
1080
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1084
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
1085
t->Resize(8, 12, kernel_h, kernel_w);
1087
math::RandGaussian<float, CPUContext>(
1088
8 * 12 * kernel_h * kernel_w,
1091
t->mutable_data<float>(),
1096
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
1099
math::RandGaussian<float, CPUContext>(
1100
8, 0, 1, t->mutable_data<float>(), &ctx);
1104
#define ADD_ARGS(op) \
1106
add_arg_str(op, "order", "NCHW"); \
1109
std::vector<string>{"stride_h", \
1117
std::vector<int>{stride_h, \
1127
auto& op = *(netdef.add_op());
1128
op.set_type("CopyToMPSCNN");
1129
op.add_input("X_cpu");
1130
op.add_output("X_mtl");
1134
auto& op = *(netdef.add_op());
1135
op.set_type("MPSCNNConv");
1136
op.add_input("X_mtl");
1140
op.add_output("Y_mtl");
1144
auto& op = *(netdef.add_op());
1145
op.set_type("CopyFromMPSCNN");
1146
op.add_input("Y_mtl");
1147
op.add_output("Y_cpu");
1151
auto& op = *(netdef.add_op());
1152
op.set_type("Conv");
1153
op.add_input("X_cpu");
1157
op.add_output("Y_ref");
1160
ws.RunNetOnce(netdef);
1161
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1162
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1164
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1165
for (auto i = 0; i < t1.size(); ++i) {
1166
// FP16 <-> FP32 round trip, accumulation, etc.
1167
const float t1_i = t1.data<float>()[i];
1168
const float t2_i = t2.data<float>()[i];
1169
TORCH_CHECK_NEAR(t1_i, t2_i, 0.2);
1183
bool runtimeAtLeastIOS11 = SYSTEM_VERSION_GREATER_THAN_OR_EQUAL_TO(@"11.0");
1184
if (runtimeAtLeastIOS11) {
1185
for (const auto& batchSize : std::vector<int>{1, 2}) {
1186
for (const auto& input_channels : std::vector<int>{32, 64, 128, 256}) {
1187
for (const auto& channel_multiplier : std::vector<int>{1}) {
1188
LOG(INFO) << "MPSCNNDepthwiseConv Test";
1190
int output_channels = input_channels * channel_multiplier;
1192
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
1193
t->Resize(batchSize, input_channels, 57, 72);
1195
math::RandGaussian<float, CPUContext>(
1196
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1200
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
1201
t->Resize(output_channels, 1, 3, 3);
1203
math::RandGaussian<float, CPUContext>(
1204
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1208
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
1209
t->Resize(output_channels);
1211
math::RandGaussian<float, CPUContext>(
1212
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1216
#define ADD_ARGS(op) \
1218
add_arg_str(op, "order", "NCHW"); \
1221
std::vector<string>{"stride", "kernel", "group"}, \
1222
std::vector<int>{1, 3, input_channels}); \
1225
auto& op = *(netdef.add_op());
1226
op.set_type("CopyToMPSCNN");
1227
op.add_input("X_cpu");
1228
op.add_output("X_mtl");
1232
auto& op = *(netdef.add_op());
1233
op.set_type("MPSCNNConv");
1234
op.add_input("X_mtl");
1238
op.add_output("Y_mtl");
1242
auto& op = *(netdef.add_op());
1243
op.set_type("CopyFromMPSCNN");
1244
op.add_input("Y_mtl");
1245
op.add_output("Y_cpu");
1249
auto& op = *(netdef.add_op());
1250
op.set_type("Conv");
1251
op.add_input("X_cpu");
1255
op.add_output("Y_ref");
1258
ws.RunNetOnce(netdef);
1259
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1260
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1262
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1263
for (auto i = 0; i < t1.size(); ++i) {
1264
// FP16 <-> FP32 round trip, accumulation, etc.
1265
const float t1_i = t1.data<float>()[i];
1266
const float t2_i = t2.data<float>()[i];
1267
TORCH_CHECK_NEAR(t1_i, t2_i, 0.3);
1276
LOG(INFO) << "MPSCNNConvRelu Test";
1279
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
1280
t->Resize(1, 12, 57, 72);
1282
math::RandGaussian<float, CPUContext>(
1283
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1287
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
1288
t->Resize(8, 12, 3, 3);
1290
math::RandGaussian<float, CPUContext>(
1291
8 * 12 * 3 * 3, 0, 1, t->mutable_data<float>(), &ctx);
1295
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
1298
math::RandGaussian<float, CPUContext>(
1299
8, 0, 1, t->mutable_data<float>(), &ctx);
1304
auto& op = *(netdef.add_op());
1305
op.set_type("CopyToMPSCNN");
1306
op.add_input("X_cpu");
1307
op.add_output("X_mtl");
1311
auto& op = *(netdef.add_op());
1312
op.set_type("MPSCNNConvRelu");
1313
op.add_input("X_mtl");
1317
auto& arg = *(op.add_arg());
1318
arg.set_name("order");
1322
auto& arg = *(op.add_arg());
1323
arg.set_name("kernel");
1327
auto& arg = *(op.add_arg());
1328
arg.set_name("pad");
1331
op.add_output("Y_mtl");
1335
auto& op = *(netdef.add_op());
1336
op.set_type("CopyFromMPSCNN");
1337
op.add_input("Y_mtl");
1338
op.add_output("Y_cpu");
1342
auto& op = *(netdef.add_op());
1343
op.set_type("Conv");
1344
op.add_input("X_cpu");
1348
auto& arg = *(op.add_arg());
1349
arg.set_name("order");
1353
auto& arg = *(op.add_arg());
1354
arg.set_name("kernel");
1358
auto& arg = *(op.add_arg());
1359
arg.set_name("pad");
1362
op.add_output("Y_ref");
1366
auto& op = *(netdef.add_op());
1367
op.set_type("Relu");
1368
op.add_input("Y_ref");
1369
op.add_output("Y_ref");
1372
ws.RunNetOnce(netdef);
1373
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1374
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1376
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1377
for (auto i = 0; i < t1.size(); ++i) {
1378
// FP16 <-> FP32 round trip, accumulation, etc.
1379
const float t1_i = t1.data<float>()[i];
1380
const float t2_i = t2.data<float>()[i];
1381
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
1386
LOG(INFO) << "MPSConv Test";
1389
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
1390
t->Resize(1, 12, 57, 72);
1392
math::RandGaussian<float, CPUContext>(
1393
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1397
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
1398
t->Resize(8, 12, 3, 3);
1400
math::RandGaussian<float, CPUContext>(
1401
8 * 12 * 3 * 3, 0, 1, t->mutable_data<float>(), &ctx);
1405
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
1408
math::RandGaussian<float, CPUContext>(
1409
8, 0, 1, t->mutable_data<float>(), &ctx);
1414
auto& op = *(netdef.add_op());
1415
op.set_type("CopyToMPSCNN");
1416
op.add_input("X_cpu");
1417
op.add_output("X_mtl");
1421
auto& op = *(netdef.add_op());
1422
op.set_type("MPSCNNConv");
1423
op.add_input("X_mtl");
1427
auto& arg = *(op.add_arg());
1428
arg.set_name("order");
1432
auto& arg = *(op.add_arg());
1433
arg.set_name("kernel");
1437
auto& arg = *(op.add_arg());
1438
arg.set_name("pad");
1441
op.add_output("Y_mtl");
1445
auto& op = *(netdef.add_op());
1446
op.set_type("CopyFromMPSCNN");
1447
op.add_input("Y_mtl");
1448
op.add_output("Y_cpu");
1452
auto& op = *(netdef.add_op());
1453
op.set_type("Conv");
1454
op.add_input("X_cpu");
1458
auto& arg = *(op.add_arg());
1459
arg.set_name("order");
1463
auto& arg = *(op.add_arg());
1464
arg.set_name("kernel");
1468
auto& arg = *(op.add_arg());
1469
arg.set_name("pad");
1472
op.add_output("Y_ref");
1475
ws.RunNetOnce(netdef);
1476
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1477
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1479
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1480
for (auto i = 0; i < t1.size(); ++i) {
1481
// FP16 <-> FP32 round trip, accumulation, etc.
1482
const float t1_i = t1.data<float>()[i];
1483
const float t2_i = t2.data<float>()[i];
1484
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
1489
for (const auto& batchSize : {1, 2}) {
1490
for (const auto& C : {1, 2}) {
1491
for (const auto& M : {1, 2}) {
1492
for (const auto& K : {3, 4}) {
1493
for (const auto& P : {1, 2}) {
1494
LOG(INFO) << "MPSConv Test";
1497
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
1498
t->Resize(batchSize, C, 12, 16);
1500
math::RandGaussian<float, CPUContext>(
1501
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1505
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
1506
t->Resize(M, C, K, K);
1508
math::RandGaussian<float, CPUContext>(
1509
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1513
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
1516
math::RandGaussian<float, CPUContext>(
1517
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1522
auto& op = *(netdef.add_op());
1523
op.set_type("CopyToMPSCNN");
1524
op.add_input("X_cpu");
1525
op.add_output("X_mtl");
1529
auto& op = *(netdef.add_op());
1530
op.set_type("MPSCNNConv");
1531
op.add_input("X_mtl");
1535
auto& arg = *(op.add_arg());
1536
arg.set_name("order");
1540
auto& arg = *(op.add_arg());
1541
arg.set_name("kernel");
1545
auto& arg = *(op.add_arg());
1546
arg.set_name("pad");
1549
op.add_output("Y_mtl");
1553
auto& op = *(netdef.add_op());
1554
op.set_type("CopyFromMPSCNN");
1555
op.add_input("Y_mtl");
1556
op.add_output("Y_cpu");
1560
auto& op = *(netdef.add_op());
1561
op.set_type("Conv");
1562
op.add_input("X_cpu");
1566
auto& arg = *(op.add_arg());
1567
arg.set_name("order");
1571
auto& arg = *(op.add_arg());
1572
arg.set_name("kernel");
1576
auto& arg = *(op.add_arg());
1577
arg.set_name("pad");
1580
op.add_output("Y_ref");
1583
ws.RunNetOnce(netdef);
1584
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1585
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1587
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1588
for (auto i = 0; i < t1.size(); ++i) {
1589
// FP16 <-> FP32 round trip, accumulation, etc.
1590
const float t1_i = t1.data<float>()[i];
1591
const float t2_i = t2.data<float>()[i];
1592
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
1602
for (const auto& batchSize : {1, 2}) {
1603
for (const auto& group : {1, 2}) {
1604
for (const auto& C : {8, 16}) {
1605
for (const auto& M : {8, 16}) {
1606
for (const auto& K : {3, 4}) {
1607
for (const auto& P : {1, 2}) {
1608
LOG(INFO) << "MPSCNNConv Test - group";
1611
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
1612
t->Resize(batchSize, C, 12, 16);
1614
math::RandGaussian<float, CPUContext>(
1615
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1619
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
1620
t->Resize(M, C / group, K, K);
1622
math::RandGaussian<float, CPUContext>(
1623
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1627
auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
1630
math::RandGaussian<float, CPUContext>(
1631
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1636
auto& op = *(netdef.add_op());
1637
op.set_type("CopyToMPSCNN");
1638
op.add_input("X_cpu");
1639
op.add_output("X_mtl");
1643
auto& op = *(netdef.add_op());
1644
op.set_type("MPSCNNConv");
1645
op.add_input("X_mtl");
1649
auto& arg = *(op.add_arg());
1650
arg.set_name("order");
1654
auto& arg = *(op.add_arg());
1655
arg.set_name("kernel");
1659
auto& arg = *(op.add_arg());
1660
arg.set_name("pad");
1664
auto& arg = *(op.add_arg());
1665
arg.set_name("group");
1668
op.add_output("Y_mtl");
1672
auto& op = *(netdef.add_op());
1673
op.set_type("CopyFromMPSCNN");
1674
op.add_input("Y_mtl");
1675
op.add_output("Y_cpu");
1679
auto& op = *(netdef.add_op());
1680
op.set_type("Conv");
1681
op.add_input("X_cpu");
1685
auto& arg = *(op.add_arg());
1686
arg.set_name("order");
1690
auto& arg = *(op.add_arg());
1691
arg.set_name("kernel");
1695
auto& arg = *(op.add_arg());
1696
arg.set_name("pad");
1700
auto& arg = *(op.add_arg());
1701
arg.set_name("group");
1704
op.add_output("Y_ref");
1707
ws.RunNetOnce(netdef);
1708
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1709
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1711
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1712
for (auto i = 0; i < t1.size(); ++i) {
1713
// FP16 <-> FP32 round trip, accumulation, etc.
1714
const float t1_i = t1.data<float>()[i];
1715
const float t2_i = t2.data<float>()[i];
1716
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
1727
LOG(INFO) << "MPSCNNMul Test";
1730
auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
1731
t->Resize(1, 12, 57, 72);
1733
math::RandGaussian<float, CPUContext>(
1734
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1738
auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
1741
math::RandGaussian<float, CPUContext>(
1742
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1747
auto& op = *(netdef.add_op());
1748
op.set_type("CopyToMPSCNN");
1749
op.add_input("X0_cpu");
1750
op.add_output("X0_mtl");
1754
auto& op = *(netdef.add_op());
1755
op.set_type("MPSCNNMul");
1756
op.add_input("X0_mtl");
1757
op.add_input("X1_cpu");
1758
op.add_output("Y_mtl");
1759
add_arg_int(op, "broadcast", 1);
1763
auto& op = *(netdef.add_op());
1764
op.set_type("CopyFromMPSCNN");
1765
op.add_input("Y_mtl");
1766
op.add_output("Y_cpu");
1770
auto& op = *(netdef.add_op());
1772
op.add_input("X0_cpu");
1773
op.add_input("X1_cpu");
1774
op.add_output("Y_ref");
1775
add_arg_int(op, "broadcast", 1);
1778
ws.RunNetOnce(netdef);
1779
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1780
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1782
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1783
for (auto i = 0; i < t1.size(); ++i) {
1784
// FP16 <-> FP32 round trip, accumulation, etc.
1785
const float t1_i = t1.data<float>()[i];
1786
const float t2_i = t2.data<float>()[i];
1787
TORCH_CHECK_NEAR(t1_i, t2_i, 0.02);
1792
LOG(INFO) << "MPSCNNSub Test";
1795
auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
1796
t->Resize(1, 12, 57, 72);
1798
math::RandGaussian<float, CPUContext>(
1799
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1803
auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
1806
math::RandGaussian<float, CPUContext>(
1807
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1812
auto& op = *(netdef.add_op());
1813
op.set_type("CopyToMPSCNN");
1814
op.add_input("X0_cpu");
1815
op.add_output("X0_mtl");
1819
auto& op = *(netdef.add_op());
1820
op.set_type("MPSCNNSub");
1821
op.add_input("X0_mtl");
1822
op.add_input("X1_cpu");
1823
op.add_output("Y_mtl");
1824
add_arg_int(op, "broadcast", 1);
1828
auto& op = *(netdef.add_op());
1829
op.set_type("CopyFromMPSCNN");
1830
op.add_input("Y_mtl");
1831
op.add_output("Y_cpu");
1835
auto& op = *(netdef.add_op());
1837
op.add_input("X0_cpu");
1838
op.add_input("X1_cpu");
1839
op.add_output("Y_ref");
1840
add_arg_int(op, "broadcast", 1);
1843
ws.RunNetOnce(netdef);
1844
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1845
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1847
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1848
for (auto i = 0; i < t1.size(); ++i) {
1849
// FP16 <-> FP32 round trip, accumulation, etc.
1850
const float t1_i = t1.data<float>()[i];
1851
const float t2_i = t2.data<float>()[i];
1852
TORCH_CHECK_NEAR(t1_i, t2_i, 0.01);
1857
LOG(INFO) << "MPSAdd Test";
1860
auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
1861
t->Resize(1, 12, 57, 72);
1863
math::RandGaussian<float, CPUContext>(
1864
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1868
auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
1869
t->Resize(1, 12, 57, 72);
1871
math::RandGaussian<float, CPUContext>(
1872
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1877
auto& op = *(netdef.add_op());
1878
op.set_type("CopyToMPSCNN");
1879
op.add_input("X0_cpu");
1880
op.add_output("X0_mtl");
1881
op.add_input("X1_cpu");
1882
op.add_output("X1_mtl");
1886
auto& op = *(netdef.add_op());
1887
op.set_type("MPSCNNAdd");
1888
op.add_input("X0_mtl");
1889
op.add_input("X1_mtl");
1890
op.add_output("Y_mtl");
1894
auto& op = *(netdef.add_op());
1895
op.set_type("CopyFromMPSCNN");
1896
op.add_input("Y_mtl");
1897
op.add_output("Y_cpu");
1901
auto& op = *(netdef.add_op());
1903
op.add_input("X0_cpu");
1904
op.add_input("X1_cpu");
1905
op.add_output("Y_ref");
1908
ws.RunNetOnce(netdef);
1909
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1910
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
1912
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
1913
for (auto i = 0; i < t1.size(); ++i) {
1914
// FP16 <-> FP32 round trip, accumulation, etc.
1915
const float t1_i = t1.data<float>()[i];
1916
const float t2_i = t2.data<float>()[i];
1917
TORCH_CHECK_NEAR(t1_i, t2_i, 0.01);
1922
LOG(INFO) << "MPSAdd Test";
1925
auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
1926
t->Resize(1, 12, 57, 72);
1928
math::RandGaussian<float, CPUContext>(
1929
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1933
auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
1934
t->Resize(1, 12, 57, 72);
1936
math::RandGaussian<float, CPUContext>(
1937
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
1942
auto& op = *(netdef.add_op());
1943
op.set_type("CopyToMPSCNN");
1944
op.add_input("X0_cpu");
1945
op.add_output("X0_mtl");
1946
op.add_input("X1_cpu");
1947
op.add_output("X1_mtl");
1949
// First input is read twice.
1951
auto& arg = *(op.add_arg());
1952
arg.set_name("__mpscnn_read_count__");
1959
auto& op = *(netdef.add_op());
1960
op.set_type("MPSCNNAdd");
1961
op.add_input("X0_mtl");
1962
op.add_input("X1_mtl");
1963
op.add_output("X2_mtl");
1967
auto& op = *(netdef.add_op());
1968
op.set_type("MPSCNNAdd");
1969
op.add_input("X0_mtl");
1970
op.add_input("X2_mtl");
1971
op.add_output("Y_mtl");
1975
auto& op = *(netdef.add_op());
1976
op.set_type("CopyFromMPSCNN");
1977
op.add_input("Y_mtl");
1978
op.add_output("Y_cpu");
1982
auto& op = *(netdef.add_op());
1984
op.add_input("X0_cpu");
1985
op.add_input("X1_cpu");
1986
op.add_output("X2_cpu");
1990
auto& op = *(netdef.add_op());
1992
op.add_input("X0_cpu");
1993
op.add_input("X2_cpu");
1994
op.add_output("Y_ref");
1997
ws.RunNetOnce(netdef);
1998
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
1999
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2001
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2002
for (auto i = 0; i < t1.size(); ++i) {
2003
// FP16 <-> FP32 round trip, accumulation, etc.
2004
const float t1_i = t1.data<float>()[i];
2005
const float t2_i = t2.data<float>()[i];
2006
TORCH_CHECK_NEAR(t1_i, t2_i, 0.05);
2011
for (const auto& n : {"Relu", "Tanh", "Sigmoid"}) {
2012
LOG(INFO) << "MPSCNNNeuron Test: " << n;
2015
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
2016
t->Resize(1, 4, 12, 12);
2018
math::RandGaussian<float, CPUContext>(
2019
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
2024
auto& op = *(netdef.add_op());
2025
op.set_type("CopyToMPSCNN");
2026
op.add_input("X_cpu");
2027
op.add_output("X_mtl");
2031
auto& op = *(netdef.add_op());
2032
op.set_type(std::string("MPSCNN") + n);
2033
op.add_input("X_mtl");
2034
op.add_output("Y_mtl");
2038
auto& op = *(netdef.add_op());
2039
op.set_type("CopyFromMPSCNN");
2040
op.add_input("Y_mtl");
2041
op.add_output("Y_cpu");
2045
auto& op = *(netdef.add_op());
2047
op.add_input("X_cpu");
2048
op.add_output("Y_ref");
2051
ws.RunNetOnce(netdef);
2052
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2053
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2055
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2056
for (auto i = 0; i < t1.size(); ++i) {
2057
// FP16 <-> FP32 round trip, accumulation, etc.
2058
const float t1_i = t1.data<float>()[i];
2059
const float t2_i = t2.data<float>()[i];
2060
TORCH_CHECK_NEAR(t1_i, t2_i, 0.02);
2066
LOG(INFO) << "MPSCNNDropout Test";
2069
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
2070
t->Resize(1, 12, 57, 72);
2072
math::RandGaussian<float, CPUContext>(
2073
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
2078
auto& op = *(netdef.add_op());
2079
op.set_type("CopyToMPSCNN");
2080
op.add_input("X_cpu");
2081
op.add_output("X_mtl");
2085
auto& op = *(netdef.add_op());
2086
op.set_type("MPSCNNDropout");
2087
op.add_input("X_mtl");
2089
auto& arg = *(op.add_arg());
2090
arg.set_name(OpSchema::Arg_IsTest);
2093
op.add_output("Y_mtl");
2094
op.add_output("Y_mask_mtl");
2098
auto& op = *(netdef.add_op());
2099
op.set_type("CopyFromMPSCNN");
2100
op.add_input("Y_mtl");
2101
op.add_output("Y_cpu");
2105
auto& op = *(netdef.add_op());
2106
op.set_type("Dropout");
2107
op.add_input("X_cpu");
2109
auto& arg = *(op.add_arg());
2110
arg.set_name(OpSchema::Arg_IsTest);
2113
op.add_output("Y_ref");
2114
op.add_output("Y_mask");
2117
ws.RunNetOnce(netdef);
2118
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2119
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2120
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2121
LOG(INFO) << t1.sizes();
2122
for (auto i = 0; i < t1.size(); ++i) {
2123
// FP16 <-> FP32 round trip, accumulation, etc.
2124
const float t1_i = t1.data<float>()[i];
2125
const float t2_i = t2.data<float>()[i];
2126
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
2131
for (const auto scale : std::vector<float>{1.0, 2.0, 0.0625}) {
2132
for (const auto channels : std::vector<size_t>{1, 3, 5, 8}) {
2133
for (const auto pool : std::vector<size_t>{1, 3, 7}) {
2134
for (const auto sampling_ratio : std::vector<size_t>{0, 1, 2, 3}) {
2135
LOG(INFO) << "MPSCNNRoIWarp Test - sampling_ratio:"
2136
<< sampling_ratio << "- pool: " << pool
2137
<< " - scale: " << scale;
2140
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
2141
t->Resize(1, channels, 40, 40);
2143
math::RandGaussian<float, CPUContext>(
2144
t->size(), 4, 2, t->mutable_data<float>(), &ctx);
2147
// Use the batch-first encoding (n, [bbox])
2148
auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU);
2150
for (auto i = 0; i < t->dim32(0); ++i) {
2151
t->mutable_data<float>()[5 * i + 0] = 0; // batch
2152
t->mutable_data<float>()[5 * i + 1] = (i % 4 + 1) * 1.0 / scale;
2153
t->mutable_data<float>()[5 * i + 2] = (i % 5 + 1) * 1.0 / scale;
2154
t->mutable_data<float>()[5 * i + 3] = (i % 3 + 7) * 1.0 / scale;
2155
t->mutable_data<float>()[5 * i + 4] = (i % 4 + 7) * 1.0 / scale;
2161
auto& op = *(netdef.add_op());
2162
op.set_type("CopyToMPSCNN");
2163
op.add_input("X_cpu");
2164
op.add_output("X_mtl");
2168
auto& op = *(netdef.add_op());
2169
op.set_type("MPSCNNRoIWarp");
2170
op.add_input("X_mtl");
2173
auto& arg = *(op.add_arg());
2174
arg.set_name("sampling_ratio");
2175
arg.set_i(sampling_ratio);
2178
auto& arg = *(op.add_arg());
2179
arg.set_name("pooled_h");
2183
auto& arg = *(op.add_arg());
2184
arg.set_name("pooled_w");
2188
auto& arg = *(op.add_arg());
2189
arg.set_name("spatial_scale");
2192
op.add_output("Y_mtl");
2196
auto& op = *(netdef.add_op());
2197
op.set_type("CopyFromMPSCNN");
2198
op.add_input("Y_mtl");
2199
op.add_output("Y_cpu");
2203
auto& op = *(netdef.add_op());
2204
op.set_type("RoIWarp");
2205
op.add_input("X_cpu");
2208
auto& arg = *(op.add_arg());
2209
arg.set_name("sampling_ratio");
2210
arg.set_i(sampling_ratio);
2213
auto& arg = *(op.add_arg());
2214
arg.set_name("pooled_h");
2218
auto& arg = *(op.add_arg());
2219
arg.set_name("pooled_w");
2223
auto& arg = *(op.add_arg());
2224
arg.set_name("spatial_scale");
2227
op.add_output("Y_ref");
2230
ws.RunNetOnce(netdef);
2231
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2232
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2234
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2235
LOG(INFO) << t1.sizes();
2236
for (auto i = 0; i < t1.size(); ++i) {
2237
// FP16 <-> FP32 round trip, accumulation, etc.
2238
const float t1_i = t1.data<float>()[i];
2239
const float t2_i = t2.data<float>()[i];
2240
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
2249
for (const auto scale : std::vector<float>{1.0, 2.0, 0.0625}) {
2250
for (const auto pool : std::vector<size_t>{1, 3, 7}) {
2251
LOG(INFO) << "MPSCNNRoIWarp Test 2";
2254
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
2255
t->Resize(1, 8, 40, 40);
2257
math::RandGaussian<float, CPUContext>(
2258
t->size(), 4, 2, t->mutable_data<float>(), &ctx);
2261
auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU);
2263
for (auto i = 0; i < t->dim32(0); ++i) {
2264
t->mutable_data<float>()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale;
2265
t->mutable_data<float>()[4 * i + 1] = (i % 5 + 1) * 1.0 / scale;
2266
t->mutable_data<float>()[4 * i + 2] = (i % 3 + 7) * 1.0 / scale;
2267
t->mutable_data<float>()[4 * i + 3] = (i % 4 + 7) * 1.0 / scale;
2273
auto& op = *(netdef.add_op());
2274
op.set_type("CopyToMPSCNN");
2275
op.add_input("X_cpu");
2276
op.add_output("X_mtl");
2280
auto& op = *(netdef.add_op());
2281
op.set_type("MPSCNNRoIWarp");
2282
op.add_input("X_mtl");
2285
auto& arg = *(op.add_arg());
2286
arg.set_name("sampling_ratio");
2290
auto& arg = *(op.add_arg());
2291
arg.set_name("pooled_h");
2295
auto& arg = *(op.add_arg());
2296
arg.set_name("pooled_w");
2300
auto& arg = *(op.add_arg());
2301
arg.set_name("spatial_scale");
2304
op.add_output("Y_mtl");
2308
auto& op = *(netdef.add_op());
2309
op.set_type("CopyFromMPSCNN");
2310
op.add_input("Y_mtl");
2311
op.add_output("Y_cpu");
2315
auto& op = *(netdef.add_op());
2316
op.set_type("RoIWarp");
2317
op.add_input("X_cpu");
2320
auto& arg = *(op.add_arg());
2321
arg.set_name("sampling_ratio");
2325
auto& arg = *(op.add_arg());
2326
arg.set_name("pooled_h");
2330
auto& arg = *(op.add_arg());
2331
arg.set_name("pooled_w");
2335
auto& arg = *(op.add_arg());
2336
arg.set_name("spatial_scale");
2339
op.add_output("Y_ref");
2342
ws.RunNetOnce(netdef);
2343
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2344
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2346
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2347
LOG(INFO) << t1.sizes();
2348
for (auto i = 0; i < t1.size(); ++i) {
2349
// FP16 <-> FP32 round trip, accumulation, etc.
2350
const float t1_i = t1.data<float>()[i];
2351
const float t2_i = t2.data<float>()[i];
2352
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
2359
for (const auto height_scale : std::vector<float>{1.0, 0.5, 1.7}) {
2360
for (const auto width_scale : std::vector<float>{1.0, 0.5, 2.3}) {
2361
for (const auto C : std::vector<float>{2, 7, 11}) {
2362
for (const auto N : std::vector<float>{1, 2}) {
2363
LOG(INFO) << "MPSCNNResizeNearestOp Test";
2366
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
2367
t->Resize(N, C, 37, 89);
2369
math::RandGaussian<float, CPUContext>(
2370
t->size(), 4, 2, t->mutable_data<float>(), &ctx);
2374
auto& op = *(netdef.add_op());
2375
op.set_type("CopyToMPSCNN");
2376
op.add_input("X_cpu");
2377
op.add_output("X_mtl");
2381
auto& op = *(netdef.add_op());
2382
op.set_type("MPSCNNResizeNearest");
2383
op.add_input("X_mtl");
2385
auto& arg = *(op.add_arg());
2386
arg.set_name("height_scale");
2387
arg.set_f(height_scale);
2390
auto& arg = *(op.add_arg());
2391
arg.set_name("width_scale");
2392
arg.set_f(width_scale);
2394
op.add_output("Y_mtl");
2398
auto& op = *(netdef.add_op());
2399
op.set_type("CopyFromMPSCNN");
2400
op.add_input("Y_mtl");
2401
op.add_output("Y_cpu");
2405
auto& op = *(netdef.add_op());
2406
op.set_type("ResizeNearest");
2407
op.add_input("X_cpu");
2409
auto& arg = *(op.add_arg());
2410
arg.set_name("height_scale");
2411
arg.set_f(height_scale);
2414
auto& arg = *(op.add_arg());
2415
arg.set_name("width_scale");
2416
arg.set_f(width_scale);
2418
op.add_output("Y_ref");
2421
ws.RunNetOnce(netdef);
2422
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2423
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2425
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2426
LOG(INFO) << t1.sizes();
2427
for (auto i = 0; i < t1.size(); ++i) {
2428
// FP16 <-> FP32 round trip, accumulation, etc.
2429
const float t1_i = t1.data<float>()[i];
2430
const float t2_i = t2.data<float>()[i];
2431
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
2440
LOG(INFO) << "MPSCNNGenerateProposals Test: \n";
2442
auto num_images = 1;
2443
auto A = 2; // # anchors
2444
auto H = 4; // height
2445
auto W = 5; // width
2446
vector<float> scores{
2447
5.44218998e-03, 1.19207997e-03, 1.12379994e-03, 1.17181998e-03,
2448
1.20544003e-03, 6.17993006e-04, 1.05261997e-05, 8.91025957e-06,
2449
9.29536981e-09, 6.09605013e-05, 4.72735002e-04, 1.13482002e-10,
2450
1.50015003e-05, 4.45032993e-06, 3.21612994e-08, 8.02662980e-04,
2451
1.40488002e-04, 3.12508007e-07, 3.02616991e-06, 1.97759000e-08,
2452
2.66913995e-02, 5.26766013e-03, 5.05053019e-03, 5.62100019e-03,
2453
5.37420018e-03, 5.26280981e-03, 2.48894998e-04, 1.06842002e-04,
2454
3.92931997e-06, 1.79388002e-03, 4.79440019e-03, 3.41609990e-07,
2455
5.20430971e-04, 3.34090000e-05, 2.19159006e-07, 2.28786003e-03,
2456
5.16703985e-05, 4.04523007e-06, 1.79227004e-06, 5.32449000e-08};
2458
-1.65040009e-02, -1.84051003e-02, -1.85930002e-02, -2.08263006e-02,
2459
-1.83814000e-02, -2.89172009e-02, -3.89706008e-02, -7.52277970e-02,
2460
-1.54091999e-01, -2.55433004e-02, -1.77490003e-02, -1.10340998e-01,
2461
-4.20190990e-02, -2.71421000e-02, 6.89801015e-03, 5.71171008e-02,
2462
-1.75665006e-01, 2.30021998e-02, 3.08554992e-02, -1.39333997e-02,
2463
3.40579003e-01, 3.91070992e-01, 3.91624004e-01, 3.92527014e-01,
2464
3.91445011e-01, 3.79328012e-01, 4.26631987e-01, 3.64892989e-01,
2465
2.76894987e-01, 5.13985991e-01, 3.79999995e-01, 1.80457994e-01,
2466
4.37402993e-01, 4.18545991e-01, 2.51549989e-01, 4.48318988e-01,
2467
1.68564007e-01, 4.65440989e-01, 4.21891987e-01, 4.45928007e-01,
2468
3.27155995e-03, 3.71480011e-03, 3.60032008e-03, 4.27092984e-03,
2469
3.74579988e-03, 5.95752988e-03, -3.14473989e-03, 3.52022005e-03,
2470
-1.88564006e-02, 1.65188999e-03, 1.73791999e-03, -3.56074013e-02,
2471
-1.66615995e-04, 3.14146001e-03, -1.11830998e-02, -5.35363983e-03,
2472
6.49790000e-03, -9.27671045e-03, -2.83346009e-02, -1.61233004e-02,
2473
-2.15505004e-01, -2.19910994e-01, -2.20872998e-01, -2.12831005e-01,
2474
-2.19145000e-01, -2.27687001e-01, -3.43973994e-01, -2.75869995e-01,
2475
-3.19516987e-01, -2.50418007e-01, -2.48537004e-01, -5.08224010e-01,
2476
-2.28724003e-01, -2.82402009e-01, -3.75815988e-01, -2.86352992e-01,
2477
-5.28333001e-02, -4.43836004e-01, -4.55134988e-01, -4.34897989e-01,
2478
-5.65053988e-03, -9.25739005e-04, -1.06790999e-03, -2.37016007e-03,
2479
-9.71166010e-04, -8.90910998e-03, -1.17592998e-02, -2.08992008e-02,
2480
-4.94231991e-02, 6.63906988e-03, 3.20469006e-03, -6.44695014e-02,
2481
-3.11607006e-03, 2.02738005e-03, 1.48096997e-02, 4.39785011e-02,
2482
-8.28424022e-02, 3.62076014e-02, 2.71668993e-02, 1.38250999e-02,
2483
6.76669031e-02, 1.03252999e-01, 1.03255004e-01, 9.89722982e-02,
2484
1.03646003e-01, 4.79663983e-02, 1.11014001e-01, 9.31736007e-02,
2485
1.15768999e-01, 1.04014002e-01, -8.90677981e-03, 1.13103002e-01,
2486
1.33085996e-01, 1.25405997e-01, 1.50051996e-01, -1.13038003e-01,
2487
7.01059997e-02, 1.79651007e-01, 1.41055003e-01, 1.62841007e-01,
2488
-1.00247003e-02, -8.17587040e-03, -8.32176022e-03, -8.90108012e-03,
2489
-8.13035015e-03, -1.77263003e-02, -3.69572006e-02, -3.51580009e-02,
2490
-5.92143014e-02, -1.80795006e-02, -5.46086021e-03, -4.10550982e-02,
2491
-1.83081999e-02, -2.15411000e-02, -1.17953997e-02, 3.33894007e-02,
2492
-5.29635996e-02, -6.97528012e-03, -3.15250992e-03, -3.27355005e-02,
2493
1.29676998e-01, 1.16080999e-01, 1.15947001e-01, 1.21797003e-01,
2494
1.16089001e-01, 1.44875005e-01, 1.15617000e-01, 1.31586999e-01,
2495
1.74735002e-02, 1.21973999e-01, 1.31596997e-01, 2.48907991e-02,
2496
6.18605018e-02, 1.12855002e-01, -6.99798986e-02, 9.58312973e-02,
2497
1.53593004e-01, -8.75087008e-02, -4.92327996e-02, -3.32239009e-02};
2498
vector<float> im_info{60, 80, 0.166667};
2499
vector<float> anchors{-38, -16, 53, 31, -120, -120, 135, 135};
2501
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
2502
t->Resize(num_images, A, H, W);
2503
for (auto i = 0; i < t->size(); ++i) {
2504
t->mutable_data<float>()[i] = scores[i];
2509
auto* t = BlobGetMutableTensor(ws.CreateBlob("bbox_delta_cpu"), CPU);
2510
t->Resize(num_images, 4 * A, H, W);
2511
for (auto i = 0; i < t->size(); ++i) {
2512
t->mutable_data<float>()[i] = bbx[i];
2517
auto* t = BlobGetMutableTensor(ws.CreateBlob("im_info"), CPU);
2518
t->Resize(num_images, 3);
2519
for (auto i = 0; i < t->size(); ++i) {
2520
t->mutable_data<float>()[i] = im_info[i];
2525
auto* t = BlobGetMutableTensor(ws.CreateBlob("anchors"), CPU);
2527
for (auto i = 0; i < t->size(); ++i) {
2528
t->mutable_data<float>()[i] = anchors[i];
2535
auto& op = *(netdef.add_op());
2536
op.set_type("MPSCNNGenerateProposalsCPP");
2537
op.add_input("X_cpu");
2538
op.add_input("bbox_delta_cpu");
2539
op.add_input("im_info");
2540
op.add_input("anchors");
2541
op.add_output("rois");
2542
op.add_output("rois_probs");
2546
auto& op = *(netdef.add_op());
2547
op.set_type("GenerateProposalsCPP");
2548
op.add_input("X_cpu");
2549
op.add_input("bbox_delta_cpu");
2550
op.add_input("im_info");
2551
op.add_input("anchors");
2552
op.add_output("rois_ref");
2553
op.add_output("rois_probs_ref");
2556
ws.RunNetOnce(netdef);
2557
const auto& t2 = ws.GetBlob("rois")->Get<TensorCPU>();
2558
const auto& t1 = ws.GetBlob("rois_ref")->Get<TensorCPU>();
2560
const auto& t4 = ws.GetBlob("rois_probs")->Get<TensorCPU>();
2561
const auto& t3 = ws.GetBlob("rois_probs_ref")->Get<TensorCPU>();
2563
LOG(INFO) << "t1: " << t1.size() << " t2: " << t2.size();
2565
const float HALF_MIN_VAL = 6.103515625e-05;
2566
for (auto i = 0; i < fmin(t1.size(), t2.size()); ++i) {
2567
// FP16 <-> FP32 round trip, accumulation, etc.
2568
const float t1_i = t1.data<float>()[i];
2569
const float t2_i = t2.data<float>()[i];
2570
const float t3_i = t3.data<float>()[i / 5];
2571
if (t3_i - HALF_MIN_VAL * 2 > 0) {
2572
LOG(INFO) << i << " " << t1_i << " " << t2_i << " " << t3_i;
2573
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
2577
for (auto i = 0; i < fmin(t3.size(), t4.size()); ++i) {
2578
// FP16 <-> FP32 round trip, accumulation, etc.
2579
const float t3_i = t3.data<float>()[i];
2580
const float t4_i = t4.data<float>()[i];
2581
LOG(INFO) << i << " " << t3_i;
2582
TORCH_CHECK_NEAR(t3_i, t4_i, 0.1);
2587
for (const auto& batchSize : std::vector<size_t>{1, 2}) {
2588
LOG(INFO) << "MPSCNNSoftmax Test";
2591
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
2592
// Only works for spatial dimension of (1, 1) - weird.
2593
t->Resize(batchSize, 12, 1, 1);
2595
math::RandGaussian<float, CPUContext>(
2596
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
2601
auto& op = *(netdef.add_op());
2602
op.set_type("CopyToMPSCNN");
2603
op.add_input("X_cpu");
2604
op.add_output("X_mtl");
2608
auto& op = *(netdef.add_op());
2609
op.set_type("MPSCNNSoftmax");
2610
op.add_input("X_mtl");
2611
op.add_output("Y_mtl");
2615
auto& op = *(netdef.add_op());
2616
op.set_type("CopyFromMPSCNN");
2617
op.add_input("Y_mtl");
2618
op.add_output("Y_cpu");
2622
auto& op = *(netdef.add_op());
2623
op.set_type("Softmax");
2624
op.add_input("X_cpu");
2625
op.add_output("Y_ref");
2628
ws.RunNetOnce(netdef);
2629
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2630
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2631
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2632
LOG(INFO) << t1.sizes();
2633
for (auto i = 0; i < t1.size(); ++i) {
2634
// FP16 <-> FP32 round trip, accumulation, etc.
2635
const float t1_i = t1.data<float>()[i];
2636
const float t2_i = t2.data<float>()[i];
2637
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
2643
for (const auto& inputChannels : std::vector<size_t>{3, 8}) {
2644
for (const auto& outputChannels : std::vector<size_t>{3, 8}) {
2645
for (const auto& batchSize : std::vector<size_t>{1, 2}) {
2646
for (const auto& stride_h : std::vector<int>{1, 2, 3}) {
2647
for (const auto& stride_w : std::vector<int>{1, 2, 3}) {
2648
for (const auto& kernel_h : std::vector<int>{3}) {
2649
for (const auto& kernel_w : std::vector<int>{3}) {
2650
for (const auto& pad_l : std::vector<int>{0, kernel_w / 2}) {
2651
for (const auto& pad_r :
2652
std::vector<int>{0, kernel_w / 2}) {
2653
for (const auto& pad_t :
2654
std::vector<int>{0, kernel_h / 2}) {
2655
for (const auto& pad_b :
2656
std::vector<int>{0, kernel_h / 2}) {
2657
for (const auto& adj : {0, 1, 2, 3}) {
2658
if (adj >= fmin(stride_h, stride_w)) {
2662
LOG(INFO) << "MPSConvTranspose Test";
2665
auto* t = BlobGetMutableTensor(
2666
ws.CreateBlob("X_cpu"), CPU);
2667
t->Resize(batchSize, inputChannels, 8, 12);
2669
math::RandGaussian<float, CPUContext>(
2673
t->mutable_data<float>(),
2679
BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
2686
math::RandGaussian<float, CPUContext>(
2690
t->mutable_data<float>(),
2696
BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
2697
t->Resize(outputChannels);
2699
math::RandGaussian<float, CPUContext>(
2703
t->mutable_data<float>(),
2709
auto& op = *(netdef.add_op());
2710
op.set_type("CopyToMPSCNN");
2711
op.add_input("X_cpu");
2712
op.add_output("X_mtl");
2716
auto& op = *(netdef.add_op());
2717
op.set_type("MPSCNNConvTranspose");
2718
op.add_input("X_mtl");
2721
#define ADD_ARGS(op) \
2723
add_arg_str(op, "order", "NCHW"); \
2726
std::vector<string>{"kernel_h", \
2735
std::vector<int>{kernel_h, \
2746
op.add_output("Y_mtl");
2750
auto& op = *(netdef.add_op());
2751
op.set_type("CopyFromMPSCNN");
2752
op.add_input("Y_mtl");
2753
op.add_output("Y_cpu");
2757
auto& op = *(netdef.add_op());
2758
op.set_type("ConvTranspose");
2759
op.add_input("X_cpu");
2763
op.add_output("Y_ref");
2767
ws.RunNetOnce(netdef);
2769
ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2771
ws.GetBlob("Y_ref")->Get<TensorCPU>();
2772
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2773
LOG(INFO) << t1.sizes();
2774
for (auto i = 0; i < t1.size(); ++i) {
2775
// FP16 <-> FP32 round trip, accumulation, etc.
2776
const float t1_i = t1.data<float>()[i];
2777
const float t2_i = t2.data<float>()[i];
2778
constexpr float tol = 2.0e-2;
2780
std::abs(t1_i - t2_i) <=
2781
(tol + tol * std::abs(t1_i)))
2782
<< t1_i << ", " << t2_i;
2799
for (const auto array : std::vector<bool>{true, false}) {
2800
for (auto numInputs = 2; numInputs <= 4; numInputs++) {
2801
for (const auto batchSize : std::vector<size_t>{1, 2}) {
2802
auto mtl = [&](size_t i) {
2803
return std::string("X_mtl_") + std::to_string(i);
2805
auto cpu = [&](size_t i) {
2806
return std::string("X_cpu_") + std::to_string(i);
2809
LOG(INFO) << "MPSCNNConcat Test" << array << ", " << numInputs << ", "
2812
for (auto i = 0; i < numInputs; ++i) {
2813
auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU);
2814
t->Resize(batchSize, array ? (i + 1) * 4 : 4, 10, 10);
2816
math::RandGaussian<float, CPUContext>(
2817
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
2822
auto& op = *(netdef.add_op());
2823
op.set_type("CopyToMPSCNN");
2824
for (auto i = 0; i < numInputs; ++i) {
2825
op.add_input(cpu(i));
2826
op.add_output(mtl(i));
2831
auto& op = *(netdef.add_op());
2832
op.set_type("MPSCNNConcat");
2833
for (auto i = 0; i < numInputs; ++i) {
2834
op.add_input(mtl(i));
2837
auto& arg = *(op.add_arg());
2838
arg.set_name("order");
2841
op.add_output("Y_mtl");
2842
op.add_output("Y_mtl_mask");
2846
auto& op = *(netdef.add_op());
2847
op.set_type("CopyFromMPSCNN");
2848
op.add_input("Y_mtl");
2849
op.add_output("Y_cpu");
2853
auto& op = *(netdef.add_op());
2854
op.set_type("Concat");
2855
for (auto i = 0; i < numInputs; ++i) {
2856
op.add_input(cpu(i));
2859
auto& arg = *(op.add_arg());
2860
arg.set_name("order");
2864
op.add_output("Y_ref");
2865
op.add_output("Y_ref_mask");
2868
ws.RunNetOnce(netdef);
2869
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2871
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2872
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2873
LOG(INFO) << t1.sizes();
2874
for (auto i = 0; i < t1.size(); ++i) {
2875
// FP16 <-> FP32 round trip, accumulation, etc.
2876
const float t1_i = t1.data<float>()[i];
2877
const float t2_i = t2.data<float>()[i];
2878
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
2886
for (const auto& batchSize : std::vector<size_t>{1, 2, 3, 4}) {
2887
for (const auto& inputChannels :
2888
std::vector<size_t>{1, 2, 3, 4, 16, 24, 32, 48, 96, 128, 256}) {
2889
for (const auto& groups : std::vector<int>{1, 4, 8, 16}) {
2890
if (inputChannels % groups != 0) {
2895
auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
2896
t->Resize(batchSize, inputChannels, 53, 47);
2898
math::RandGaussian<float, CPUContext>(
2899
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
2902
#define ADD_ARGS(op) \
2904
add_arg_str(op, "order", "NCHW"); \
2907
std::vector<string>{"kernel_w", "kernel_h", "group"}, \
2908
std::vector<int>{1, 1, groups}); \
2911
auto& op = *(netdef.add_op());
2912
op.set_type("CopyToMPSCNN");
2913
op.add_input("X_cpu");
2914
op.add_output("X_mtl");
2917
auto& op = *(netdef.add_op());
2918
op.set_type("MPSCNNChannelShuffle");
2919
op.add_input("X_mtl");
2921
op.add_output("Y_mtl");
2924
auto& op = *(netdef.add_op());
2925
op.set_type("CopyFromMPSCNN");
2926
op.add_input("Y_mtl");
2927
op.add_output("Y_cpu");
2930
auto& op = *(netdef.add_op());
2931
op.set_type("ChannelShuffle");
2932
op.add_input("X_cpu");
2934
op.add_output("Y_ref");
2937
ws.RunNetOnce(netdef);
2938
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
2939
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
2941
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
2942
for (auto i = 0; i < t1.size(); ++i) {
2943
// FP16 <-> FP32 round trip, accumulation, etc.
2944
const float t1_i = t1.data<float>()[i];
2945
const float t2_i = t2.data<float>()[i];
2946
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
2954
for (const auto channelCount : std::vector<size_t>{1, 2, 3, 4}) {
2955
for (auto numInputs = 2; numInputs <= 4; numInputs++) {
2956
for (const auto batchSize : std::vector<size_t>{1, 2}) {
2957
auto mtl = [&](size_t i) {
2958
return std::string("X_mtl_") + std::to_string(i);
2960
auto cpu = [&](size_t i) {
2961
return std::string("X_cpu_") + std::to_string(i);
2964
LOG(INFO) << "MPSCNNConcat(edge case) Test" << channelCount << ", "
2965
<< numInputs << ", " << batchSize;
2967
for (auto i = 0; i < numInputs; ++i) {
2968
auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU);
2969
t->Resize(batchSize, channelCount, 9, 17);
2971
math::RandGaussian<float, CPUContext>(
2972
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
2977
auto& op = *(netdef.add_op());
2978
op.set_type("CopyToMPSCNN");
2979
for (auto i = 0; i < numInputs; ++i) {
2980
op.add_input(cpu(i));
2981
op.add_output(mtl(i));
2986
auto& op = *(netdef.add_op());
2987
op.set_type("MPSCNNConcat");
2988
for (auto i = 0; i < numInputs; ++i) {
2989
op.add_input(mtl(i));
2992
auto& arg = *(op.add_arg());
2993
arg.set_name("order");
2996
op.add_output("Y_mtl");
2997
op.add_output("Y_mtl_mask");
3001
auto& op = *(netdef.add_op());
3002
op.set_type("CopyFromMPSCNN");
3003
op.add_input("Y_mtl");
3004
op.add_output("Y_cpu");
3008
auto& op = *(netdef.add_op());
3009
op.set_type("Concat");
3010
for (auto i = 0; i < numInputs; ++i) {
3011
op.add_input(cpu(i));
3014
auto& arg = *(op.add_arg());
3015
arg.set_name("order");
3019
op.add_output("Y_ref");
3020
op.add_output("Y_ref_mask");
3023
ws.RunNetOnce(netdef);
3024
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
3026
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
3027
CAFFE_ENFORCE_EQ(t1.sizes(), t2.sizes());
3028
LOG(INFO) << t1.sizes();
3029
for (auto i = 0; i < t1.size(); ++i) {
3030
// FP16 <-> FP32 round trip, accumulation, etc.
3031
const float t1_i = t1.data<float>()[i];
3032
const float t2_i = t2.data<float>()[i];
3033
TORCH_CHECK_NEAR(t1_i, t2_i, 0.1);
3041
LOG(INFO) << "MPSCNNReadCount Test";
3044
auto& op = *(netdef.add_op());
3045
op.add_input("X_cpu");
3046
op.add_output("X_mtl");
3050
auto& op = *(netdef.add_op());
3051
op.add_input("X_mtl");
3052
op.add_output("X_mtl");
3056
auto& op = *(netdef.add_op());
3057
op.add_input("X_mtl");
3062
auto& op = *(netdef.add_op());
3063
op.add_input("X_mtl");
3064
op.add_output("X_mtl");
3066
netdef = annotateDefWithReadCounts(netdef);
3067
auto rc = [&](size_t i) -> size_t {
3068
auto* arg = GetMutableArgument(
3069
"__mpscnn_read_count__", false, netdef.mutable_op(i));
3075
TORCH_CHECK_EQ(rc(0), 1);
3076
TORCH_CHECK_EQ(rc(1), 2);
3077
TORCH_CHECK_EQ(rc(2), 1);
3078
TORCH_CHECK_EQ(rc(3), 1);
3082
for (const auto& computeOp : std::vector<std::string>{"FC", "Conv"}) {
3083
LOG(INFO) << "MPSCNNRewriteForMetal Fusion/Copy Test";
3085
netdef.add_external_input("X");
3086
netdef.add_external_output("Y");
3087
// These two ops can be fused.
3089
auto& op = *(netdef.add_op());
3090
op.set_type(computeOp);
3097
auto& op = *(netdef.add_op());
3098
op.set_type("Relu");
3103
auto& op = *(netdef.add_op());
3104
op.set_type(computeOp);
3108
op.add_output("Y2");
3111
auto& op = *(netdef.add_op());
3112
op.set_type("Relu");
3116
netdef = rewriteForMetal(netdef);
3117
auto ty = [&](size_t i) { return netdef.op(i).type(); };
3118
auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
3119
auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
3120
TORCH_CHECK_EQ(netdef.op_size(), 4);
3121
TORCH_CHECK_EQ(ty(0), "CopyToMPSCNN");
3122
TORCH_CHECK_EQ(ty(1), std::string("MPSCNN") + computeOp + std::string("Relu"));
3123
TORCH_CHECK_EQ(ty(2), std::string("MPSCNN") + computeOp + std::string("Relu"));
3124
TORCH_CHECK_EQ(ty(3), "CopyFromMPSCNN");
3125
TORCH_CHECK_EQ(i0(0), "X");
3126
TORCH_CHECK_EQ(i0(1), o0(0));
3127
TORCH_CHECK_EQ(i0(2), "X2");
3128
TORCH_CHECK_EQ(o0(2), i0(3));
3129
TORCH_CHECK_EQ(o0(3), "Y");
3130
TORCH_CHECK_EQ(netdef.external_input(0), "X");
3131
TORCH_CHECK_EQ(netdef.external_output(0), "Y");
3136
LOG(INFO) << "MPSCNNRewriteForMetal Failure Test";
3138
netdef.add_external_input("X");
3139
netdef.add_external_output("Y");
3141
auto& op = *(netdef.add_op());
3142
op.set_type("Conv");
3146
op.add_output("Y1");
3149
auto& op = *(netdef.add_op());
3150
op.set_type("Conv");
3154
op.add_output("Y2");
3158
auto& op = *(netdef.add_op());
3159
op.set_type("Concat");
3165
netdef = rewriteForMetal(netdef);
3166
CHECK(false) << "Shouldn't reach here, due to multiple usages of X";
3167
} catch (const std::exception& e) {
3173
LOG(INFO) << "MPSCNNRewriteForMetal out-of-place Fusion Test";
3175
netdef.add_external_input("X");
3176
netdef.add_external_output("Z");
3178
auto& op = *(netdef.add_op());
3179
op.set_type("Conv");
3186
auto& op = *(netdef.add_op());
3187
op.set_type("Relu");
3192
auto& op = *(netdef.add_op());
3193
op.set_type("Relu");
3197
netdef = rewriteForMetal(netdef);
3198
TORCH_CHECK_EQ(netdef.op_size(), 4);
3199
auto ty = [&](size_t i) { return netdef.op(i).type(); };
3200
auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
3201
auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
3202
TORCH_CHECK_EQ(ty(0), "CopyToMPSCNN");
3203
TORCH_CHECK_EQ(ty(1), "MPSCNNConvRelu");
3204
TORCH_CHECK_EQ(ty(2), "MPSCNNRelu");
3205
TORCH_CHECK_EQ(ty(3), "CopyFromMPSCNN");
3206
TORCH_CHECK_EQ(i0(1), o0(0));
3207
TORCH_CHECK_EQ(o0(1), "Z");
3208
TORCH_CHECK_EQ(i0(2), "Z");
3209
TORCH_CHECK_EQ(o0(2), i0(3));
3213
LOG(INFO) << "MPSCNNRewriteForMetal out-of-place fusion failure test";
3215
netdef.add_external_input("X");
3216
netdef.add_external_output("Z");
3218
auto& op = *(netdef.add_op());
3219
op.set_type("Conv");
3226
auto& op = *(netdef.add_op());
3227
op.set_type("Relu");
3232
auto& op = *(netdef.add_op());
3233
op.set_type("Relu");
3237
netdef = rewriteForMetal(netdef);
3238
TORCH_CHECK_EQ(netdef.op_size(), 5);
3239
auto ty = [&](size_t i) { return netdef.op(i).type(); };
3240
auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
3241
auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
3242
TORCH_CHECK_EQ(ty(0), "CopyToMPSCNN");
3243
TORCH_CHECK_EQ(ty(1), "MPSCNNConv");
3244
TORCH_CHECK_EQ(ty(2), "MPSCNNRelu");
3245
TORCH_CHECK_EQ(ty(3), "MPSCNNRelu");
3246
TORCH_CHECK_EQ(ty(4), "CopyFromMPSCNN");
3247
TORCH_CHECK_EQ(i0(1), o0(0));
3248
TORCH_CHECK_EQ(o0(1), "Y");
3249
TORCH_CHECK_EQ(i0(2), o0(1));
3250
TORCH_CHECK_EQ(o0(2), "Z");
3251
TORCH_CHECK_EQ(i0(3), o0(1));
3252
TORCH_CHECK_EQ(o0(3), i0(4));
3256
LOG(INFO) << "MPSCNNRewriteForMetal PreProcess/Deprocess Test";
3259
auto& op = *(netdef.add_op());
3260
op.set_type("PackedInt8BGRANHWCToNCHWCStylizerPreprocess");
3265
auto& op = *(netdef.add_op());
3266
op.set_type("Relu");
3271
auto& op = *(netdef.add_op());
3272
op.set_type("BRGNCHWCToPackedInt8BGRAStylizerDeprocess");
3276
netdef = rewriteForMetal(netdef);
3277
auto ty = [&](size_t i) { return netdef.op(i).type(); };
3278
auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
3279
auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
3280
TORCH_CHECK_EQ(netdef.op_size(), 3);
3281
TORCH_CHECK_EQ(ty(0), "MPSCNNPackedInt8BGRANHWCToNCHWCStylizerPreprocess");
3282
TORCH_CHECK_EQ(ty(1), "MPSCNNRelu");
3283
TORCH_CHECK_EQ(ty(2), "MPSCNNBRGNCHWCToPackedInt8BGRAStylizerDeprocess");
3284
TORCH_CHECK_EQ(i0(0), "X");
3285
TORCH_CHECK_EQ(i0(1), o0(0));
3286
TORCH_CHECK_EQ(i0(2), o0(1));
3287
TORCH_CHECK_EQ(o0(2), "Z");
3289
LOG(INFO) << "All MPSCNN tests passed.";
3292
// Returns a copy of `def` containing only ops [0, idx], i.e. drops every op
// after position `idx`. Takes `def` by value so the caller's net is untouched.
// Examples: idx = 0 on a 10-op net removes 9 ops; idx = 0 on a 1-op net removes 0.
NetDef truncateAfter(NetDef def, size_t idx) {
  // Number of trailing ops to strip so that op `idx` becomes the last one.
  const auto toRemove = def.op_size() - idx - 1;
  for (auto i = 0; i < toRemove; ++i) {
    def.mutable_op()->RemoveLast();
  }
  TORCH_CHECK_EQ(def.op_size(), idx + 1);
  return def;
}
3303
// Appends a CopyFromMPSCNN op to `def` so that the net's final (Metal-side)
// output is copied back to a CPU tensor under its original blob name.
// The last op's output is renamed to the temporary "METAL_COPIER" blob, which
// the new copy op consumes; the copy op then emits the original name.
// Takes `def` by value and returns the modified copy.
NetDef addMPSCNNCopyFinalizer(NetDef def) {
  TORCH_CHECK_GE(def.op_size(), 1);
  const auto name = def.mutable_op(def.op_size() - 1)->output(0);
  def.mutable_op(def.op_size() - 1)->set_output(0, "METAL_COPIER");
  {
    auto& op = *(def.add_op());
    op.set_type("CopyFromMPSCNN");
    op.add_input("METAL_COPIER");
    op.add_output(name);
  }
  return def;
}
3316
void compareModels(const NetDef& initNet, NetDef predictNet) {
3317
auto* arg = predictNet.mutable_op(0)->mutable_arg(0);
3318
TORCH_CHECK_EQ(arg->name(), "noise_std");
3319
arg->set_f(0.000001);
3321
NetDef metalPredictNet;
3322
CAFFE_ENFORCE(tryConvertToMPSCNN(initNet, predictNet, &metalPredictNet));
3324
// TODO: consider last op as well.
3325
for (auto i = 0; i < predictNet.op_size(); ++i) {
3326
auto truncatedPredictNet = truncateAfter(predictNet, i);
3327
auto truncatedMetalPredictNet = truncateAfter(metalPredictNet, i);
3328
// For all but the last op, we need to add a copy op.
3329
if (i != predictNet.op_size() - 1) {
3330
truncatedMetalPredictNet =
3331
addMPSCNNCopyFinalizer(truncatedMetalPredictNet);
3334
dumpDef(truncatedPredictNet);
3335
dumpDef(truncatedMetalPredictNet);
3338
cws.RunNetOnce(initNet);
3340
auto* t = BlobGetMutableTensor(
3341
cws.CreateBlob(predictNet.external_input(0)), CPU);
3342
t->Resize(1, 224, 224, 4);
3343
for (auto i = 0; i < t->size(); ++i) {
3344
t->mutable_data<uint8_t>()[i] = i % 225;
3347
cws.RunNetOnce(truncatedPredictNet);
3350
mws.RunNetOnce(initNet);
3352
auto* t = BlobGetMutableTensor(
3353
mws.CreateBlob(predictNet.external_input(0)), CPU);
3354
t->Resize(1, 224, 224, 4);
3355
for (auto i = 0; i < t->size(); ++i) {
3356
t->mutable_data<uint8_t>()[i] = i % 225;
3359
mws.RunNetOnce(truncatedMetalPredictNet);
3362
truncatedPredictNet.op(truncatedPredictNet.op_size() - 1).output(0);
3364
LOG(INFO) << "Checking correspondence for name: " << name << ", idx: " << i;
3366
const auto& mt = mws.GetBlob(name)->Get<TensorCPU>();
3367
const auto& ct = cws.GetBlob(name)->Get<TensorCPU>();
3368
TORCH_CHECK_EQ(mt.sizes(), ct.sizes());
3369
for (auto j = 0; j < mt.size(); ++j) {
3370
if (mt.IsType<float>()) {
3372
LOG(INFO) << "i: " << i << ", j: " << j
3373
<< ", CPU: " << ct.data<float>()[j]
3374
<< ", MTL: " << mt.data<float>()[j];
3376
TORCH_CHECK_NEAR(mt.data<float>()[j], ct.data<float>()[j], 5);
3378
CHECK(mt.IsType<uint8_t>());
3380
LOG(INFO) << "i: " << i << ", j: " << j
3381
<< ", CPU: " << ct.data<uint8_t>()[j]
3382
<< ", MTL: " << mt.data<uint8_t>()[j];
3384
TORCH_CHECK_NEAR(mt.data<uint8_t>()[j], ct.data<uint8_t>()[j], 5);
3391
const NetDef& initNet,
3393
std::vector<int> inputDims) {
3394
NetDef metalPredictNet;
3395
NetDef predictNet = setSpecialArgs(net);
3396
CAFFE_ENFORCE(tryConvertToMPSCNNIntermediateCopies(
3397
initNet, predictNet, &metalPredictNet));
3398
dumpDef(predictNet);
3399
dumpDef(metalPredictNet);
3401
#define RUN_NET(ws, predictNet) \
3402
ws.RunNetOnce(initNet); \
3404
auto* t = BlobGetMutableTensor( \
3405
ws.CreateBlob(predictNet.external_input(0)), CPU); \
3406
t->Resize(inputDims); \
3408
math::RandGaussian<float, CPUContext>( \
3409
t->size(), 0, 1, t->mutable_data<float>(), &ctx); \
3411
ws.RunNetOnce(predictNet);
3417
RUN_NET(cws, predictNet);
3420
RUN_NET(mws, metalPredictNet);
3422
for (auto i = 0; i < predictNet.external_output_size(); i++) {
3423
auto blobName = predictNet.external_output(i);
3424
LOG(INFO) << "Checking output blob:" << blobName;
3425
const auto& mt = mws.GetBlob(blobName)->Get<Tensor>();
3426
const auto& ct = cws.GetBlob(blobName)->Get<Tensor>();
3427
if (mt.size() == 0 || ct.size() == 0) {
3428
LOG(INFO) << "One of the operator failed.";
3431
// TORCH_CHECK_EQ(mt.sizes(), ct.sizes());
3432
for (auto j = 0; j < fmin(mt.size(), ct.size()); ++j) {
3433
if (mt.IsType<float>()) {
3435
LOG(INFO) << "i: " << i << ", j: " << j
3436
<< ", CPU: " << ct.data<float>()[j]
3437
<< ", MTL: " << mt.data<float>()[j];
3439
// Disabling check for now because of precision issues
3440
// TORCH_CHECK_NEAR(mt.data<float>()[j], ct.data<float>()[j], 5);
3442
LOG(INFO) << "Type uint8_t";
3443
CHECK(mt.IsType<uint8_t>());
3445
LOG(INFO) << "i: " << i << ", j: " << j
3446
<< ", CPU: " << ct.data<uint8_t>()[j]
3447
<< ", MTL: " << mt.data<uint8_t>()[j];
3449
// Disabling check for now.
3450
// TORCH_CHECK_NEAR(mt.data<uint8_t>()[j], ct.data<uint8_t>()[j], 5);
3454
LOG(INFO) << "rewrite test passed.";