pytorch

benchmark_helper.cc
523 строки · 16.7 Кб
Перенос по словам
1
/**
2
 * Copyright (c) 2016-present, Facebook, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
#include <chrono>
18
#include <fstream>
19
#include <iostream>
20
#include <string>
21
#include <thread>
22
#ifdef _WIN32
23
#ifndef WIN32_LEAN_AND_MEAN
24
#define WIN32_LEAN_AND_MEAN
25
#endif
26
#include <windows.h>
27
#include <psapi.h>
28
#endif
29

30
#include <binaries/benchmark_helper.h>
31
#include "caffe2/core/blob_serialization.h"
32
#ifdef __CUDA_ARCH__
33
#include "caffe2/core/context_gpu.h"
34
#endif
35
#include "caffe2/core/init.h"
36
#include "caffe2/core/logging.h"
37
#include "caffe2/core/net.h"
38
#include "caffe2/core/operator.h"
39
#include "caffe2/core/tensor_int8.h"
40
#include "caffe2/utils/bench_utils.h"
41
#include "caffe2/utils/string_utils.h"
42
#include <observers/net_observer_reporter_print.h>
43
#include <observers/observer_config.h>
44
#include <observers/perf_observer.h>
45

46
#if defined(TARGET_OS_MAC) || \
47
defined(TARGET_OS_IPHONE) || \
48
defined(TARGET_IPHONE_SIMULATOR)
49
#include <malloc/malloc.h>
50
#else
51
#include <malloc.h>
52
#endif
53

54

55
void observerConfig() {
56
  caffe2::ClearGlobalNetObservers();
57
  caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) {
58
    return std::make_unique<caffe2::PerfNetObserver>(subject);
59
  });
60
  caffe2::ObserverConfig::setReporter(
61
      std::make_unique<caffe2::NetObserverReporterPrint>());
62
}
63

64
bool backendCudaSet(const string& backend) {
65
  bool run_on_gpu = false;
66
  if (backend == "cuda") {
67
#ifdef __CUDA_ARCH__
68
    if (caffe2::HasCudaGPU()) {
69
      run_on_gpu = true;
70
    } else {
71
      CAFFE_THROW("NO GPU support on this host machine");
72
    }
73
#else
74
    CAFFE_THROW("NO GPU support");
75
#endif
76
  }
77
  return run_on_gpu;
78
}
79

80
// Stamps the device type `run_dev` (converted with caffe2::TypeToProto) onto
// the device option of every operator in `net_def`.
void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) {
  const int op_count = net_def->op_size();
  for (int idx = 0; idx < op_count; ++idx) {
    auto* device_option = net_def->mutable_op(idx)->mutable_device_option();
    device_option->set_device_type(caffe2::TypeToProto(run_dev));
  }
}
86

87
// Sets the engine of every operator in `net_def` according to `backend`.
//
// "builtin" leaves the net untouched. The other recognised backends map to an
// engine string ("default" maps to the empty string). Any other backend fails
// the CAFFE_ENFORCE below.
void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) {
  if (backend == "builtin") {
    return;
  }

  // "NONE" is the sentinel for an unrecognised backend.
  string engine = "NONE";
  if (backend == "nnpack") {
    engine = "NNPACK";
  } else if (backend == "eigen") {
    engine = "EIGEN";
  } else if (backend == "mkl") {
    engine = "MKLDNN";
  } else if (backend == "cuda") {
    engine = "CUDA";
  } else if (backend == "dnnlowp") {
    engine = "DNNLOWP";
  } else if (backend == "dnnlowp_acc16") {
    engine = "DNNLOWP_ACC16";
  } else if (backend == "default") {
    engine = "";
  }
  CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");

  const int op_count = net_def->op_size();
  for (int idx = 0; idx < op_count; ++idx) {
    net_def->mutable_op(idx)->set_engine(engine);
  }
}
106

107
int loadInput(
108
    shared_ptr<caffe2::Workspace> workspace,
109
    const bool run_on_gpu,
110
    map<string, caffe2::TensorProtos>& tensor_protos_map,
111
    const string& input,
112
    const string& input_file,
113
    const string& input_dims,
114
    const string& input_type) {
115
  // How many input blobs are in the inputs
116
  int blob_num = 1;
117
  // Load input.
118
  if (input.size()) {
119
    vector<string> input_names = caffe2::split(',', input);
120
    if (input_file.size()) {
121
      vector<string> input_files = caffe2::split(',', input_file);
122
      CAFFE_ENFORCE_EQ(
123
          input_names.size(),
124
          input_files.size(),
125
          "Input name and file should have the same number.");
126
      for (int i = 0; i < input_names.size(); ++i) {
127
        caffe2::TensorProtos tensor_protos;
128
        CAFFE_ENFORCE(
129
            caffe2::ReadProtoFromFile(input_files[i], &tensor_protos));
130
        workspace->CreateBlob(input_names[i]);
131
        tensor_protos_map.insert(std::make_pair(input_names[i], tensor_protos));
132
      }
133
      // Check that all blobs have the same number of entries
134
      blob_num = tensor_protos_map[input_names[0]].protos_size();
135
      for (int i = 1; i < input_names.size(); ++i) {
136
        int bnum = tensor_protos_map[input_names[i]].protos_size();
137
        CAFFE_ENFORCE_EQ(
138
            blob_num,
139
            bnum,
140
            "Number of blobs are not the same for all inputs");
141
      }
142
    } else if (input_dims.size() || input_type.size()) {
143
      CAFFE_ENFORCE_GE(
144
          input_dims.size(),
145
          0,
146
          "Input dims must be specified when input tensors are used.");
147
      CAFFE_ENFORCE_GE(
148
          input_type.size(),
149
          0,
150
          "Input type must be specified when input tensors are used.");
151

152
      vector<string> input_dims_list = caffe2::split(';', input_dims);
153
      CAFFE_ENFORCE_EQ(
154
          input_names.size(),
155
          input_dims_list.size(),
156
          "Input name and dims should have the same number of items.");
157
      vector<string> input_type_list = caffe2::split(';', input_type);
158
      CAFFE_ENFORCE_EQ(
159
          input_names.size(),
160
          input_type_list.size(),
161
          "Input name and type should have the same number of items.");
162
      for (size_t i = 0; i < input_names.size(); ++i) {
163
        vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
164
        vector<int> input_dims;
165
        for (const string& s : input_dims_str) {
166
          input_dims.push_back(std::stoi(s));
167
        }
168
        caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
169
        if (blob == nullptr) {
170
          blob = workspace->CreateBlob(input_names[i]);
171
        }
172
        if (run_on_gpu) {
173
          LOG(INFO) << "Running on GPU.";
174
#ifdef __CUDA_ARCH__
175
          caffe2::TensorCUDA* tensor = blob->GetMutable<caffe2::TensorCUDA>();
176
          TORCH_CHECK_NOTNULL(tensor);
177
          tensor->Resize(input_dims);
178
          if (input_type_list[i] == "uint8_t") {
179
            tensor->mutable_data<uint8_t>();
180
          } else if (input_type_list[i] == "float") {
181
            tensor->mutable_data<float>();
182
          } else {
183
            CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
184
          }
185
#else
186
          CAFFE_THROW("Not support GPU on mobile.");
187
#endif
188
        } else {
189
          if (input_type_list[i] == "uint8_t") {
190
            caffe2::int8::Int8TensorCPU* tensor =
191
                blob->GetMutable<caffe2::int8::Int8TensorCPU>();
192
            TORCH_CHECK_NOTNULL(tensor);
193
            tensor->t.Resize(input_dims);
194
            tensor->t.mutable_data<uint8_t>();
195
          } else if (input_type_list[i] == "float") {
196
            caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
197
            TORCH_CHECK_NOTNULL(tensor);
198
            tensor->Resize(input_dims);
199
            tensor->mutable_data<float>();
200
          } else if (input_type_list[i] == "int") {
201
            caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
202
            TORCH_CHECK_NOTNULL(tensor);
203
            tensor->Resize(input_dims);
204
            tensor->mutable_data<int>();
205
          } else {
206
            CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
207
          }
208
        }
209
      }
210
    } else {
211
      CAFFE_THROW(
212
          "You requested input tensors, but neither input_file nor "
213
          "input_dims is set.");
214
    }
215
  }
216
  return blob_num;
217
}
218

219
void fillInputBlob(
220
    shared_ptr<caffe2::Workspace> workspace,
221
    map<string, caffe2::TensorProtos>& tensor_protos_map,
222
    int iteration) {
223
  if (tensor_protos_map.empty()) {
224
    return;
225
  }
226
  static caffe2::TensorDeserializer deserializer;
227
  for (auto& tensor_kv : tensor_protos_map) {
228
    caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first);
229
    if (blob == nullptr) {
230
      blob = workspace->CreateBlob(tensor_kv.first);
231
    }
232
    // todo: support gpu and make this function a template
233
    int protos_size = tensor_kv.second.protos_size();
234
    if (protos_size == 1 && iteration > 0) {
235
      // Do not override the input data if there is only one input data,
236
      // since it will clear all caches. Rely on wipe_cache to
237
      // clear caches
238
      continue;
239
    }
240
    caffe2::TensorProto* tensor_proto =
241
        tensor_kv.second.mutable_protos(iteration % protos_size);
242
    BlobSetTensor(blob, deserializer.Deserialize(*tensor_proto));
243
    // todo: for other types
244
  }
245
}
246

247
void runNetwork(
248
    shared_ptr<caffe2::Workspace> workspace,
249
    caffe2::NetBase* net,
250
    map<string, caffe2::TensorProtos>& tensor_protos_map,
251
    const bool wipe_cache,
252
    const bool run_individual,
253
    const bool run_on_gpu,
254
    const bool text_output,
255
    const int warmup,
256
    const int iter,
257
    const int num_blobs,
258
    const int sleep_before_run,
259
    const int sleep_between_iteration,
260
    const int sleep_between_net_and_operator,
261
    const std::string& output,
262
    const std::string& output_folder) {
263

264
  LOG(INFO) << "Starting benchmark.";
265
  caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup);
266
  LOG(INFO) << "Running warmup runs.";
267
  for (int i = 0; i < warmup; ++i) {
268
    fillInputBlob(workspace, tensor_protos_map, i);
269
    CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed.");
270
  }
271

272
  if (wipe_cache) {
273
    caffe2::wipe_cache();
274
  }
275
  if (sleep_before_run > 0) {
276
    std::this_thread::sleep_for(std::chrono::seconds(sleep_before_run));
277
  }
278
  LOG(INFO) << "Main runs.";
279
  CAFFE_ENFORCE(
280
      iter >= 0,
281
      "Number of main runs should be non negative, provided ",
282
      iter,
283
      ".");
284
  LOG(INFO) << "net runs.";
285
  long long duration_sum = 0;
286
  for (int i = 0; i < iter; ++i) {
287
    caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup);
288
    fillInputBlob(workspace, tensor_protos_map, i);
289
    if (wipe_cache) {
290
      caffe2::wipe_cache();
291
    }
292
    auto start = std::chrono::high_resolution_clock::now();
293
    CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed.");
294
    auto stop = std::chrono::high_resolution_clock::now();
295
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
296
    duration_sum += duration.count();
297
    // Write the output for the first num_blobs times
298
    writeOutput(
299
        workspace,
300
        run_on_gpu,
301
        output,
302
        output_folder,
303
        text_output,
304
        i,
305
        num_blobs);
306
    if (wipe_cache) {
307
      caffe2::wipe_cache();
308
    }
309
    if (sleep_between_iteration > 0) {
310
      std::this_thread::sleep_for(
311
          std::chrono::seconds(sleep_between_iteration));
312
    }
313
  }
314
  std::cout << "Average Duration: " << (duration_sum/iter) << " us" << std::endl;
315
  if (run_individual) {
316
    LOG(INFO) << "operator runs.";
317
    if (sleep_between_net_and_operator > 0) {
318
      std::this_thread::sleep_for(
319
          std::chrono::seconds(sleep_between_net_and_operator));
320
    }
321
    for (int i = 0; i < iter; ++i) {
322
      caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup);
323
      fillInputBlob(workspace, tensor_protos_map, i);
324
      CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed.");
325
      if (wipe_cache) {
326
        caffe2::wipe_cache();
327
      }
328
      if (sleep_between_iteration > 0) {
329
        std::this_thread::sleep_for(
330
            std::chrono::seconds(sleep_between_iteration));
331
      }
332
    }
333
  }
334
}
335

336
void writeOutput(
337
    shared_ptr<caffe2::Workspace> workspace,
338
    const bool run_on_gpu,
339
    const string& output,
340
    const string& output_folder,
341
    const bool text_output,
342
    const int index,
343
    const int num_blobs) {
344
  if (output.size() == 0) {
345
    return;
346
  }
347
  string output_prefix = output_folder.size() ? output_folder + "/" : "";
348
  vector<string> output_names = caffe2::split(',', output);
349
  if (output == "*") {
350
    output_names = workspace->Blobs();
351
  }
352
  for (const string& name : output_names) {
353
    CAFFE_ENFORCE(
354
        workspace->HasBlob(name),
355
        "You requested a non-existing blob: ",
356
        name);
357
    if (text_output) {
358
      if (run_on_gpu) {
359
#ifdef __CUDA_ARCH__
360
        writeTextOutput<caffe2::CUDAContext, caffe2::TensorCUDA>(
361
            workspace->GetBlob(name)->GetMutable<caffe2::TensorCUDA>(),
362
            output_prefix,
363
            name,
364
            index,
365
            num_blobs);
366
#else
367
        CAFFE_THROW("Not support GPU.");
368
#endif
369
      } else {
370
        writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
371
            BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU),
372
            output_prefix,
373
            name,
374
            index,
375
            num_blobs);
376
      }
377
    } else {
378
      // Do not support multiple entries per blob.
379
      CAFFE_ENFORCE(
380
          index == 0,
381
          "Binary file only support one output.");
382
      string serialized = SerializeBlob(*workspace->GetBlob(name), name);
383
      string output_filename = output_prefix + name;
384
      caffe2::WriteStringToFile(serialized, output_filename.c_str());
385
    }
386
  }
387
}
388

389
void logBenchmarkResult(
390
    const std::string& type,
391
    const std::string& metric,
392
    const std::string& unit,
393
    const int value) {
394
  LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{"
395
            << "\"type\": \"" << type << "\", "
396
            << "\"metric\": \"" << metric << "\", "
397
            << "\"unit\": \"" << unit << "\", "
398
            << "\"value\": " << c10::to_string(value) << "}\n";
399
}
400

401
// Returns a platform-specific memory usage figure for the current process,
// or 0 when `FLAGS_measure_memory` is false.
//
// Source of the figure per platform branch:
//  * Apple targets: `size_allocated` from malloc_zone_statistics().
//  * Windows: `PrivateUsage` from GetProcessMemoryInfo(); 0 if the call fails.
//  * otherwise: `uordblks` from mallinfo().
// NOTE(review): the value is presumably in bytes (the caller divides by 1024
// and reports "kB") — confirm per platform.
long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) {
  if (!FLAGS_measure_memory) {
    return 0;
  }
#if defined(TARGET_OS_IPHONE) || \
defined(TARGET_OS_MAC) || \
defined(TARGET_IPHONE_SIMULATOR)
  malloc_statistics_t stats = {0};
  malloc_zone_statistics(nullptr, &stats);
  return stats.size_allocated;
#elif defined(_WIN32)
  // Zero-initialise and check the result so a failed query never returns
  // whatever happened to be on the stack.
  PROCESS_MEMORY_COUNTERS_EX counters = {};
  if (!GetProcessMemoryInfo(
          GetCurrentProcess(),
          reinterpret_cast<PROCESS_MEMORY_COUNTERS*>(&counters),
          sizeof(counters))) {
    return 0;
  }
  return counters.PrivateUsage;
#else
  struct mallinfo info = mallinfo();
  return info.uordblks;
#endif
}
422

423
int benchmark(
424
    int argc,
425
    char* argv[],
426
    const string& FLAGS_backend,
427
    const string& FLAGS_init_net,
428
    const string& FLAGS_input,
429
    const string& FLAGS_input_dims,
430
    const string& FLAGS_input_file,
431
    const string& FLAGS_input_type,
432
    int FLAGS_iter,
433
    bool FLAGS_measure_memory,
434
    const string& FLAGS_net,
435
    const string& FLAGS_output,
436
    const string& FLAGS_output_folder,
437
    bool FLAGS_run_individual,
438
    int FLAGS_sleep_before_run,
439
    int FLAGS_sleep_between_iteration,
440
    int FLAGS_sleep_between_net_and_operator,
441
    bool FLAGS_text_output,
442
    int FLAGS_warmup,
443
    bool FLAGS_wipe_cache) {
444
  // Check arguments to be correct
445
  {
446
    // Need to check whether file exists, as the file reader does not assert if
447
    // file does not exist
448
    std::ifstream net_file(FLAGS_net);
449
    CAFFE_ENFORCE(net_file.good());
450
    net_file.close();
451

452
    std::ifstream init_net_file(FLAGS_init_net);
453
    CAFFE_ENFORCE(init_net_file.good());
454
    init_net_file.close();
455

456
    if (FLAGS_input_file.size() > 0) {
457
      vector<string> input_files = caffe2::split(',', FLAGS_input_file);
458
      for (auto input_file : input_files) {
459
        std::ifstream ifile(input_file);
460
        CAFFE_ENFORCE(ifile.good());
461
        ifile.close();
462
      }
463
    }
464
  }
465

466
  observerConfig();
467
  caffe2::ShowLogInfoToStderr();
468

469
  auto workspace = std::make_shared<caffe2::Workspace>(new caffe2::Workspace());
470
  bool run_on_gpu = backendCudaSet(FLAGS_backend);
471
  // Run initialization network, measure resources used.
472
  long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
473
  caffe2::NetDef init_net_def;
474
  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def));
475
  setOperatorEngine(&init_net_def, FLAGS_backend);
476
  CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));
477
  init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem;
478

479
  map<string, caffe2::TensorProtos> tensor_protos_map;
480
  int num_blobs = loadInput(
481
      workspace,
482
      run_on_gpu,
483
      tensor_protos_map,
484
      FLAGS_input,
485
      FLAGS_input_file,
486
      FLAGS_input_dims,
487
      FLAGS_input_type);
488

489
  // Run main network.
490
  long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
491
  caffe2::NetDef net_def;
492
  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
493
  setOperatorEngine(&net_def, FLAGS_backend);
494
  if (!net_def.has_name()) {
495
    net_def.set_name("benchmark");
496
  }
497
  caffe2::NetBase* net = workspace->CreateNet(net_def);
498
  TORCH_CHECK_NOTNULL(net);
499
  runNetwork(
500
      workspace,
501
      net,
502
      tensor_protos_map,
503
      FLAGS_wipe_cache,
504
      FLAGS_run_individual,
505
      run_on_gpu,
506
      FLAGS_text_output,
507
      FLAGS_warmup,
508
      FLAGS_iter,
509
      num_blobs,
510
      FLAGS_sleep_before_run,
511
      FLAGS_sleep_between_iteration,
512
      FLAGS_sleep_between_net_and_operator,
513
      FLAGS_output,
514
      FLAGS_output_folder);
515
  predict_vmem = getVirtualMemoryIfOptionEnabled(
516
      FLAGS_measure_memory) - predict_vmem;
517
  if (FLAGS_measure_memory) {
518
    logBenchmarkResult(
519
        "NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024);
520
  }
521

522
  return 0;
523
}
524
pytorch

Использование cookies