# google-research
# 523 lines · 20.5 KB
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""The runners."""
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20import os
21import time
22import numpy as np
23
24import tensorflow.compat.v1 as tf
25from capsule_em import model as f_model
26from capsule_em.mnist \
27import mnist_record
28from capsule_em.norb \
29import norb_record
30
31
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('num_prime_capsules', 32,
                            'Number of first layer capsules.')
tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate')
tf.app.flags.DEFINE_integer('routing_iteration', 3,
                            'Number of iterations for softmax routing')
tf.app.flags.DEFINE_float(
    'routing_rate', 1,
    'ratio for combining routing logits and routing feedback')
tf.app.flags.DEFINE_float('decay_rate', 0.96, 'ratio for learning rate decay')
tf.app.flags.DEFINE_integer('decay_steps', 20000,
                            'number of steps for learning rate decay')
tf.app.flags.DEFINE_bool('normalize_kernels', False,
                         'Normalize the capsule weight kernels')
tf.app.flags.DEFINE_integer('num_second_atoms', 16,
                            'number of capsule atoms for the second layer')
tf.app.flags.DEFINE_integer('num_primary_atoms', 16,
                            'number of capsule atoms for the first layer')
tf.app.flags.DEFINE_integer('num_start_conv', 32,
                            'number of channels for the start layer')
tf.app.flags.DEFINE_integer('kernel_size', 5,
                            'kernel size for the start layer.')
tf.app.flags.DEFINE_integer(
    'routing_iteration_prime', 1,
    'number of routing iterations for primary capsules.')
tf.app.flags.DEFINE_integer('max_steps', 2000000,
                            'Number of steps to run trainer.')
tf.app.flags.DEFINE_string('data_dir', '/datasets/mnist/',
                           'Directory for storing input data')
tf.app.flags.DEFINE_string('summary_dir',
                           '/tmp/tensorflow/mnist/logs/mnist_with_summaries',
                           'Summaries log directory')
tf.app.flags.DEFINE_bool('train', True, 'train or test.')
tf.app.flags.DEFINE_integer(
    'checkpoint_steps', 1500,
    'number of steps before saving a training checkpoint.')
tf.app.flags.DEFINE_bool('verbose_image', False, 'whether to show images.')
tf.app.flags.DEFINE_bool('multi', True,
                         'whether to use multiple digit dataset.')
tf.app.flags.DEFINE_bool('eval_once', False,
                         'whether to evaluate once on the ckpnt file.')
tf.app.flags.DEFINE_integer('eval_size', 24300,
                            'number of examples to evaluate.')
tf.app.flags.DEFINE_string(
    'ckpnt',
    '/tmp/tensorflow/mnist/logs/mnist_with_summaries/train/model.ckpnt',
    'The checkpoint to load and evaluate once.')
# Help text fixed: this flag controls checkpoint retention, not eval size.
tf.app.flags.DEFINE_integer('keep_ckpt', 5,
                            'number of training checkpoints to keep.')
tf.app.flags.DEFINE_bool(
    'clip_lr', False, 'whether to clip learning rate to not go below 1e-5.')
tf.app.flags.DEFINE_integer('stride_1', 2,
                            'stride for the first convolutional layer.')
tf.app.flags.DEFINE_integer('kernel_2', 9,
                            'kernel size for the second convolutional layer.')
tf.app.flags.DEFINE_integer('stride_2', 2,
                            'stride for the second convolutional layer.')
tf.app.flags.DEFINE_string('padding', 'VALID',
                           'the padding method for conv layers.')
tf.app.flags.DEFINE_integer('extra_caps', 2, 'number of extra conv capsules.')
tf.app.flags.DEFINE_string('caps_dims', '32,32',
                           'output dim for extra conv capsules.')
tf.app.flags.DEFINE_string('caps_strides', '2,1',
                           'stride for extra conv capsules.')
tf.app.flags.DEFINE_string('caps_kernels', '3,3',
                           'kernel size for extra conv capsules.')
tf.app.flags.DEFINE_integer('extra_conv', 0, 'number of extra conv layers.')

tf.app.flags.DEFINE_string('conv_dims', '', 'output dim for extra conv layers.')
tf.app.flags.DEFINE_string('conv_strides', '', 'stride for extra conv layers.')
tf.app.flags.DEFINE_string('conv_kernels', '',
                           'kernel size for extra conv layers.')
tf.app.flags.DEFINE_bool('leaky', False, 'Use leaky routing.')
tf.app.flags.DEFINE_bool('fast', False, 'Use the new faster implementation.')
tf.app.flags.DEFINE_bool('cpu_way', False,
                         'If set, use NHWC ordering instead of NCHW.')
tf.app.flags.DEFINE_bool('jit_scopes', False,
                         'Use xla jit_scopes to compile. Not supported.')
tf.app.flags.DEFINE_bool('staircase', False, 'Use staircase decay.')
tf.app.flags.DEFINE_integer('num_gpus', 1, 'number of gpus to train.')
tf.app.flags.DEFINE_bool('adam', True, 'Use Adam optimizer.')
tf.app.flags.DEFINE_bool('pooling', False, 'Pooling after convolution.')
tf.app.flags.DEFINE_bool('use_caps', True, 'Use capsule layers.')
tf.app.flags.DEFINE_integer(
    'extra_fc', 512, 'number of units in the extra fc layer in no caps mode.')
tf.app.flags.DEFINE_bool('dropout', False, 'Dropout before last layer.')
tf.app.flags.DEFINE_bool('tweak', False, 'During eval recons from tweaked rep.')
tf.app.flags.DEFINE_bool('softmax', False, 'softmax loss in no caps.')
tf.app.flags.DEFINE_bool('c_dropout', False, 'dropout after conv capsules.')
tf.app.flags.DEFINE_bool(
    'distort', True,
    'distort mnist images by cropping to 24 * 24 and rotating by 15 degrees.')
tf.app.flags.DEFINE_bool('restart', False, 'Clean train checkpoints.')
tf.app.flags.DEFINE_bool('use_em', True,
                         'If set use em capsules with em routing.')
tf.app.flags.DEFINE_float('final_beta', 0.01, 'Temperature at the sigmoid.')
tf.app.flags.DEFINE_bool('eval_ensemble', False, 'eval over aggregated logits.')
tf.app.flags.DEFINE_string('part1', 'ok', 'ok')
tf.app.flags.DEFINE_string('part2', 'ok', 'ok')
tf.app.flags.DEFINE_bool('reduce_mean', False,
                         'If set normalize mean of each image.')
tf.app.flags.DEFINE_float('loss_rate', 1.0,
                          'classification to regularization rate.')
tf.app.flags.DEFINE_integer('batch_size', 64, 'Batch size.')
# Help text fixed: this is the smallNORB image size, not the batch size.
tf.app.flags.DEFINE_integer('norb_pixel', 48,
                            'smallNORB input image size in pixels.')
tf.app.flags.DEFINE_bool('patching', True, 'If set use patching for eval.')

tf.app.flags.DEFINE_string('data_set', 'norb', 'the data set to use.')
tf.app.flags.DEFINE_string('cifar_data_dir', '/tmp/cifar10_data',
                           """Path to the CIFAR-10 data directory.""")
tf.app.flags.DEFINE_string('norb_data_dir', '/root/datasets/smallNORB/',
                           """Path to the norb data directory.""")
tf.app.flags.DEFINE_string('affnist_data_dir', '/tmp/affnist_data',
                           """Path to the affnist data directory.""")
146
# Number of output classes for each supported dataset name (matches the
# values accepted by FLAGS.data_set / get_features).
# NOTE(review): not referenced in this file's visible code — presumably read
# by capsule_em.model to size the final layer; verify against that module.
num_classes = {
    'mnist': 10,
    'cifar10': 10,
    'mnist_multi': 10,
    'svhn': 10,
    'affnist': 10,
    'expanded_mnist': 10,
    'norb': 5,
}
156
157
def get_features(train, total_batch):
  """Build the batched input pipeline, one feature dict per GPU tower.

  Args:
    train: Whether to read the training split (otherwise the test split).
    total_batch: Combined batch size, divided evenly across FLAGS.num_gpus.

  Returns:
    A list of FLAGS.num_gpus feature dictionaries as produced by the
    dataset readers (norb_record.inputs / mnist_record.inputs).
  """
  print(FLAGS.data_set)
  tower_batch = total_batch // max(1, FLAGS.num_gpus)
  split = 'train' if train else 'test'
  features = []
  for tower in range(FLAGS.num_gpus):
    # Input readers are always placed on the CPU.
    with tf.device('/cpu:0'), tf.name_scope('input_tower_%d' % tower):
      if FLAGS.data_set == 'norb':
        tower_input = norb_record.inputs(
            train_dir=FLAGS.norb_data_dir,
            batch_size=tower_batch,
            split=split,
            multi=FLAGS.multi,
            image_pixel=FLAGS.norb_pixel,
            distort=FLAGS.distort,
            patching=FLAGS.patching,
        )
      elif FLAGS.data_set == 'affnist':
        # affNIST is evaluation-only here: it always reads test.tfrecords.
        tower_input = mnist_record.inputs(
            train_dir=FLAGS.affnist_data_dir,
            batch_size=tower_batch,
            split=split,
            multi=FLAGS.multi,
            shift=0,
            height=40,
            train_file='test.tfrecords')
      elif FLAGS.data_set == 'expanded_mnist':
        tower_input = mnist_record.inputs(
            train_dir=FLAGS.data_dir,
            batch_size=tower_batch,
            split=split,
            multi=FLAGS.multi,
            height=40,
            train_file='train_6shifted_6padded_mnist.tfrecords',
            shift=6)
      else:
        # Plain MNIST: apply a 2-pixel shift only when training without the
        # crop/rotate distortion pipeline.
        tower_input = mnist_record.inputs(
            train_dir=FLAGS.data_dir,
            batch_size=tower_batch,
            split=split,
            multi=FLAGS.multi,
            shift=2 if (train and not FLAGS.distort) else 0,
            distort=FLAGS.distort)
      features.append(tower_input)
  print(features)
  return features
217
218
def run_training():
  """Build the multi-GPU model and run the training loop.

  Resumes from the newest checkpoint under FLAGS.summary_dir/train unless
  FLAGS.restart is set (in which case the train dir is wiped). Writes a
  summary every step and a checkpoint every FLAGS.checkpoint_steps steps.
  """
  with tf.Graph().as_default():
    # Input pipeline and replicated model.
    features = get_features(True, FLAGS.batch_size)
    model = f_model.multi_gpu_model
    print('so far so good!')
    result = model(features)

    # TODO(sasabour): merge jit scopes after jit scopes where enabled.
    merged = result['summary']
    train_step = result['train']

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(
        tf.group(tf.global_variables_initializer(),
                 tf.local_variables_initializer()))
    saver = tf.train.Saver(max_to_keep=FLAGS.keep_ckpt)

    train_dir = FLAGS.summary_dir + '/train'
    if tf.gfile.Exists(train_dir):
      ckpt = tf.train.get_checkpoint_state(train_dir + '/')
      print(ckpt)
      if (not FLAGS.restart) and ckpt and ckpt.model_checkpoint_path:
        print('hesllo')
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Checkpoint files are named model.ckpt-<step>; resume from there.
        prev_step = int(
            ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
      else:
        print('what??')
        tf.gfile.DeleteRecursively(train_dir)
        tf.gfile.MakeDirs(train_dir)
        prev_step = 0
    else:
      tf.gfile.MakeDirs(train_dir)
      prev_step = 0

    train_writer = tf.summary.FileWriter(train_dir, sess.graph)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    steps_done = 0
    try:
      for global_step in range(prev_step, FLAGS.max_steps):
        steps_done += 1
        summary, _ = sess.run([merged, train_step])
        train_writer.add_summary(summary, global_step)
        if (global_step + 1) % FLAGS.checkpoint_steps == 0:
          saver.save(
              sess,
              os.path.join(train_dir, 'model.ckpt'),
              global_step=global_step + 1)
    except tf.errors.OutOfRangeError:
      print('Done training for %d steps.' % steps_done)
    finally:
      # When done, ask the threads to stop.
      coord.request_stop()
      train_writer.close()
    # Wait for threads to finish.
    coord.join(threads)
    sess.close()
280
281
def run_eval():
  """Evaluate on test or validation.

  Polls FLAGS.summary_dir/train for new checkpoints, and for each unseen
  checkpoint runs a full pass over FLAGS.eval_size examples (batch size 5),
  writing aggregate correct/wrong counts as summaries. Gives up after
  roughly 360 minutes without a new checkpoint.
  """
  with tf.Graph().as_default():
    # Input images and labels.
    features = get_features(False, 5)
    model = f_model.multi_gpu_model
    result = model(features)
    merged = result['summary']
    correct_prediction_sum = result['correct']
    almost_correct_sum = result['almost']
    saver = tf.train.Saver()
    test_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/test')
    seen_step = -1
    # Give training a head start before the first poll.
    time.sleep(3 * 60)
    paused = 0  # minutes spent waiting since the last new checkpoint
    while paused < 360:
      ckpt = tf.train.get_checkpoint_state(FLAGS.summary_dir + '/train/')
      if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint; path is model.ckpt-<step>.
        global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
      else:
        time.sleep(2 * 60)
        paused += 2
        continue
      # Wait until a checkpoint newer than the last evaluated one appears.
      while seen_step == int(global_step):
        time.sleep(2 * 60)
        ckpt = tf.train.get_checkpoint_state(FLAGS.summary_dir + '/train/')
        global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        paused += 2
        if paused > 360:
          test_writer.close()
          return
      paused = 0

      seen_step = int(global_step)
      print(seen_step)
      sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
      saver.restore(sess, ckpt.model_checkpoint_path)
      coord = tf.train.Coordinator()
      threads = tf.train.start_queue_runners(sess=sess, coord=coord)
      try:
        total_tp = 0
        total_almost = 0
        # Batch size is 5, so eval_size // 5 steps covers eval_size examples.
        for i in range(FLAGS.eval_size // 5):
          summary_j, tp, almost = sess.run(
              [merged, correct_prediction_sum, almost_correct_sum])
          total_tp += tp
          total_almost += almost

        total_false = FLAGS.eval_size - total_tp
        total_almost_false = FLAGS.eval_size - total_almost
        # Append the aggregate counters to the last per-batch summary.
        summary_tp = tf.Summary.FromString(summary_j)
        summary_tp.value.add(tag='correct_prediction', simple_value=total_tp)
        summary_tp.value.add(tag='wrong_prediction', simple_value=total_false)
        summary_tp.value.add(
            tag='almost_wrong_prediction', simple_value=total_almost_false)
        # NOTE(review): global_step is still a string here; add_summary is
        # presumably tolerant of that — confirm against the FileWriter API.
        test_writer.add_summary(summary_tp, global_step)
        print('write done')
      except tf.errors.OutOfRangeError:
        print('Done eval for %d steps.' % i)
      finally:
        # When done, ask the threads to stop.
        coord.request_stop()
        # Wait for threads to finish.
        coord.join(threads)
      sess.close()
    test_writer.close()
349
350
def softmax(x):
  """Compute softmax values along the last axis of x.

  Numerically stable via max subtraction. For a 1-D input this matches the
  previous global softmax exactly; for 2-D (batch, classes) logits — the
  shape the ensemble-eval call site produces — each row is now normalized
  independently, as intended, instead of over the whole array.

  Args:
    x: Array-like of scores.

  Returns:
    np.ndarray of the same shape with last-axis entries summing to 1.
  """
  x = np.asarray(x)
  e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
  return e_x / e_x.sum(axis=-1, keepdims=True)
355
356
def eval_ensemble(ckpnts):
  """Evaluate on an ensemble of checkpoints.

  Reads FLAGS.eval_size test examples once (batches of 100), caches them in
  numpy arrays, then replays the same batches through the model restored
  from each checkpoint in `ckpnts`, summing the logits. The final prediction
  is the argmax of the summed logits; prints the total number of errors.

  Args:
    ckpnts: Iterable of checkpoint paths to restore and aggregate.
  """
  with tf.Graph().as_default():
    # Build one real input pipeline only to learn the image geometry and to
    # read the fixed evaluation batches below.
    first_features = get_features(False, 100)[0]
    h = first_features['height']
    d = first_features['depth']
    # The model itself is fed through placeholders so the identical batches
    # can be replayed for every checkpoint.
    features = {
        'images': tf.placeholder(tf.float32, shape=(100, d, h, h)),
        'labels': tf.placeholder(tf.float32, shape=(100, 10)),
        'recons_image': tf.placeholder(tf.float32, shape=(100, d, h, h)),
        'recons_label': tf.placeholder(tf.int32, shape=(100)),
        'height': first_features['height'],
        'depth': first_features['depth']
    }

    model = f_model.multi_gpu_model
    result = model([features])
    logits = result['logits']
    config = tf.ConfigProto(allow_soft_placement=True)
    # saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpnt))
    # Caches for the fixed evaluation set: [num_batches, 100, ...].
    batch_logits = np.zeros((FLAGS.eval_size // 100, 100, 10), dtype=np.float32)
    batch_recons_label = np.zeros((FLAGS.eval_size // 100, 100),
                                  dtype=np.float32)
    batch_labels = np.zeros((FLAGS.eval_size // 100, 100, 10), dtype=np.float32)
    batch_images = np.zeros((FLAGS.eval_size // 100, 100, d, h, h),
                            dtype=np.float32)
    batch_recons_image = np.zeros((FLAGS.eval_size // 100, 100, d, h, h),
                                  dtype=np.float32)
    saver = tf.train.Saver()
    sess = tf.Session(config=config)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
      # Pass 1: materialize the whole evaluation set into the numpy caches.
      for i in range(FLAGS.eval_size // 100):
        (batch_recons_label[i, Ellipsis], batch_labels[i, Ellipsis], batch_images[i, Ellipsis],
         batch_recons_image[i, Ellipsis]) = sess.run([
             first_features['recons_label'], first_features['labels'],
             first_features['images'], first_features['recons_image']
         ])
      # Pass 2: for each ensemble member, replay the cached batches and
      # accumulate raw logits per example.
      for ckpnt in ckpnts:
        saver.restore(sess, ckpnt)
        for i in range(FLAGS.eval_size // 100):
          logits_i = sess.run(
              logits,
              feed_dict={
                  features['recons_label']: batch_recons_label[i, Ellipsis],
                  features['labels']: batch_labels[i, Ellipsis],
                  features['images']: batch_images[i, Ellipsis],
                  features['recons_image']: batch_recons_image[i, Ellipsis]
              })
          # batch_logits[i, ...] += softmax(logits_i)
          batch_logits[i, Ellipsis] += logits_i
    except tf.errors.OutOfRangeError:
      print('Done eval for %d steps.' % i)
    finally:
      # When done, ask the threads to stop.
      coord.request_stop()
      # Wait for threads to finish.
      coord.join(threads)
    sess.close()
    # Ensemble prediction: argmax over classes of the summed logits.
    batch_pred = np.argmax(batch_logits, axis=2)
    total_wrong = np.sum(np.not_equal(batch_pred, batch_recons_label))
    print(total_wrong)
420
421
def eval_once(ckpnt):
  """Evaluate on one checkpoint once.

  Runs FLAGS.eval_size evaluation steps against the given checkpoint. With
  FLAGS.patching each step sees all crops of one example and the per-crop
  logits are averaged (after per-crop normalization) before the argmax.

  Args:
    ckpnt: Path of the checkpoint to restore.
  """
  # Build 14x14 binary 5x5-window masks over a 32x32 image (stride 2).
  # NOTE(review): `ptches` is not used below in this function — presumably
  # left over from a patching experiment; verify before removing.
  ptches = np.zeros((14, 14, 32, 32))
  for i in range(14):
    for j in range(14):
      ind_x = i * 2
      ind_y = j * 2
      for k in range(5):
        for h in range(5):
          ptches[i, j, ind_x + k, ind_y + h] = 1
  ptches = np.reshape(ptches, (14 * 14, 32, 32))

  with tf.Graph().as_default():
    features = get_features(False, 1)[0]
    if FLAGS.patching:
      # Swap in the cropped-copies tensors so one "batch" is all crops of
      # a single example.
      features['images'] = features['cc_images']
      features['recons_label'] = features['cc_recons_label']
      features['labels'] = features['cc_labels']
    model = f_model.multi_gpu_model
    result = model([features])
    # merged = result['summary']
    correct_prediction_sum = result['correct']
    # almost_correct_sum = result['almost']
    # mid_act = result['mid_act']
    logits = result['logits']

    saver = tf.train.Saver()
    test_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/test_once')
    config = tf.ConfigProto(allow_soft_placement=True)
    # Cap GPU memory so eval can share a device with training.
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    sess = tf.Session(config=config)
    # saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpnt))
    saver.restore(sess, ckpnt)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    i = 0
    try:
      total_tp = 0
      for i in range(FLAGS.eval_size):
        #, g_ac, ac
        lb, tp, lg = sess.run([
            features['recons_label'],
            correct_prediction_sum,
            logits,
        ])
        if FLAGS.patching:
          # Normalize each crop's logits to sum to 1, average over crops,
          # then predict; overrides the per-crop correctness count.
          batched_lg = np.sum(lg / np.sum(lg, axis=1, keepdims=True), axis=0)
          batch_pred = np.argmax(batched_lg)
          tp = np.equal(batch_pred, lb[0])

        total_tp += tp
      total_false = FLAGS.eval_size - total_tp
      print('false:{}, true:{}'.format(total_false, total_tp))
      # summary_tp = tf.Summary.FromString(summary_j)
      # summary_tp.value.add(tag='correct_prediction', simple_value=total_tp)
      # summary_tp.value.add(tag='wrong_prediction', simple_value=total_false)
      # summary_tp.value.add(
      #     tag='almost_wrong_prediction', simple_value=total_almost_false)
      # test_writer.add_summary(summary_tp, i + 1)
    except tf.errors.OutOfRangeError:
      print('Done eval for %d steps.' % i)
    finally:
      # When done, ask the threads to stop.
      coord.request_stop()
      # Wait for threads to finish.
      coord.join(threads)
    sess.close()
    test_writer.close()
490
491
def _reset_dir(path):
  """Delete `path` if it exists, then recreate it empty."""
  if tf.gfile.Exists(path):
    tf.gfile.DeleteRecursively(path)
  tf.gfile.MakeDirs(path)


def main(_):
  """Dispatch on flags: ensemble eval, one-shot eval, training, or eval loop.

  The delete-and-recreate summary-directory pattern was repeated three
  times; it is factored into _reset_dir with identical behavior.
  """
  if FLAGS.eval_ensemble:
    _reset_dir(FLAGS.summary_dir + '/test_ensemble')
    # Collect up to 11 ensemble member checkpoints named part1<i>part2.
    ensem = []
    for i in range(1, 12):
      f_name = '/tmp/cifar10/{}{}{}-600000'.format(FLAGS.part1, i, FLAGS.part2)
      if tf.train.checkpoint_exists(f_name):
        ensem += [f_name]

    print(len(ensem))
    eval_ensemble(ensem)
  elif FLAGS.eval_once:
    _reset_dir(FLAGS.summary_dir + '/test_once')
    eval_once(FLAGS.ckpnt)
  elif FLAGS.train:
    run_training()
  else:
    _reset_dir(FLAGS.summary_dir + '/test_once')
    _reset_dir(FLAGS.summary_dir + '/test')
    run_eval()


if __name__ == '__main__':
  tf.app.run()
524