# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Defines standard networks layers that train using variational dropout."""
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21import tensorflow.compat.v1 as tf
22
23from state_of_sparsity.layers.utils import layer_utils
24from state_of_sparsity.layers.variational_dropout import common
25
26
def _verify_variational_params(variational_params):
  """Verifies the format of the input `variational_params`.

  Checks that the input is a 2-tuple of tensors of equal shape.

  Args:
    variational_params: The parameters to check.

  Raises:
    RuntimeError: If the input is not a 2-tuple of tensors with equal shape.

  Returns:
    The input `variational_params`.
  """
  if len(variational_params) != 2:
    raise RuntimeError("Incorrect number of variational parameters.")
  if variational_params[0].shape != variational_params[1].shape:
    raise RuntimeError("Variational parameters must be the same shape.")
  return variational_params


def matmul_train(
    x,
    variational_params,
    transpose_a=False,
    transpose_b=False,
    clip_alpha=None,
    eps=common.EPSILON):
  R"""Training computation for a variational matmul.

  In variational dropout we train a Bayesian neural network where we assume a
  fully-factorized Gaussian posterior and log uniform prior over the weights.

  During training, we need to sample weights from this distribution. Rather
  than sample weights for each sample in the input batch, we can calculate the
  parameters of the distribution over the pre-activations analytically (this
  step is called the local reparameterization trick). This function calculates
  the mean and standard deviation of the distribution over the pre-activations,
  and then draws a single sample for each element in the input batch and passes
  them as output.

  Args:
    x: 2D Tensor representing the input batch.
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    transpose_a: If True, `x` is transposed before multiplication.
    transpose_b: If True, `theta` is transposed before multiplication.
    clip_alpha: Int or None. If integer, we clip the log \alpha values to
      [-clip_alpha, clip_alpha]. If None, don't clip the values.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    Output Tensor of the matmul operation.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """
  # We expect a 2D input tensor, as is standard in fully-connected layers
  x.get_shape().assert_has_rank(2)

  theta, log_sigma2 = _verify_variational_params(
      variational_params)

  if clip_alpha is not None:
    # Compute the log_alphas and then compute the
    # log_sigma2 again so that we can clip on the
    # log alpha magnitudes
    log_alpha = common.compute_log_alpha(log_sigma2, theta, eps, clip_alpha)
    log_sigma2 = common.compute_log_sigma2(log_alpha, theta, eps)

  # Compute the mean and standard deviation of the distributions over the
  # activations
  mu_activation = tf.matmul(
      x,
      theta,
      transpose_a=transpose_a,
      transpose_b=transpose_b)
  std_activation = tf.sqrt(tf.matmul(
      tf.square(x),
      tf.exp(log_sigma2),
      transpose_a=transpose_a,
      transpose_b=transpose_b) + eps)

  output_shape = tf.shape(std_activation)
  return mu_activation + std_activation * tf.random_normal(output_shape)
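
# Example usage (illustrative sketch, not part of the original API; the
# variable names, shapes, and initializer value are assumptions). The
# variational parameters are ordinary trainable variables holding \theta and
# log \sigma^2, packed into a 2-tuple:
#
#   x = tf.random_normal([32, 784])
#   theta = tf.get_variable("theta", [784, 256])
#   log_sigma2 = tf.get_variable(
#       "log_sigma2", [784, 256],
#       initializer=tf.constant_initializer(-10.0))
#   activations = matmul_train(x, (theta, log_sigma2), clip_alpha=8.0)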


def matmul_eval(
    x,
    variational_params,
    transpose_a=False,
    transpose_b=False,
    threshold=3.0,
    eps=common.EPSILON):
  R"""Evaluation computation for a variational matmul.

  In variational dropout we train a Bayesian neural network where we assume a
  fully-factorized Gaussian posterior and log uniform prior over the weights.

  The parameters of the posterior are learned during training, and at eval
  time we use the learned mean as the weight values.

  This method also supports the pruning of weights based on their log \alpha
  values. All weights with log \alpha >= `threshold` are set to zero.

  Args:
    x: 2D Tensor representing the input batch.
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    transpose_a: If True, `x` is transposed before multiplication.
    transpose_b: If True, `theta` is transposed before multiplication.
    threshold: Weights with a log \alpha_{ij} value greater than this will be
      set to zero.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    Output Tensor of the variational matmul operation.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """
  # We expect a 2D input tensor, as is standard in fully-connected layers
  x.get_shape().assert_has_rank(2)

  theta, log_sigma2 = _verify_variational_params(
      variational_params)

  # Compute the weight mask by thresholding on
  # the log-space alpha values
  log_alpha = common.compute_log_alpha(log_sigma2, theta, eps, value_limit=None)
  weight_mask = tf.cast(tf.less(log_alpha, threshold), tf.float32)

  return tf.matmul(
      x,
      theta * weight_mask,
      transpose_a=transpose_a,
      transpose_b=transpose_b)
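
# Illustrative sketch (an assumption about usage, not part of the original
# API): the fraction of weights that eval-time thresholding prunes can be
# measured directly from the log \alpha values, using the same threshold that
# `matmul_eval` applies:
#
#   log_alpha = common.compute_log_alpha(
#       log_sigma2, theta, common.EPSILON, value_limit=None)
#   sparsity = tf.reduce_mean(
#       tf.cast(tf.greater_equal(log_alpha, 3.0), tf.float32))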


def broadcast_matmul_train(
    x,
    variational_params,
    clip_alpha=None,
    eps=common.EPSILON):
  R"""Training computation for VD matrix multiplication with N input matrices.

  Multiplies a 3D tensor `x` with a set of 2D parameters. Each 2D matrix
  `x[i, :, :]` in the input tensor is multiplied independently with the
  parameters, resulting in a 3D output tensor with shape
  `x.shape[:2] + [variational_params[0].shape[1]]`.

  Args:
    x: 3D Tensor representing the input batch.
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    clip_alpha: Int or None. If integer, we clip the log \alpha values to
      [-clip_alpha, clip_alpha]. If None, don't clip the values.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    Output Tensor of the batched matmul operation.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """
  theta, log_sigma2 = _verify_variational_params(
      variational_params)
  theta.get_shape().assert_has_rank(2)
  log_sigma2.get_shape().assert_has_rank(2)

  # The input data must be rank 2 or greater
  assert x.get_shape().ndims >= 2
  input_rank = x.get_shape().ndims

  if clip_alpha is not None:
    # Compute the log_alphas and then compute the
    # log_sigma2 again so that we can clip on the
    # log alpha magnitudes
    log_alpha = common.compute_log_alpha(log_sigma2, theta, eps, clip_alpha)
    log_sigma2 = common.compute_log_sigma2(log_alpha, theta, eps)

  # Compute the mean and standard deviation of the distributions over the
  # activations
  mu_activation = tf.tensordot(x, theta, [[input_rank-1], [0]])

  var_activation = tf.tensordot(
      tf.square(x),
      tf.exp(log_sigma2),
      [[input_rank-1], [0]])
  std_activation = tf.sqrt(var_activation + eps)

  # Restore the static shape of the output, which has the rank of the input
  input_shape = x.get_shape().as_list()
  weight_shape = theta.get_shape().as_list()
  output_shape = input_shape[:-1] + [weight_shape[1]]
  mu_activation.set_shape(output_shape)
  std_activation.set_shape(output_shape)

  # NOTE: We sample noise for each weight in theta, which will be shared by
  # each matrix product that was done. This is equivalent to sampling the same
  # set of weights for all matrix products done by this op in an iteration.
  # The element-wise multiply below broadcasts.
  num_pad_dims = len(output_shape) - 2
  padding = [tf.constant(1, dtype=tf.int32) for _ in range(num_pad_dims)]

  # NOTE: On GPU, the first dim may not be defined w/ the Transformer. Create
  # a tf.Tensor from the list shape and TF should match the first dim
  # appropriately
  batch_size = tf.shape(x)[0]
  data_dim = tf.shape(theta)[-1]
  noise_shape = tf.stack([batch_size] + padding + [data_dim], axis=0)

  output = mu_activation + std_activation * tf.random_normal(noise_shape)
  return output
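
# Example usage (illustrative sketch; the shapes below are assumptions): with a
# Transformer-style [batch, sequence, features] input, the same 2D variational
# parameters are applied independently at every position:
#
#   x = tf.random_normal([8, 128, 512])
#   theta = tf.get_variable("theta", [512, 512])
#   log_sigma2 = tf.get_variable(
#       "log_sigma2", [512, 512],
#       initializer=tf.constant_initializer(-10.0))
#   y = broadcast_matmul_train(x, (theta, log_sigma2))  # shape [8, 128, 512]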


def broadcast_matmul_eval(
    x,
    variational_params,
    threshold=3.0,
    eps=common.EPSILON):
  R"""Evaluation computation for VD matrix multiplication with N input matrices.

  Multiplies a 3D tensor `x` with a set of 2D parameters. Each 2D matrix
  `x[i, :, :]` in the input tensor is multiplied independently with the
  parameters, resulting in a 3D output tensor with shape
  `x.shape[:2] + [variational_params[0].shape[1]]`.

  Args:
    x: 3D Tensor representing the input batch.
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    threshold: Weights with a log \alpha_{ij} value greater than this will be
      set to zero.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    Output Tensor of the batched matmul operation.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """
  theta, log_sigma2 = _verify_variational_params(
      variational_params)
  theta.get_shape().assert_has_rank(2)
  log_sigma2.get_shape().assert_has_rank(2)

  # The input data must be rank 2 or greater
  assert x.get_shape().ndims >= 2
  input_rank = x.get_shape().ndims

  # Compute the weight mask by thresholding on the log-space alpha values
  log_alpha = common.compute_log_alpha(log_sigma2, theta, eps, value_limit=None)
  weight_mask = tf.cast(tf.less(log_alpha, threshold), tf.float32)

  output = tf.tensordot(x, theta * weight_mask, [[input_rank-1], [0]])

  # Restore the static shape of the output, which has the rank of the input
  input_shape = x.get_shape().as_list()
  weight_shape = theta.get_shape().as_list()
  output_shape = input_shape[:-1] + [weight_shape[1]]
  output.set_shape(output_shape)
  return output


def conv2d_train(x,
                 variational_params,
                 strides,
                 padding,
                 data_format="NHWC",
                 clip_alpha=None,
                 eps=common.EPSILON):
  R"""Training computation for a variational conv2d.

  In variational dropout we train a Bayesian neural network where we assume a
  fully-factorized Gaussian posterior and log uniform prior over the weights.

  During training, we need to sample weights from this distribution. Rather
  than sample weights for each sample in the input batch, we can calculate the
  parameters of the distribution over the pre-activations analytically (this
  step is called the local reparameterization trick). This function calculates
  the mean and standard deviation of the distribution over the pre-activations,
  and then draws a single sample for each element in the input batch and passes
  them as output.

  Args:
    x: NHWC tf.Tensor representing the input batch of features.
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    strides: The stride of the sliding window for each dimension of `x`.
      Identical to the standard strides argument for tf.nn.conv2d.
    padding: String. One of "SAME", or "VALID". Identical to the standard
      padding argument for tf.nn.conv2d.
    data_format: 'NHWC' or 'NCHW' ordering of 4-D input Tensor.
    clip_alpha: Int or None. If integer, we clip the log \alpha values to
      [-clip_alpha, clip_alpha]. If None, don't clip the values.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    Output Tensor of the conv2d operation.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """
  theta, log_sigma2 = _verify_variational_params(variational_params)

  if clip_alpha:
    # Compute the log_alphas and then compute the
    # log_sigma2 again so that we can clip on the
    # log alpha magnitudes
    log_alpha = common.compute_log_alpha(log_sigma2, theta, eps, clip_alpha)
    log_sigma2 = common.compute_log_sigma2(log_alpha, theta, eps)

  # Compute the mean and standard deviation of the distribution over the
  # convolution outputs
  mu_activation = tf.nn.conv2d(
      x, theta, strides, padding, data_format=data_format)
  std_activation = tf.sqrt(
      tf.nn.conv2d(
          tf.square(x),
          tf.exp(log_sigma2),
          strides,
          padding,
          data_format=data_format) + eps)

  output_shape = tf.shape(std_activation)
  return mu_activation + std_activation * tf.random_normal(output_shape)
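
# Example usage (illustrative sketch; the filter shape, strides, and variable
# names are assumptions): the variational parameters share the
# [height, width, in_channels, out_channels] shape of an ordinary conv2d
# filter:
#
#   images = tf.random_normal([16, 32, 32, 3])
#   theta = tf.get_variable("conv_theta", [3, 3, 3, 64])
#   log_sigma2 = tf.get_variable(
#       "conv_log_sigma2", [3, 3, 3, 64],
#       initializer=tf.constant_initializer(-10.0))
#   features = conv2d_train(
#       images, (theta, log_sigma2), strides=[1, 1, 1, 1], padding="SAME")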


def conv2d_eval(x,
                variational_params,
                strides,
                padding,
                data_format="NHWC",
                threshold=3.0,
                eps=common.EPSILON):
  R"""Evaluation computation for a variational conv2d.

  In variational dropout we train a Bayesian neural network where we assume a
  fully-factorized Gaussian posterior and log uniform prior over the weights.

  The parameters of the posterior are learned during training, and at eval
  time we use the learned mean as the weight values.

  This method also supports the pruning of weights based on their log \alpha
  values. All weights with log \alpha >= `threshold` are set to zero.

  Args:
    x: Tensor representing the input batch.
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    strides: The stride of the sliding window for each dimension of `x`.
      Identical to the standard strides argument for tf.nn.conv2d.
    padding: String. One of "SAME", or "VALID". Identical to the standard
      padding argument for tf.nn.conv2d.
    data_format: 'NHWC' or 'NCHW' ordering of 4-D input Tensor.
    threshold: Weights with a log \alpha_{ij} value greater than this will
      be set to zero.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    Output Tensor of the conv2d operation.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """
  theta, log_sigma2 = _verify_variational_params(
      variational_params)

  # Compute the weight mask by thresholding on
  # the log-space alpha values
  log_alpha = common.compute_log_alpha(log_sigma2, theta, eps, value_limit=None)
  weight_mask = tf.cast(tf.less(log_alpha, threshold), tf.float32)

  return tf.nn.conv2d(
      x, theta * weight_mask, strides, padding, data_format=data_format)


# NOTE: This implementation of variational dropout on an embedding samples
# new noise for each embedding vector at all timesteps in the batch
# and across sequences in the batch. An alternative implementation would
# be to sample a noise vector for each token in the vocabulary, so that
# all instances of an embedding vector for a given token would be the
# same within a batch. Another alternative implementation would be to
# sample a noise vector for each token in the vocabulary for each element
# in the batch so that, within a sequence, all instances of an embedding
# vector for a given token would be the same, but across different elements
# in the batch they could be different.
#
# The first alternative implementation would add another embedding lookup
# to the implementation. We'd generate a noise tensor with shape
# [vocab_size, embedding_size], and for each token id in the batch we'd
# do an embedding lookup to get the appropriate noise vector. We'd then
# do two more embedding lookups, one to get the mean vector and one to
# get the log variance vector for the token. These 3 tensors with shape
# [batch_size, seq_length, embedding_size] would then be combined the
# same way they are in this implementation.
#
# The second alternative implementation may not be practical, because we
# would have to sample `vocab_size * embedding_size * batch_size` random
# values per iteration. We'd also have unique noise embeddings for each
# element in the batch, meaning we'd have to do `batch_size` + 2 embedding
# lookups.
#
# This implementation is the most efficient in terms of embedding lookups
# and noise sampling.
def embedding_lookup_train(
    variational_params,
    ids,
    name=None,
    clip_alpha=None,
    eps=common.EPSILON):
  R"""Embedding trained with variational dropout.

  In a standard embedding lookup, `ids` are looked-up in a list of embedding
  tensors. In an embedding trained with variational dropout, we lookup the
  parameters of the fully-factorized Gaussian posterior over the embedding
  tensor for each index in `ids` and draw a sample from this distribution
  that is returned.

  The `ids` argument is analogous to the one in the standard
  tf.nn.embedding_lookup.

  Args:
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    ids: A Tensor with type int32 or int64 containing the ids to be looked up
      in params.
    name: String. Name of the operator.
    clip_alpha: Int or None. If integer, we clip the log \alpha values
      to [-clip_alpha, clip_alpha]. If None, don't clip the values.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    The output Tensor result of the embedding lookup.

  Raises:
    RuntimeError: If the input variational_params is not a 2-tuple of Tensors
      that have the same shape.
  """
  theta, log_sigma2 = _verify_variational_params(
      variational_params)

  # Before we do anything, lookup the mean and log variances of the embedding
  # vectors we are going to output and do all our operations in this lower
  # dimensional space
  embedding_theta = layer_utils.gather(theta, ids)
  embedding_log_sigma2 = layer_utils.gather(log_sigma2, ids)

  if clip_alpha:
    # Compute the log_alphas and then compute the
    # log_sigma2 again so that we can clip on the
    # log alpha magnitudes
    embedding_log_alpha = common.compute_log_alpha(
        embedding_log_sigma2, embedding_theta, eps, clip_alpha)
    embedding_log_sigma2 = common.compute_log_sigma2(
        embedding_log_alpha, embedding_theta, eps)

  # Calculate the standard deviation from the log variance
  embedding_std = tf.sqrt(tf.exp(embedding_log_sigma2) + eps)

  # Output samples from the distribution over the embedding vectors
  output_shape = tf.shape(embedding_std)
  embedding = embedding_theta + embedding_std * tf.random_normal(output_shape)
  return tf.identity(embedding, name=name)
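
# Example usage (illustrative sketch; the vocabulary size, embedding width, and
# variable names are assumptions): the variational parameters have the same
# [vocab_size, embedding_size] shape as a standard embedding table:
#
#   ids = tf.constant([[3, 7, 7, 1]], dtype=tf.int32)
#   theta = tf.get_variable("embedding_theta", [10000, 128])
#   log_sigma2 = tf.get_variable(
#       "embedding_log_sigma2", [10000, 128],
#       initializer=tf.constant_initializer(-10.0))
#   embeddings = embedding_lookup_train((theta, log_sigma2), ids)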


def embedding_lookup_eval(
    variational_params,
    ids,
    name=None,
    threshold=3.0,
    eps=common.EPSILON):
  R"""Evaluation mode embedding trained with variational dropout.

  In a standard embedding lookup, `ids` are looked-up in a list of embedding
  tensors. In an embedding trained with variational dropout, we lookup the
  parameters of the fully-factorized Gaussian posterior over the embedding
  tensor for each index in `ids` and draw a sample from this distribution
  that is returned. At evaluation time, we use the mean of the posterior
  over each embedding tensor instead of sampling.

  The `ids` argument is analogous to the one in the standard
  tf.nn.embedding_lookup.

  Args:
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    ids: A Tensor with type int32 or int64 containing the ids to be looked up
      in params.
    name: String. Name of the operator.
    threshold: Weights with a log \alpha_{ij} value greater than this will be
      set to zero.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    The output Tensor result of the embedding lookup.

  Raises:
    RuntimeError: If the input variational_params is not a 2-tuple of Tensors
      that have the same shape.
  """
  theta, log_sigma2 = _verify_variational_params(
      variational_params)

  # Rather than mask the whole embedding every iteration, we can do a second
  # embedding lookup on the log \sigma^2 values, compute the log \alpha values
  # for each output embedding vector, and then mask the much lower dimensional
  # output embedding vectors
  embedding_theta = layer_utils.gather(theta, ids)
  embedding_log_sigma2 = layer_utils.gather(log_sigma2, ids)

  # Compute the weight mask by thresholding on the log-space alpha values
  embedding_log_alpha = common.compute_log_alpha(
      embedding_log_sigma2, embedding_theta, eps, value_limit=None)
  embedding_mask = tf.cast(tf.less(embedding_log_alpha, threshold), tf.float32)

  # Return the masked embedding vectors
  return tf.identity(embedding_theta * embedding_mask, name=name)


def negative_dkl(variational_params=None,
                 clip_alpha=None,
                 eps=common.EPSILON,
                 log_alpha=None):
  R"""Compute the negative kl-divergence loss term.

  Computes the negative kl-divergence between the log-uniform prior over the
  weights and the variational posterior over the weights for each element
  in the set of variational parameters. Each contribution is summed, and the
  negated sum is returned as a scalar Tensor suitable for use as a loss term.

  The true kl-divergence is intractable, so we compute the tight approximation
  from https://arxiv.org/abs/1701.05369.

  Args:
    variational_params: 2-tuple of Tensors, where the first tensor is the
      \theta values and the second contains the log of the \sigma^2 values.
    clip_alpha: Int or None. If integer, we clip the log \alpha values to
      [-clip_alpha, clip_alpha]. If None, don't clip the values.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.
    log_alpha: float32 tensor of log alpha values. If provided, these are used
      directly instead of being computed from `variational_params`.

  Returns:
    Output scalar Tensor containing the negated sum of all negative
    kl-divergence contributions for each element in the input
    variational_params.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """

  if variational_params is not None:
    theta, log_sigma2 = _verify_variational_params(variational_params)

  if log_alpha is None:
    log_alpha = common.compute_log_alpha(log_sigma2, theta, eps, clip_alpha)

  # Constant values for approximating the kl divergence
  k1, k2, k3 = 0.63576, 1.8732, 1.48695
  c = -k1

  # Compute each term of the KL and combine
  term_1 = k1 * tf.nn.sigmoid(k2 + k3*log_alpha)
  term_2 = -0.5 * tf.log1p(tf.exp(tf.negative(log_alpha)))
  eltwise_dkl = term_1 + term_2 + c
  return -tf.reduce_sum(eltwise_dkl)
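
# Example of adding the KL term to a training objective (illustrative sketch;
# `task_loss`, `num_train_examples`, and the 1/N scaling convention are
# assumptions made by the surrounding training code, not prescribed here):
#
#   kl_loss = negative_dkl((theta, log_sigma2)) / float(num_train_examples)
#   total_loss = task_loss + kl_loss
#   train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss)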
593