# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""The main BERT model and related functions. Copied from bert."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import json
import math
import re
import numpy as np
import six
import tensorflow.compat.v1 as tf
from tf_slim.layers import layers

class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    """Constructs BertConfig.

    Args:
      vocab_size: Vocabulary size of `input_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer
        in the Transformer encoder.
      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
      hidden_act: The non-linear activation function (function or string) in
        the encoder and pooler.
      hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
      max_position_embeddings: The maximum sequence length that this model
        might ever be used with. Typically set this to something large just in
        case (e.g., 512 or 1024 or 2048).
      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: The stdev of the truncated_normal_initializer for
        initializing all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object, strict=False):
    """Constructs a `BertConfig` from a Python dictionary of parameters."""
    config = cls(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      if strict and key not in config.__dict__:
        raise ValueError("BertConfig has no field '{}'".format(key))
      config.__dict__[key] = value
    if strict and config.vocab_size is None:
      raise ValueError("BertConfig field 'vocab_size' is unset")
    return config

  @classmethod
  def from_json_file(cls, json_file, strict=False):
    """Constructs a `BertConfig` from a json file of parameters."""
    with tf.io.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text), strict=strict)

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

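# Illustrative usage sketch (not part of the original BERT code):
# round-tripping a config through a dict with strict field checking. The vocab
# size of 30522 and the other sizes are made-up example values.
def _example_bert_config_roundtrip():
  config = BertConfig(vocab_size=30522, hidden_size=256, num_hidden_layers=4)
  restored = BertConfig.from_dict(config.to_dict(), strict=True)
  assert restored.hidden_size == 256
  return restored.to_json_string()
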
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=8, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """
  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids.
        (self.word_embedding_output, self.embedding_table) = embedding_lookup(
            input_ids=input_ids,
            vocab_size=config.vocab_size,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.word_embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

      with tf.variable_scope("encoder"):
        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
        # mask of shape [batch_size, seq_length, seq_length] which is used
        # for the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            input_ids, input_mask)

        # Run the stacked transformer.
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)

      self.sequence_output = self.all_encoder_layers[-1]
      # The "pooler" converts the encoded sequence tensor of shape
      # [batch_size, seq_length, hidden_size] to a tensor of shape
      # [batch_size, hidden_size]. This is necessary for segment-level
      # (or segment-pair-level) classification tasks where we need a fixed
      # dimensional representation of the segment.
      with tf.variable_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained.
        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
        self.pooled_output = tf.layers.dense(
            first_token_tensor,
            config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer(config.initializer_range))

  def get_pooled_output(self):
    return self.pooled_output

  def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the final hidden layer of the transformer encoder.
    """
    return self.sequence_output

  def get_all_encoder_layers(self):
    return self.all_encoder_layers

  def get_word_embedding_output(self):
    """Get output of the word(piece) embedding lookup.

    This is BEFORE positional embeddings and token type embeddings have been
    added.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the output of the word(piece) embedding layer.
    """
    return self.word_embedding_output

  def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the output of the embedding layer, after summing the word
      embeddings with the positional embeddings and the token type embeddings,
      then performing layer normalization. This is the input to the
      transformer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    return self.embedding_table

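# Illustrative sketch (not part of the original BERT code): running a tiny
# BertModel in inference mode and inspecting the per-layer outputs exposed by
# get_all_encoder_layers(). All sizes here are small, made-up example values.
def _example_run_tiny_bert():
  with tf.Graph().as_default():
    input_ids = tf.constant([[31, 51, 99], [15, 5, 0]], dtype=tf.int32)
    input_mask = tf.constant([[1, 1, 1], [1, 1, 0]], dtype=tf.int32)
    config = BertConfig(
        vocab_size=128, hidden_size=32, num_hidden_layers=2,
        num_attention_heads=4, intermediate_size=64, max_position_embeddings=8)
    model = BertModel(
        config=config, is_training=False, input_ids=input_ids,
        input_mask=input_mask)
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      layer_outputs, pooled = sess.run(
          [model.get_all_encoder_layers(), model.get_pooled_output()])
    # Two encoder layers, each [batch_size=2, seq_length=3, hidden_size=32];
    # the pooled output is [2, 32].
    return len(layer_outputs), pooled.shape
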
def gelu(x):
  """Gaussian Error Linear Unit.

  This is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415

  Args:
    x: float Tensor to perform activation.

  Returns:
    `x` with the GELU activation applied.
  """
  cdf = 0.5 * (1.0 + tf.tanh(
      (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
  return x * cdf

def get_activation(activation_string):
  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.

  Args:
    activation_string: String name of the activation function.

  Returns:
    A Python function corresponding to the activation function. If
    `activation_string` is None, empty, or "linear", this will return None.
    If `activation_string` is not a string, it will return `activation_string`.

  Raises:
    ValueError: The `activation_string` does not correspond to a known
      activation.
  """

  # We assume that anything that's not a string is already an activation
  # function, so we just return it.
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:
    return None

  act = activation_string.lower()
  if act == "linear":
    return None
  elif act == "relu":
    return tf.nn.relu
  elif act == "gelu":
    return gelu
  elif act == "tanh":
    return tf.tanh
  else:
    raise ValueError("Unsupported activation: %s" % act)

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
  """Compute the union of the current variables and checkpoint variables."""
  assignment_map = {}
  initialized_variable_names = {}

  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  init_vars = tf.train.list_variables(init_checkpoint)

  assignment_map = collections.OrderedDict()
  for x in init_vars:
    (name, var) = (x[0], x[1])
    if name not in name_to_variable:
      continue
    assignment_map[name] = name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ":0"] = 1

  return (assignment_map, initialized_variable_names)

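# Illustrative sketch (not part of the original BERT code): wiring the
# assignment map into tf.train.init_from_checkpoint inside an existing graph.
# The default checkpoint path is a made-up placeholder.
def _example_init_from_checkpoint(init_checkpoint="/tmp/bert/bert_model.ckpt"):
  tvars = tf.trainable_variables()
  (assignment_map,
   initialized_variable_names) = get_assignment_map_from_checkpoint(
       tvars, init_checkpoint)
  tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
  return initialized_variable_names
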
def dropout(input_tensor, dropout_prob):
  """Perform dropout.

  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value (NOT of
      *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of `input_tensor` with dropout applied.
  """
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor

  output = tf.nn.dropout(input_tensor, rate=dropout_prob)
  return output


def layer_norm(input_tensor, name=None):
  """Run layer normalization on the last dimension of the tensor."""
  return layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)


def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Runs layer normalization followed by dropout."""
  output_tensor = layer_norm(input_tensor, name)
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor


def create_initializer(initializer_range=0.02):
  """Creates a `truncated_normal_initializer` with the given range."""
  return tf.truncated_normal_initializer(stddev=initializer_range)


def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up word embeddings for an id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.nn.embedding_lookup()`.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  if use_one_hot_embeddings:
    flat_input_ids = tf.reshape(input_ids, [-1])
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    output = tf.matmul(one_hot_input_ids, embedding_table)
  else:
    output = tf.nn.embedding_lookup(embedding_table, input_ids)

  input_shape = get_shape_list(input_ids)

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return (output, embedding_table)

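# Illustrative sketch (not part of the original BERT code): looking up word
# embeddings for a toy batch. Sizes are made-up example values.
def _example_embedding_lookup():
  with tf.Graph().as_default():
    input_ids = tf.constant([[1, 2, 3], [4, 5, 0]], dtype=tf.int32)
    with tf.variable_scope("example_embeddings"):
      output, table = embedding_lookup(
          input_ids=input_ids, vocab_size=16, embedding_size=8)
    # `output` is [batch_size=2, seq_length=3, embedding_size=8];
    # `table` is [vocab_size=16, embedding_size=8].
    return output, table
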
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output
      tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    # Create the variable outside the assertion to avoid TF2 compatibility
    # issues.
    full_position_embeddings = tf.get_variable(
        name=position_embedding_name,
        shape=[max_position_embeddings, width],
        initializer=create_initializer(initializer_range))

    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
      # perform a slice.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`),
      # so we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output

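# Illustrative sketch (not part of the original BERT code): adding token type
# and position embeddings to the output of embedding_lookup. Sizes are made-up
# example values; seq_length must not exceed max_position_embeddings.
def _example_embedding_postprocessor():
  with tf.Graph().as_default():
    word_embeddings = tf.ones([2, 4, 8])  # [batch, seq_length, width]
    token_type_ids = tf.constant([[0, 0, 1, 1], [0, 0, 0, 0]], dtype=tf.int32)
    with tf.variable_scope("example_postprocess"):
      output = embedding_postprocessor(
          input_tensor=word_embeddings,
          use_token_type=True,
          token_type_ids=token_type_ids,
          token_type_vocab_size=2,
          max_position_embeddings=16,
          dropout_prob=0.0)
    # `output` keeps the input shape [2, 4, 8].
    return output
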
def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create 3D attention mask from a 2D tensor mask.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank=2)
  to_seq_length = to_shape[1]

  to_mask = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume that `from_tensor` is a mask (although it could be). We
  # don't actually care if we attend *from* padding tokens (only *to* padding
  # tokens) so we create a tensor of all ones.
  #
  # `broadcast_ones` = [batch_size, from_seq_length, 1]
  broadcast_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Here we broadcast along two dimensions to create the mask.
  mask = broadcast_ones * to_mask

  return mask

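# Illustrative sketch (not part of the original BERT code): for a batch with
# one padded position, the resulting mask broadcasts the padding pattern of
# `to_mask` across every query position.
def _example_attention_mask():
  with tf.Graph().as_default():
    input_ids = tf.constant([[7, 8, 9], [7, 8, 0]], dtype=tf.int32)
    input_mask = tf.constant([[1, 1, 1], [1, 1, 0]], dtype=tf.int32)
    mask = create_attention_mask_from_input_mask(input_ids, input_mask)
    with tf.Session() as sess:
      mask_value = sess.run(mask)
    # mask_value[1] is [[1., 1., 0.], [1., 1., 0.], [1., 1., 0.]]: no query in
    # the second sequence may attend to its padded last position.
    return mask_value
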
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   size_per_head,
                   initializer,
                   activation,
                   name=None):
  """A dense layer with 3D kernel.

  Args:
    input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
    num_attention_heads: Number of attention heads.
    size_per_head: The size per attention head.
    initializer: Kernel initializer.
    activation: Activation function.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """

  last_dim = get_shape_list(input_tensor)[-1]

  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[last_dim, num_attention_heads * size_per_head],
        initializer=initializer)
    w = tf.reshape(w, [last_dim, num_attention_heads, size_per_head])
    b = tf.get_variable(
        name="bias",
        shape=[num_attention_heads * size_per_head],
        initializer=tf.zeros_initializer)
    b = tf.reshape(b, [num_attention_heads, size_per_head])
    ret = tf.einsum("abc,cde->abde", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    else:
      return ret


def dense_layer_3d_proj(input_tensor,
                        hidden_size,
                        num_attention_heads,
                        head_size,
                        initializer,
                        activation,
                        name=None):
  """A dense layer with 3D kernel for projection.

  Args:
    input_tensor: float Tensor of shape [batch, from_seq_length,
      num_attention_heads, size_per_head].
    hidden_size: The size of hidden layer.
    num_attention_heads: The size of output dimension.
    head_size: The size of head.
    initializer: Kernel initializer.
    activation: Activation function.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  head_size = hidden_size // num_attention_heads
  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[hidden_size, hidden_size],
        initializer=initializer)
    w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
    b = tf.get_variable(
        name="bias", shape=[hidden_size], initializer=tf.zeros_initializer)

    ret = tf.einsum("BFNH,NHD->BFD", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    else:
      return ret


def dense_layer_2d(input_tensor,
                   output_size,
                   initializer,
                   activation,
                   name=None):
  """A dense layer with 2D kernel.

  Args:
    input_tensor: Float tensor with rank 3.
    output_size: The size of output dimension.
    initializer: Kernel initializer.
    activation: Activation function.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  last_dim = get_shape_list(input_tensor)[-1]
  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel", shape=[last_dim, output_size], initializer=initializer)
    b = tf.get_variable(
        name="bias", shape=[output_size], initializer=tf.zeros_initializer)

    ret = tf.einsum("abc,cd->abd", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    else:
      return ret

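# Illustrative sketch (not part of the original BERT code): the shapes produced
# by the einsum-based dense layers above, using small made-up sizes.
def _example_dense_layers():
  with tf.Graph().as_default():
    x = tf.ones([2, 5, 16])  # [batch, seq_length, hidden_size]
    init = create_initializer(0.02)
    with tf.variable_scope("example_dense"):
      # Per-head projection: [2, 5, 16] -> [2, 5, num_heads=4, head_size=4].
      heads = dense_layer_3d(x, 4, 4, init, None, "qkv_like")
      # Projection back to the model width: [2, 5, 4, 4] -> [2, 5, 16].
      merged = dense_layer_3d_proj(heads, 16, 4, 4, init, None, "proj")
      # Ordinary position-wise dense layer: [2, 5, 16] -> [2, 5, 64].
      expanded = dense_layer_2d(merged, 64, init, gelu, "ffn")
    return heads.shape, merged.shape, expanded.shape
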
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  Is All You Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with tf.einsum as follows:
    Input_tensor: [BFD]
    Wq, Wk, Wv: [DNH]
    Q: [BFNH] = einsum('BFD,DNH->BFNH', Input_tensor, Wq)
    K: [BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wk)
    V: [BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wv)
    attention_scores: [BNFT] = einsum('BFNH,BTNH->BNFT', Q, K) / sqrt(H)
    attention_probs: [BNFT] = softmax(attention_scores)
    context_layer: [BFNH] = einsum('BNFT,BTNH->BFNH', attention_probs, V)
    Wout: [DNH]
    Output: [BFD] = einsum('BFNH,DNH->BFD', context_layer, Wout)

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions
      in the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    batch_size: (Optional) int. If the input is 2D, this might be the batch
      size of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
      size_per_head].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  # `query_layer` = [B, F, N, H]
  query_layer = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
                               create_initializer(initializer_range), query_act,
                               "query")

  # `key_layer` = [B, T, N, H]
  key_layer = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                             create_initializer(initializer_range), key_act,
                             "key")

  # `value_layer` = [B, T, N, H]
  value_layer = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                               create_initializer(initializer_range), value_act,
                               "value")

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  attention_scores = tf.einsum("BTNH,BFNH->BNFT", key_layer, query_layer)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_layer)

  return context_layer

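# Illustrative sketch (not part of the original BERT code): self-attention over
# a toy batch, with the 3D mask built by create_attention_mask_from_input_mask
# expanded internally to [B, 1, F, T]. All sizes are made-up example values.
def _example_attention_layer():
  with tf.Graph().as_default():
    layer_input = tf.ones([2, 5, 16])  # [batch, seq_length, width]
    input_mask = tf.constant([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]], dtype=tf.int32)
    attention_mask = create_attention_mask_from_input_mask(
        layer_input, input_mask)
    with tf.variable_scope("example_attention"):
      context = attention_layer(
          from_tensor=layer_input,
          to_tensor=layer_input,
          attention_mask=attention_mask,
          num_attention_heads=4,
          size_per_head=4)
    # `context` has shape [batch=2, from_seq_length=5, heads=4, size_per_head=4].
    return context
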
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  input_width = input_shape[2]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  prev_output = input_tensor
  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        with tf.variable_scope("self"):
          attention_output = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = dense_layer_3d_proj(
              attention_output, hidden_size,
              num_attention_heads, attention_head_size,
              create_initializer(initializer_range), None, "dense")
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = dense_layer_2d(
            attention_output, intermediate_size,
            create_initializer(initializer_range), intermediate_act_fn, "dense")

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = dense_layer_2d(intermediate_output, hidden_size,
                                      create_initializer(initializer_range),
                                      None, "dense")
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    return all_layer_outputs
  else:
    return all_layer_outputs[-1]

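# Illustrative sketch (not part of the original BERT code): stacking a tiny
# two-layer encoder directly, without the BertModel wrapper. Sizes are made-up
# example values.
def _example_transformer_model():
  with tf.Graph().as_default():
    embeddings = tf.ones([2, 5, 16])  # [batch, seq_length, hidden_size]
    with tf.variable_scope("example_encoder"):
      all_layers = transformer_model(
          input_tensor=embeddings,
          hidden_size=16,
          num_hidden_layers=2,
          num_attention_heads=4,
          intermediate_size=32,
          do_return_all_layers=True)
    # `all_layers` is a list of two tensors, each of shape [2, 5, 16].
    return all_layers
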
def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if name is None:
    # Tensor.name is not supported in Eager mode.
    if tf.executing_eagerly():
      name = "get_shape_list"
    else:
      name = tensor.name

  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape

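# Illustrative sketch (not part of the original BERT code): with a placeholder
# whose batch dimension is unknown, the static seq_length comes back as a
# Python int while the batch size comes back as a scalar Tensor.
def _example_get_shape_list():
  with tf.Graph().as_default():
    x = tf.placeholder(tf.int32, shape=[None, 128])
    batch_size, seq_length = get_shape_list(x, expected_rank=2)
    # `seq_length` is the Python int 128; `batch_size` is a tf.Tensor scalar
    # that must be evaluated inside a session to get its value.
    return batch_size, seq_length
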
def reshape_to_matrix(input_tensor):
  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
  ndims = input_tensor.shape.ndims
  if ndims < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
  if ndims == 2:
    return input_tensor

  width = input_tensor.shape[-1]
  output_tensor = tf.reshape(input_tensor, [-1, width])
  return output_tensor


def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
  if len(orig_shape_list) == 2:
    return output_tensor

  output_shape = get_shape_list(output_tensor)

  orig_dims = orig_shape_list[0:-1]
  width = output_shape[-1]

  return tf.reshape(output_tensor, orig_dims + [width])


def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  """
  if name is None:
    name = tensor.name

  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):
    expected_rank_dict[expected_rank] = True
  else:
    for x in expected_rank:
      expected_rank_dict[x] = True

  actual_rank = tensor.shape.ndims
  if actual_rank not in expected_rank_dict:
    scope_name = tf.get_variable_scope().name
    raise ValueError(
        "For the tensor `%s` in scope `%s`, the actual rank "
        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))