# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""The main BERT model and related functions."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import json
import math
import re
import six
import tensorflow.compat.v1 as tf
from tensorflow.contrib import layers as contrib_layers


class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    """Constructs BertConfig.

    Args:
      vocab_size: Vocabulary size of `input_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer
        in the Transformer encoder.
      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
      hidden_act: The non-linear activation function (function or string) in
        the encoder and pooler.
      hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
      max_position_embeddings: The maximum sequence length that this model
        might ever be used with. Typically set this to something large just
        in case (e.g., 512 or 1024 or 2048).
      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: The stdev of the truncated_normal_initializer for
        initializing all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    """Constructs a `BertConfig` from a Python dictionary of parameters."""
    config = BertConfig(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a `BertConfig` from a json file of parameters."""
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

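# A minimal usage sketch for `BertConfig` (illustrative comment only, not part
# of the original API surface); the hyperparameter values below are arbitrary
# examples:
#
#   config = BertConfig(vocab_size=32000, hidden_size=256,
#                       num_hidden_layers=4, num_attention_heads=4,
#                       intermediate_size=1024)
#   json_str = config.to_json_string()
#   restored = BertConfig.from_dict(json.loads(json_str))
#   assert restored.hidden_size == 256
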
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
      num_hidden_layers=8, num_attention_heads=8, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
      input_ids=input_ids, input_mask=input_mask,
      token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=True,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size,
        seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings. On the
        TPU, it is much faster if this is True; on the CPU or GPU, it is
        faster if this is False.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids.
        (self.embedding_output, self.embedding_table) = embedding_lookup(
            input_ids=input_ids,
            vocab_size=config.vocab_size,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

      with tf.variable_scope("encoder"):
        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
        # mask of shape [batch_size, seq_length, seq_length] which is used
        # for the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            input_ids, input_mask)

        # Run the stacked transformer.
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)

      self.sequence_output = self.all_encoder_layers[-1]
      # The "pooler" converts the encoded sequence tensor of shape
      # [batch_size, seq_length, hidden_size] to a tensor of shape
      # [batch_size, hidden_size]. This is necessary for segment-level
      # (or segment-pair-level) classification tasks where we need a fixed
      # dimensional representation of the segment.
      with tf.variable_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained.
        first_token_tensor = tf.squeeze(
            self.sequence_output[:, 0:1, :], axis=1)
        self.pooled_output = tf.layers.dense(
            first_token_tensor,
            config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer(config.initializer_range))

  def get_pooled_output(self):
    return self.pooled_output

  def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size]
      corresponding to the final hidden layer of the transformer encoder.
    """
    return self.sequence_output

  def get_all_encoder_layers(self):
    return self.all_encoder_layers

  def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size]
      corresponding to the output of the embedding layer, after summing the
      word embeddings with the positional embeddings and the token type
      embeddings, then performing layer normalization. This is the input to
      the transformer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    return self.embedding_table


def gelu(input_tensor):
  """Gaussian Error Linear Unit.

  This is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415

  Args:
    input_tensor: float Tensor to perform activation.

  Returns:
    `input_tensor` with the GELU activation applied.
  """
  cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
  return input_tensor * cdf


def get_activation(activation_string):
  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.

  Args:
    activation_string: String name of the activation function.

  Returns:
    A Python function corresponding to the activation function. If
    `activation_string` is None, empty, or "linear", this will return None.
    If `activation_string` is not a string, it will return
    `activation_string`.

  Raises:
    ValueError: The `activation_string` does not correspond to a known
      activation.
  """

  # We assume that anything that's not a string is already an activation
  # function, so we just return it.
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:
    return None

  act = activation_string.lower()
  if act == "linear":
    return None
  elif act == "relu":
    return tf.nn.relu
  elif act == "gelu":
    return gelu
  elif act == "tanh":
    return tf.tanh
  else:
    raise ValueError("Unsupported activation: %s" % act)
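
# Illustrative behaviour sketch for `get_activation` (comment only):
#
#   get_activation("gelu")      # -> the `gelu` function defined above
#   get_activation("linear")    # -> None (no non-linearity)
#   get_activation(None)        # -> None
#   get_activation(tf.nn.relu)  # -> tf.nn.relu, passed through unchanged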


def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
  """Compute the union of the current variables and checkpoint variables."""
  assignment_map = {}
  initialized_variable_names = {}

  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  init_vars = tf.train.list_variables(init_checkpoint)

  assignment_map = collections.OrderedDict()
  for x in init_vars:
    (name, var) = (x[0], x[1])
    if name not in name_to_variable:
      continue
    assignment_map[name] = name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ":0"] = 1

  return (assignment_map, initialized_variable_names)


def dropout(input_tensor, dropout_prob):
  """Perform dropout.

  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value (NOT
      of *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of `input_tensor` with dropout applied.
  """
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor

  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return output


def layer_norm(input_tensor, name=None):
  """Run layer normalization on the last dimension of the tensor."""
  return contrib_layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1,
      scope=name)


def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Runs layer normalization followed by dropout."""
  output_tensor = layer_norm(input_tensor, name)
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor


def create_initializer(initializer_range=0.02):
  """Creates a `truncated_normal_initializer` with the given range."""
  return tf.truncated_normal_initializer(stddev=initializer_range)


def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up word embeddings for an id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
      for TPUs.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  if use_one_hot_embeddings:
    flat_input_ids = tf.reshape(input_ids, [-1])
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    output = tf.matmul(one_hot_input_ids, embedding_table)
  else:
    output = tf.nn.embedding_lookup(embedding_table, input_ids)

  input_shape = get_shape_list(input_ids)

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return (output, embedding_table)
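
# Shape sketch for `embedding_lookup` (illustrative comment; the example
# values are assumptions, not part of the original file):
#
#   ids = tf.constant([[31, 51, 99], [15, 5, 0]])  # [batch=2, seq=3]
#   emb, table = embedding_lookup(ids, vocab_size=32000, embedding_size=128)
#   # emb has shape [2, 3, 128] and table has shape [32000, 128]. With
#   # use_one_hot_embeddings=True the result is the same; only the lookup
#   # mechanism (matmul against a one-hot matrix) changes.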


def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table
      variable for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output
      tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is
    # always faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the
      # current sequence has positions [0, 1, 2, ... seq_length-1], so we can
      # just perform a slice.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`),
      # so we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output


def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create 3D attention mask from a 2D tensor mask.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank=2)
  to_seq_length = to_shape[1]

  to_mask = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume that `from_tensor` is a mask (although it could be). We
  # don't actually care if we attend *from* padding tokens (only *to* padding
  # tokens), so we create a tensor of all ones.
  #
  # `broadcast_ones` = [batch_size, from_seq_length, 1]
  broadcast_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Here we broadcast along two dimensions to create the mask.
  mask = broadcast_ones * to_mask

  return mask
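
# Worked example for `create_attention_mask_from_input_mask` (illustrative
# comment): for a single sequence of length 3 whose last position is padding,
# to_mask = [[1, 1, 0]], the broadcast produces
#
#   [[[1., 1., 0.],
#     [1., 1., 0.],
#     [1., 1., 0.]]]
#
# i.e. every `from` position may attend to the two real tokens but not to the
# padded one.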


def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  Is All You Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions
      in the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    initializer_range: float. Range of the weight initializer.
    do_return_2d_tensor: bool. If True, the output will be of shape
      [batch_size * from_seq_length, num_attention_heads * size_per_head]. If
      False, the output will be of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head].
    batch_size: (Optional) int. If the input is 2D, this might be the batch
      size of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])

    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or
        to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # `query_layer` = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer


def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # We keep the representation as a 2D tensor to avoid re-shaping it back and
  # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
  # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
  # help the optimizer.
  prev_output = reshape_to_matrix(input_tensor)

  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output


def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if name is None:
    name = tensor.name

  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape
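
# Illustrative sketch for `get_shape_list` (comment only): static dimensions
# come back as Python ints and unknown dimensions as scalar Tensors, e.g.
#
#   x = tf.placeholder(tf.float32, shape=[None, 128])
#   dims = get_shape_list(x, expected_rank=2)
#   # dims == [<dynamic batch-size Tensor>, 128]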


def reshape_to_matrix(input_tensor):
  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
  ndims = input_tensor.shape.ndims
  if ndims < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
  if ndims == 2:
    return input_tensor

  width = input_tensor.shape[-1]
  output_tensor = tf.reshape(input_tensor, [-1, width])
  return output_tensor


def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
  if len(orig_shape_list) == 2:
    return output_tensor

  output_shape = get_shape_list(output_tensor)

  orig_dims = orig_shape_list[0:-1]
  width = output_shape[-1]

  return tf.reshape(output_tensor, orig_dims + [width])


def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  """
  if name is None:
    name = tensor.name

  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):
    expected_rank_dict[expected_rank] = True
  else:
    for x in expected_rank:
      expected_rank_dict[x] = True

  actual_rank = tensor.shape.ndims
  if actual_rank not in expected_rank_dict:
    scope_name = tf.get_variable_scope().name
    raise ValueError(
        "For the tensor `%s` in scope `%s`, the actual rank "
        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))