import logging

import tensorflow as tf

from .configuration_electra import ElectraConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import (
    TFMaskedLanguageModelingLoss,
    TFQuestionAnsweringLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras_serializable,
    shape_list,
)
from .tokenization_utils import BatchEncoding


logger = logging.getLogger(__name__)

_TOKENIZER_FOR_DOC = "ElectraTokenizer"

TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/electra-small-generator",
    "google/electra-base-generator",
    "google/electra-large-generator",
    "google/electra-small-discriminator",
    "google/electra-base-discriminator",
    "google/electra-large-discriminator",
    # See all ELECTRA models at https://huggingface.co/models?filter=electra
]


class TFElectraEmbeddings(tf.keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.initializer_range = config.initializer_range

        self.position_embeddings = tf.keras.layers.Embedding(
            config.max_position_embeddings,
            config.embedding_size,
            embeddings_initializer=get_initializer(self.initializer_range),
            name="position_embeddings",
        )
        self.token_type_embeddings = tf.keras.layers.Embedding(
            config.type_vocab_size,
            config.embedding_size,
            embeddings_initializer=get_initializer(self.initializer_range),
            name="token_type_embeddings",
        )

        # self.LayerNorm is not snake-cased to stick with the TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def build(self, input_shape):
        """Build the shared word embedding layer."""
        with tf.name_scope("word_embeddings"):
            # Create and initialize weights. The random normal initializer was chosen
            # arbitrarily, and works well.
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )
        super().build(input_shape)

    def call(self, inputs, mode="embedding", training=False):
        """Get token embeddings of inputs.

        Args:
            inputs: list of four tensors with shape [batch_size, length]:
                (input_ids, position_ids, token_type_ids, inputs_embeds)
            mode: string, a valid value is one of "embedding" and "linear".
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) if mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, training=training)
        elif mode == "linear":
            return self._linear(inputs)
        else:
            raise ValueError("mode {} is not valid.".format(mode))

    def _embedding(self, inputs, training=False):
        """Applies embedding based on inputs tensor."""
        input_ids, position_ids, token_type_ids, inputs_embeds = inputs

        if input_ids is not None:
            input_shape = shape_list(input_ids)
        else:
            input_shape = shape_list(inputs_embeds)[:-1]

        seq_length = input_shape[1]
        if position_ids is None:
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        if inputs_embeds is None:
            inputs_embeds = tf.gather(self.word_embeddings, input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings, training=training)
        return embeddings

    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.

        Args:
            inputs: A float32 tensor with shape [batch_size, length, embedding_size]
        Returns:
            float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = shape_list(inputs)[0]
        length = shape_list(inputs)[1]

        x = tf.reshape(inputs, [-1, self.embedding_size])
        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)

        return tf.reshape(logits, [batch_size, length, self.vocab_size])


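# Usage sketch (illustrative; `config` stands for any ElectraConfig instance): the same layer is
# used in two modes. With mode="embedding" it maps token ids to vectors; with mode="linear" it
# reuses the word embedding matrix (weight tying) to project hidden states back onto the vocabulary,
# mirroring how TFElectraMainLayer and TFElectraMaskedLMHead call it below.
#
#     emb = TFElectraEmbeddings(config, name="embeddings")
#     ids = tf.constant([[31, 51, 99]])                          # (batch_size=1, length=3)
#     vectors = emb([ids, None, None, None], mode="embedding")   # (1, 3, embedding_size)
#     logits = emb(vectors, mode="linear")                       # (1, 3, vocab_size)

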
class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
        self.dense_prediction = tf.keras.layers.Dense(1, name="dense_prediction")
        self.config = config

    def call(self, discriminator_hidden_states, training=False):
        hidden_states = self.dense(discriminator_hidden_states)
        hidden_states = ACT2FN[self.config.hidden_act](hidden_states)
        logits = tf.squeeze(self.dense_prediction(hidden_states))

        return logits


class TFElectraGeneratorPredictions(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense")

    def call(self, generator_hidden_states, training=False):
        hidden_states = self.dense(generator_hidden_states)
        hidden_states = ACT2FN["gelu"](hidden_states)
        hidden_states = self.LayerNorm(hidden_states)

        return hidden_states


class TFElectraPreTrainedModel(TFBertPreTrainedModel):

    config_class = ElectraConfig
    base_model_prefix = "electra"

    def get_extended_attention_mask(self, attention_mask, input_shape):
        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # This attention mask is simpler than the triangular masking of causal attention
        # used in OpenAI GPT; we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        return extended_attention_mask

    def get_head_mask(self, head_mask):
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.config.num_hidden_layers

        return head_mask


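# Worked example (illustrative) of what get_extended_attention_mask computes: a padding mask of
# shape (batch_size, seq_len) becomes an additive bias of shape (batch_size, 1, 1, seq_len) that
# broadcasts over heads and query positions.
#
#     mask = tf.constant([[1, 1, 1, 0]])  # last position is padding
#     bias = (1.0 - tf.cast(mask[:, tf.newaxis, tf.newaxis, :], tf.float32)) * -10000.0
#     # bias is 0.0 for kept positions and -10000.0 for the padded one; added to the raw attention
#     # scores before the softmax, it effectively removes the padded position from attention.

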
@keras_serializable
class TFElectraMainLayer(TFElectraPreTrainedModel):

    config_class = ElectraConfig

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        self.embeddings = TFElectraEmbeddings(config, name="embeddings")

        if config.embedding_size != config.hidden_size:
            self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project")
        self.encoder = TFBertEncoder(config, name="encoder")
        self.config = config

    def get_input_embeddings(self):
        return self.embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value
        self.embeddings.vocab_size = value.shape[0]

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.

        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        raise NotImplementedError

    def call(
        self,
        inputs,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        training=False,
    ):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
            output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
            assert len(inputs) <= 8, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            output_attentions = inputs.get("output_attentions", output_attentions)
            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
            assert len(inputs) <= 8, "Too many inputs."
        else:
            input_ids = inputs

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
        head_mask = self.get_head_mask(head_mask)

        hidden_states = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)

        if hasattr(self, "embeddings_project"):
            hidden_states = self.embeddings_project(hidden_states, training=training)

        hidden_states = self.encoder(
            [hidden_states, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
            training=training,
        )

        return hidden_states


ELECTRA_START_DOCSTRING = r"""
    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
    Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.

    .. note::

        TF 2.0 models accept two formats as inputs:

        - having all inputs as keyword arguments (like PyTorch models), or
        - having all inputs as a list, tuple or dict in the first positional argument.

        This second option is useful when using the :obj:`tf.keras.Model.fit()` method, which currently requires
        having all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument:

        - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

ELECTRA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.ElectraTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence token in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`__
        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate the dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attention tensors of all attention layers are returned. See ``attentions`` under
            returned tensors for more detail.
"""


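# Usage sketch (illustrative) of the three input formats described in ELECTRA_START_DOCSTRING,
# shown with TFElectraModel and the "google/electra-small-discriminator" checkpoint used in the
# docstrings below:
#
#     from transformers import ElectraTokenizer, TFElectraModel
#
#     tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
#     model = TFElectraModel.from_pretrained("google/electra-small-discriminator")
#     enc = tokenizer("ELECTRA detects replaced tokens", return_tensors="tf")
#     input_ids, attention_mask = enc["input_ids"], enc["attention_mask"]
#
#     outputs = model(input_ids)                                                    # single tensor
#     outputs = model([input_ids, attention_mask])                                  # list, docstring order
#     outputs = model({"input_ids": input_ids, "attention_mask": attention_mask})   # dict
#     last_hidden_state = outputs[0]

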
@add_start_docstrings(
    "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to "
    "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the "
    "hidden size and embedding size are different. "
    "Both the generator and discriminator checkpoints may be loaded into this model.",
    ELECTRA_START_DOCSTRING,
)
class TFElectraModel(TFElectraPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.electra = TFElectraMainLayer(config, name="electra")

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def call(self, inputs, **kwargs):
        r"""
        Returns:
            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
            last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
                Sequence of hidden-states at the output of the last layer of the model.
            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
                tuple of :obj:`tf.Tensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

                Attention weights after the attention softmax, used to compute the weighted average in the
                self-attention heads.
        """
        outputs = self.electra(inputs, **kwargs)
        return outputs


@add_start_docstrings(
    """Electra model with a binary classification head on top as used during pre-training for identifying generated
    tokens.

    Even though both the discriminator and generator may be loaded into this model, the discriminator is
    the only model of the two to have the correct classification head to be used for this model.""",
    ELECTRA_START_DOCSTRING,
)
class TFElectraForPreTraining(TFElectraPreTrainedModel):
    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)

        self.electra = TFElectraMainLayer(config, name="electra")
        self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions")

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        training=False,
    ):
        r"""
        Returns:
            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
            scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
                Prediction scores of the head (scores for each token before SoftMax).
            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
                tuple of :obj:`tf.Tensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

                Attention weights after the attention softmax, used to compute the weighted average in the
                self-attention heads.

        Examples::

            import tensorflow as tf
            from transformers import ElectraTokenizer, TFElectraForPreTraining

            tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
            model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
            outputs = model(input_ids)
            scores = outputs[0]
        """

        discriminator_hidden_states = self.electra(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            training=training,
        )
        discriminator_sequence_output = discriminator_hidden_states[0]
        logits = self.discriminator_predictions(discriminator_sequence_output)
        output = (logits,)
        output += discriminator_hidden_states[1:]

        return output  # scores, (hidden_states), (attentions)


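# Usage sketch (illustrative; continues the Examples:: block above): the per-token `scores`
# returned by TFElectraForPreTraining are raw logits from the replaced-token-detection head, so
# probabilities and hard predictions can be derived with a sigmoid.
#
#     probs = tf.nn.sigmoid(scores)                       # probability that each token was replaced
#     predictions = tf.cast(tf.round(probs), tf.int32)    # 1 = "replaced", 0 = "original"
#     for token, pred in zip(tokenizer.convert_ids_to_tokens(input_ids[0]), predictions[0].numpy()):
#         print(token, int(pred))

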
class TFElectraMaskedLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)

    def call(self, hidden_states, training=False):
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
        hidden_states = hidden_states + self.bias
        return hidden_states


@add_start_docstrings(
    """Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is
    the only model of the two to have been trained for the masked language modeling task.""",
    ELECTRA_START_DOCSTRING,
)
class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss):
    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)

        self.vocab_size = config.vocab_size
        self.electra = TFElectraMainLayer(config, name="electra")
        self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions")
        if isinstance(config.hidden_act, str):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act
        self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head")

    def get_output_embeddings(self):
        return self.generator_lm_head

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        labels=None,
        training=False,
    ):
        r"""
        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with
            labels in ``[0, ..., config.vocab_size]``.

        Returns:
            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
            prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
                Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
                tuple of :obj:`tf.Tensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

                Attention weights after the attention softmax, used to compute the weighted average in the
                self-attention heads.
        """
        if isinstance(input_ids, (tuple, list)):
            labels = input_ids[8] if len(input_ids) > 8 else labels
            if len(input_ids) > 8:
                input_ids = input_ids[:8]
        elif isinstance(input_ids, (dict, BatchEncoding)):
            labels = input_ids.pop("labels", labels)

        generator_hidden_states = self.electra(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            training=training,
        )
        generator_sequence_output = generator_hidden_states[0]
        prediction_scores = self.generator_predictions(generator_sequence_output, training=training)
        prediction_scores = self.generator_lm_head(prediction_scores, training=training)
        output = (prediction_scores,)
        output += generator_hidden_states[1:]

        if labels is not None:
            loss = self.compute_loss(labels, prediction_scores)
            output = (loss,) + output

        return output  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)


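# Usage sketch (illustrative; assumes the "google/electra-small-generator" checkpoint named in
# @add_code_sample_docstrings above): masked language modeling with labels, showing the output
# order noted in the return comment above. For a real loss, label positions that are not masked
# would normally be set to -100 as described in the `labels` docstring.
#
#     import tensorflow as tf
#     from transformers import ElectraTokenizer, TFElectraForMaskedLM
#
#     tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-generator")
#     model = TFElectraForMaskedLM.from_pretrained("google/electra-small-generator")
#
#     inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
#     labels = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
#     outputs = model(inputs, labels=labels)
#     loss, prediction_scores = outputs[0], outputs[1]   # (masked_lm_loss), prediction_scores, ...

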
@add_start_docstrings(
    """Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.""",
    ELECTRA_START_DOCSTRING,
)
class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)

        self.electra = TFElectraMainLayer(config, name="electra")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def call(
        self,
        inputs=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        labels=None,
        training=False,
    ):
        r"""
        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

        Returns:
            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
            scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
                Classification scores (before SoftMax).
            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
                tuple of :obj:`tf.Tensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

                Attention weights after the attention softmax, used to compute the weighted average in the
                self-attention heads.
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
            if len(inputs) > 8:
                inputs = inputs[:8]
        elif isinstance(inputs, (dict, BatchEncoding)):
            labels = inputs.pop("labels", labels)

        discriminator_hidden_states = self.electra(
            inputs,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            training=training,
        )
        discriminator_sequence_output = discriminator_hidden_states[0]
        discriminator_sequence_output = self.dropout(discriminator_sequence_output, training=training)
        logits = self.classifier(discriminator_sequence_output)

        outputs = (logits,) + discriminator_hidden_states[1:]

        if labels is not None:
            loss = self.compute_loss(labels, logits)
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


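# Usage sketch (illustrative; `num_labels=2` is an arbitrary value chosen for the example): token
# classification with the discriminator checkpoint named in @add_code_sample_docstrings above.
#
#     import tensorflow as tf
#     from transformers import ElectraTokenizer, TFElectraForTokenClassification
#
#     tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
#     model = TFElectraForTokenClassification.from_pretrained(
#         "google/electra-small-discriminator", num_labels=2
#     )
#
#     inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
#     logits = model(inputs)[0]                    # (batch_size, sequence_length, num_labels)
#     predictions = tf.argmax(logits, axis=-1)     # predicted label id per token

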
@add_start_docstrings(
    """Electra Model with a span classification head on top for extractive question-answering tasks like SQuAD
    (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
    ELECTRA_START_DOCSTRING,
)
class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.electra = TFElectraMainLayer(config, name="electra")
        self.qa_outputs = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def call(
        self,
        inputs=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        start_positions=None,
        end_positions=None,
        training=False,
    ):
        r"""
        start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.

        Returns:
            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
            start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
                Span-start scores (before SoftMax).
            end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
                Span-end scores (before SoftMax).
            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
                tuple of :obj:`tf.Tensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

                Attention weights after the attention softmax, used to compute the weighted average in the
                self-attention heads.
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[8] if len(inputs) > 8 else start_positions
            end_positions = inputs[9] if len(inputs) > 9 else end_positions
            if len(inputs) > 8:
                inputs = inputs[:8]
        elif isinstance(inputs, (dict, BatchEncoding)):
            start_positions = inputs.pop("start_positions", start_positions)
            end_positions = inputs.pop("end_positions", end_positions)

        discriminator_hidden_states = self.electra(
            inputs,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            training=training,
        )
        discriminator_sequence_output = discriminator_hidden_states[0]

        logits = self.qa_outputs(discriminator_sequence_output)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        outputs = (start_logits, end_logits,) + discriminator_hidden_states[1:]

        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            loss = self.compute_loss(labels, outputs[:2])
            outputs = (loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
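

# Usage sketch (illustrative): extracting an answer span from the start/end logits. A checkpoint
# fine-tuned on SQuAD would normally be used; the discriminator checkpoint name is reused here
# only to match the docstrings above, and the question/context strings are made up for the example.
#
#     import tensorflow as tf
#     from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
#
#     tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
#     model = TFElectraForQuestionAnswering.from_pretrained("google/electra-small-discriminator")
#
#     question = "Where is the Eiffel Tower?"
#     context = "The Eiffel Tower is located in Paris."
#     inputs = tokenizer(question, context, return_tensors="tf")
#     start_logits, end_logits = model(inputs)[:2]
#     start = int(tf.argmax(start_logits, axis=-1)[0])
#     end = int(tf.argmax(end_logits, axis=-1)[0])
#     answer_ids = inputs["input_ids"][0, start : end + 1]
#     print(tokenizer.decode(answer_ids))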