# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OpenAI GPT model."""


import json
import logging
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from .activations import gelu_new, swish
from .configuration_openai import OpenAIGPTConfig
from .file_utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_callable,
    replace_return_docstrings,
)
from .modeling_outputs import BaseModelOutput, CausalLMOutput
from .modeling_utils import (
    Conv1D,
    PreTrainedModel,
    SequenceSummary,
    find_pruneable_heads_and_indices,
    prune_conv1d_layer,
)


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "OpenAIGPTConfig"
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"

OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "openai-gpt",
    # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
]


def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
    """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
    """
    import re
    import numpy as np

    if ".ckpt" in openai_checkpoint_folder_path:
        openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path)

    logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))

    with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle:
        names = json.load(names_handle)
    with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle:
        shapes = json.load(shapes_handle)
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]

    # This was used when we had a single embedding matrix for positions and tokens
    # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
    # del init_params[1]
    init_params = [arr.squeeze() for arr in init_params]

    try:
        assert model.tokens_embed.weight.shape == init_params[1].shape
        assert model.positions_embed.weight.shape == init_params[0].shape
    except AssertionError as e:
        e.args += (model.tokens_embed.weight.shape, init_params[1].shape)
        e.args += (model.positions_embed.weight.shape, init_params[0].shape)
        raise

    model.tokens_embed.weight.data = torch.from_numpy(init_params[1])
    model.positions_embed.weight.data = torch.from_numpy(init_params[0])
    names.pop(0)
    # Pop position and token embedding arrays
    init_params.pop(0)
    init_params.pop(0)

    for name, array in zip(names, init_params):  # names[1:n_transfer], init_params[1:n_transfer]):
        name = name[6:]  # skip "model/"
        assert name[-2:] == ":0"
        name = name[:-2]
        name = name.split("/")
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "w":
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
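

# Illustrative sketch (not a definitive recipe) of calling the loader above
# directly. The checkpoint directory is a hypothetical placeholder; it must
# contain the parameters_names.json, params_shapes.json and
# params_0.npy ... params_9.npy files that load_tf_weights_in_openai_gpt reads:
#
#     config = OpenAIGPTConfig()
#     model = OpenAIGPTModel(config)
#     load_tf_weights_in_openai_gpt(model, config, "/path/to/openai_checkpoint")

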
# All entries are plain callables applied to a tensor in MLP.forward, so the
# functional form of ReLU is used rather than the nn.ReLU module class.
ACT_FNS = {"relu": nn.functional.relu, "swish": swish, "gelu": gelu_new}


class Attention(nn.Module):
    def __init__(self, nx, n_ctx, config, scale=False):
        super().__init__()
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale

        self.c_attn = Conv1D(n_state * 3, nx)
        self.c_proj = Conv1D(n_state, nx)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_head, self.split_size // self.n_head, self.pruned_heads
        )
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
        # Prune conv1d layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
        # Update hyper params
        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
        self.n_head = self.n_head - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
        # XD: self.b may be larger than w, so we need to crop it
        b = self.bias[:, :, : w.size(-2), : w.size(-1)]
        w = w * b + -1e4 * (1 - b)

        if attention_mask is not None:
            # Apply the attention mask
            w = w + attention_mask

        w = nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [torch.matmul(w, v)]
        if output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)
        else:
            return x.permute(0, 2, 1, 3)

    def forward(self, x, attention_mask=None, head_mask=None, output_attentions=False):
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)

        attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)

        outputs = [a] + attn_outputs[1:]
        return outputs  # a, (attentions)
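

# Shape walkthrough for Attention.forward (a sketch assuming the default
# OpenAIGPTConfig with n_embd=768 and n_head=12, so head_dim = 768 // 12 = 64):
#
#     x:                      (batch, seq, 768)
#     c_attn(x):              (batch, seq, 3 * 768)  -> split into query, key, value
#     split_heads(q):         (batch, 12, seq, 64)
#     split_heads(k, k=True): (batch, 12, 64, seq)   # pre-transposed for the matmul
#     w = q @ k:              (batch, 12, seq, seq)  # causal bias + softmax applied here
#     w @ v:                  (batch, 12, seq, 64)
#     merge_heads(...):       (batch, seq, 768)      -> c_proj -> resid_dropout

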
class MLP(nn.Module):
    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
        super().__init__()
        nx = config.n_embd
        self.c_fc = Conv1D(n_state, nx)
        self.c_proj = Conv1D(nx, n_state)
        self.act = ACT_FNS[config.afn]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return self.dropout(h2)


class Block(nn.Module):
    def __init__(self, n_ctx, config, scale=False):
        super().__init__()
        nx = config.n_embd
        self.attn = Attention(nx, n_ctx, config, scale)
        self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.mlp = MLP(4 * nx, config)
        self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)

    def forward(self, x, attention_mask=None, head_mask=None, output_attentions=False):
        attn_outputs = self.attn(
            x, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions,
        )
        a = attn_outputs[0]

        n = self.ln_1(x + a)
        m = self.mlp(n)
        h = self.ln_2(n + m)

        outputs = [h] + attn_outputs[1:]
        return outputs
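

# Residual structure of a single Block (post-LayerNorm, as in the original GPT;
# every intermediate tensor keeps the shape (batch, seq, n_embd)):
#
#     a = attn(x)        # masked self-attention
#     n = ln_1(x + a)    # residual connection, then LayerNorm
#     m = mlp(n)         # two Conv1D layers with inner size 4 * n_embd
#     h = ln_2(n + m)    # second residual + LayerNorm = block output

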
class OpenAIGPTPreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = OpenAIGPTConfig
    load_tf_weights = load_tf_weights_in_openai_gpt
    base_model_prefix = "transformer"
    authorized_missing_keys = [r"position_ids"]

    def _init_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@dataclass
class OpenAIGPTDoubleHeadsModelOutput(ModelOutput):
    """
    Base class for outputs of :class:`~transformers.OpenAIGPTDoubleHeadsModel`, which has a language modeling head
    and a multiple-choice classification head on top.

    Args:
        lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
            Language modeling loss.
        mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
            Multiple choice classification loss.
        lm_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    lm_loss: Optional[torch.FloatTensor] = None
    mc_loss: Optional[torch.FloatTensor] = None
    lm_logits: torch.FloatTensor = None
    mc_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


OPENAI_GPT_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

OPENAI_GPT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence token in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attention tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
            plain tuple.
"""
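

# Input-preparation sketch for the arguments documented above, using the
# callable tokenizer API referenced there (``PreTrainedTokenizer.__call__``);
# the sentence is an arbitrary example:
#
#     from transformers import OpenAIGPTTokenizer
#
#     tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
#     inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#     inputs["input_ids"]       # (batch_size, sequence_length) token indices
#     inputs["attention_mask"]  # 1 for tokens to attend to, 0 for padding

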
@add_start_docstrings(
    "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
    OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])

        self.register_buffer("position_ids", torch.arange(config.n_positions))
        self.init_weights()

    def get_input_embeddings(self):
        return self.tokens_embed

    def set_input_embeddings(self, new_embeddings):
        self.tokens_embed = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="openai-gpt",
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if position_ids is None:
            # Code is different from when we had a single embedding matrix for position and token embeddings
            position_ids = self.position_ids[None, : input_shape[-1]]

        # Attention mask.
        if attention_mask is not None:
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # This padding mask is simpler than the triangular causal mask applied inside the
            # attention layers; we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.tokens_embed(input_ids)
        position_embeds = self.positions_embed(position_ids)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
            token_type_embeds = self.tokens_embed(token_type_ids)
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)

            outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions=output_attentions)
            hidden_states = outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (outputs[1],)

        hidden_states = hidden_states.view(*output_shape)
        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions,
        )
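

# Minimal usage sketch for the bare model (weights come from the "openai-gpt"
# checkpoint listed in OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST above):
#
#     from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
#
#     tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
#     model = OpenAIGPTModel.from_pretrained("openai-gpt", return_dict=True)
#     inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#     outputs = model(**inputs)
#     outputs.last_hidden_state  # (batch_size, sequence_length, n_embd)

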
@add_start_docstrings(
    """OpenAI GPT Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """,
    OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = OpenAIGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="openai-gpt",
        output_type=CausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``.
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``.
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss,
            logits=lm_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
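

# Usage sketch mirroring the `labels` documentation above: feeding the inputs
# back in as labels yields the causal language-modeling loss (the one-token
# shift happens inside forward):
#
#     from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
#
#     tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
#     model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt", return_dict=True)
#     inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#     outputs = model(**inputs, labels=inputs["input_ids"])
#     outputs.loss    # scalar cross-entropy loss
#     outputs.logits  # (batch_size, sequence_length, vocab_size)

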
@add_start_docstrings(
    """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
    The language modeling head has its weights tied to the input embeddings; the classification head
    takes as input the hidden state of a specified classification token index in the input sequence.
    """,
    OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        config.num_labels = 1
        self.transformer = OpenAIGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=OpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        mc_token_ids=None,
        labels=None,
        mc_labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to index of the last token of the input):
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1]``.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``.
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``.
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``.
        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        kwargs (:obj:`Dict[str, any]`, `optional`, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Return:

        Examples::

            from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
            import torch

            tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
            model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
            tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
            model.resize_token_embeddings(len(tokenizer))

            choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
            input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
            mc_token_ids = torch.tensor([input_ids.size(-1) - 1, input_ids.size(-1) - 1]).unsqueeze(0)  # Batch size 1

            outputs = model(input_ids, mc_token_ids=mc_token_ids)
            lm_logits = outputs.lm_logits
            mc_logits = outputs.mc_logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if "lm_labels" in kwargs:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)
        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)

        lm_loss, mc_loss = None, None
        if mc_labels is not None:
            loss_fct = CrossEntropyLoss()
            mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
        if labels is not None:
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits, mc_logits) + transformer_outputs[1:]
            if mc_loss is not None:
                output = (mc_loss,) + output
            return ((lm_loss,) + output) if lm_loss is not None else output

        return OpenAIGPTDoubleHeadsModelOutput(
            lm_loss=lm_loss,
            mc_loss=mc_loss,
            lm_logits=lm_logits,
            mc_logits=mc_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )