# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch RoBERTa model. """


import logging
import warnings

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

from .configuration_roberta import RobertaConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
from .modeling_outputs import (
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "RobertaConfig"
_TOKENIZER_FOR_DOC = "RobertaTokenizer"

ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "roberta-base",
    "roberta-large",
    "roberta-large-mnli",
    "distilroberta-base",
    "roberta-base-openai-detector",
    "roberta-large-openai-detector",
    # See all RoBERTa models at https://huggingface.co/models?filter=roberta
]


class RobertaEmbeddings(BertEmbeddings):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        return super().forward(
            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
        )

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """ We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.

        :param torch.Tensor inputs_embeds:
        :return torch.Tensor:
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


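# Note on the indexing tweak above (an explanatory sketch added here, not part of the original
# CSS-LM code): because position ids start at ``padding_idx + 1`` rather than 0, RoBERTa configs
# reserve ``max_position_embeddings = 514`` to cover 512 usable positions plus the offset of 2.
#
#   >>> # assuming config.pad_token_id == 1 (the RoBERTa default) and a 4-token input
#   >>> embeddings = RobertaEmbeddings(config)
#   >>> embeddings.create_position_ids_from_inputs_embeds(torch.zeros(1, 4, config.hidden_size))
#   tensor([[2, 3, 4, 5]])

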
ROBERTA_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the
            model weights.
"""

ROBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.RobertaTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
            plain tuple.
"""


@add_start_docstrings(
    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    ROBERTA_START_DOCSTRING,
)
class RobertaModel(BertModel):
    """
    This class overrides :class:`~transformers.BertModel`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.embeddings = RobertaEmbeddings(config)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value


@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class RobertaForMaskedLMDomainTask(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)
        #config.num_labels = 8 # add
        self.classifier = RobertaClassificationHead(config)
        #self.classifier = RobertaClassificationHeadandTail(config)
        self.num_labels = config.num_labels

        self.init_weights()
        ###
        #self.domain_layer = torch.nn.Linear(768,768,bias=False)
        #self.task_layer = torch.nn.Linear(768,768,bias=False)
        #torch.nn.init.xavier_uniform_(self.domain_layer.weight)
        #self.act = nn.ReLU()
        #self.layer_out_domainClass = nn.Linear(768,2) #num_class
        #self.layer_out_taskClass = nn.Linear(768,8,bias=True) #num_class
        ###
        self.LeakyReLU = torch.nn.LeakyReLU()
        # Binary discriminators: domain classification over single sentence representations and
        # task classification over concatenated pairs of sentence representations.
        # NOTE: the original code hard-coded the hidden size to 768; using config.hidden_size
        # keeps this working for any RoBERTa size.
        self.domain_binary_classifier = nn.Linear(config.hidden_size, 2, bias=True)  # num_class
        self.task_binary_classifier = nn.Linear(config.hidden_size * 2, 2, bias=True)  # num_class
        #self.act = nn.ReLU()
        ###

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        input_ids_org=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        func=None,
        tail_idxs=None,
        in_domain_rep=None,
        out_domain_rep=None,
        sentence_label=None,
        lm_label=None,
        batch_size=None,
        all_in_task_rep_comb=None,
        all_sentence_binary_label=None,
        from_query=False,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if func == "in_domain_task_rep":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # <s> representation ("head", equiv. to [CLS]) and </s> representation ("tail", token id 2)
            rep_head = outputs.last_hidden_state[:, 0, :]
            rep_tail = outputs.last_hidden_state[input_ids_org == 2]
            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep_tail, rep_head

        elif func == "in_domain_task_rep_mean":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # Mean-pool the non-zero hidden states as a single sentence representation.
            rep = outputs.last_hidden_state
            mask = rep != 0
            rep = (rep * mask).sum(dim=1) / mask.sum(dim=1)

            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep, rep

        elif func == "return_task_binary_classifier":
            return self.task_binary_classifier.weight.data, self.task_binary_classifier.bias.data

        elif func == "return_domain_binary_classifier":
            return self.domain_binary_classifier.weight.data, self.domain_binary_classifier.bias.data

        #if func == "task_binary_classifier":

        elif func == "domain_binary_classifier":
            # in-domain: 1, out-of-domain: 0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is not included here, so the caller has to pass in_domain_rep in.
            loss_fct = CrossEntropyLoss()
            out_domain_rep_head = outputs.last_hidden_state[:, 0, :]
            out_domain_rep_tail = outputs.last_hidden_state[input_ids_org == 2]
            domain_rep = torch.cat([in_domain_rep, out_domain_rep_tail], 0)
            #detach
            #domain_rep = domain_rep.detach()
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            # NOTE: the original code built the targets with .to("cuda"); creating them on the
            # representation's device keeps this branch usable on CPU as well.
            pos_target = torch.tensor([1] * in_domain_rep.shape[0], device=in_domain_rep.device)
            neg_target = torch.tensor([0] * out_domain_rep_tail.shape[0], device=in_domain_rep.device)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)

            return domain_loss, logit, out_domain_rep_head, out_domain_rep_tail

        elif func == "domain_binary_classifier_mean":
            # in-domain: 1, out-of-domain: 0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is not included here, so the caller has to pass in_domain_rep in.
            loss_fct = CrossEntropyLoss()
            out_domain_rep = outputs.last_hidden_state
            ###
            mask = out_domain_rep != 0
            out_domain_rep = (out_domain_rep * mask).sum(dim=1) / mask.sum(dim=1)
            ###
            domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0)
            #detach
            #domain_rep = domain_rep.detach()
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            pos_target = torch.tensor([1] * in_domain_rep.shape[0], device=in_domain_rep.device)
            neg_target = torch.tensor([0] * out_domain_rep.shape[0], device=in_domain_rep.device)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)
            return domain_loss, logit

        elif func == "task_binary_classifier":
            # The query rep is not included here; the caller builds all_in_task_rep_comb beforehand.
            loss_fct = CrossEntropyLoss()
            #detach
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]
            )
            logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1))
            return task_binary_loss, logit

        elif func == "task_binary_classifier_mean":
            # The query rep is not included here; the caller builds all_in_task_rep_comb beforehand.
            loss_fct = CrossEntropyLoss()
            #detach
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]
            )
            logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1))
            return task_binary_loss, logit

        elif func == "task_class":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is already included in this batch.
            loss_fct = CrossEntropyLoss()
            ###
            #class_logit = self.classifier(outputs.last_hidden_state, input_ids_org)
            class_logit = self.classifier(outputs.last_hidden_state)
            task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))

            if from_query:
                query_rep_head = outputs.last_hidden_state[:, 0, :]
                query_rep_tail = outputs.last_hidden_state[input_ids_org == 2]
                return task_loss, class_logit, query_rep_head, query_rep_tail
            else:
                return task_loss, class_logit

        elif func == "mlm":
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            #sequence_output = outputs_mlm.last_hidden_state
            sequence_output = outputs_mlm[0]
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
            return masked_lm_loss

        elif func == "task_class and mlm":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #######
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is already included in this batch.
            # task loss
            loss_fct = CrossEntropyLoss()
            ###
            '''
            #rep = outputs.last_hidden_state[input_ids==2]
            rep = outputs.last_hidden_state[:, 0, :]
            #rep = rep.detach()
            task_rep = self.task_layer(rep)
            class_logit = self.layer_out_taskClass((self.act(task_rep)))
            '''
            class_logit = self.classifier(outputs.last_hidden_state)
            ###
            # NOTE: the original code hard-coded 8 classes here; self.num_labels keeps this
            # consistent with the config, as in the "task_class" branch above.
            task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))

            # mlm loss
            sequence_output = outputs_mlm.last_hidden_state
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
            return task_loss, masked_lm_loss

        elif func == "gen_rep":
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            return outputs

        '''
        #mlm
        outputs_mlm = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs_mlm[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            #loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        '''


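# Hedged usage sketch for RobertaForMaskedLMDomainTask (added for clarity; the names `batch_ids`,
# `batch_mask`, `masked_ids`, `mlm_labels` and `labels` are assumptions, not part of the original
# training scripts). The `func` argument selects which CSS-LM objective a forward pass computes:
#
#   >>> model = RobertaForMaskedLMDomainTask.from_pretrained("roberta-base")
#   >>> # contrastive/domain representations of labelled in-domain sentences (</s> and <s> reps)
#   >>> rep_tail, rep_head = model(input_ids_org=batch_ids, attention_mask=batch_mask,
#   ...                            func="in_domain_task_rep")
#   >>> # masked-LM loss on retrieved (possibly out-of-domain) sentences
#   >>> mlm_loss = model(input_ids=masked_ids, attention_mask=batch_mask,
#   ...                  lm_label=mlm_labels, func="mlm")
#   >>> # task classification loss on labelled sentences
#   >>> task_loss, logits = model(input_ids_org=batch_ids, attention_mask=batch_mask,
#   ...                           sentence_label=labels, func="task_class")

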
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            #loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class RobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x


@add_start_docstrings(
    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
    on top of the pooled output) e.g. for GLUE tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForSequenceClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        #config.num_labels = 8 # add (can remove)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        #self.classifier = RobertaClassificationHeadandTail(config)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        #####
        #return outputs
        #####

        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


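# Minimal fine-tuning sketch for RobertaForSequenceClassification (illustrative only; assumes a
# GLUE-style list of `texts` and a LongTensor of `labels` exist, and that RobertaTokenizer is
# importable from this package as in the upstream transformers layout).
#
#   >>> from .tokenization_roberta import RobertaTokenizer
#   >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   >>> model = RobertaForSequenceClassification.from_pretrained("roberta-base")
#   >>> inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
#   >>> outputs = model(**inputs, labels=labels, return_dict=True)
#   >>> outputs.loss.backward()

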
@add_start_docstrings(
    """Roberta Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForMultipleChoice(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


@add_start_docstrings(
    """Roberta Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForTokenClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        #x = features[input_ids==2]  # take </s> token (equiv. to the last token)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class RobertaClassificationHeadandTail(nn.Module):
    """Head for sentence-level classification tasks, using both the <s> and </s> token representations."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size * 2)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size * 2, config.num_labels)
        self.num_labels = config.num_labels

    def forward(self, features, input_ids, **kwargs):
        head = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        tail = features[input_ids == 2]  # take </s> token (equiv. to the last token)
        x = torch.cat((head, tail), -1)  # (batch_size, hidden_size * 2)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

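# How the head-and-tail variant is meant to be called (a sketch added for clarity, assuming
# RoBERTa's </s> token id of 2): unlike RobertaClassificationHead, it needs the input ids so it
# can locate the </s> token, and it expects exactly one </s> per sequence so that the head and
# tail tensors line up for concatenation.
#
#   >>> head = RobertaClassificationHeadandTail(config)
#   >>> logits = head(last_hidden_state, input_ids)  # last_hidden_state: (batch, seq_len, hidden)
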
@add_start_docstrings(
    """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForQuestionAnswering(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx):
    """ Replace non-padding symbols with their position numbers. Position numbers begin at
    padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
    `utils.make_positions`.

    :param torch.Tensor input_ids:
    :return torch.Tensor:
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx
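
# Worked example for create_position_ids_from_input_ids (added for illustration; assumes RoBERTa's
# pad token id of 1 and the usual "Hello world" encoding):
#
#   >>> input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])  # <s> Hello world </s> <pad> <pad>
#   >>> create_position_ids_from_input_ids(input_ids, padding_idx=1)
#   tensor([[2, 3, 4, 5, 1, 1]])  # non-pad tokens count up from padding_idx + 1; pads stay at padding_idx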