# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch RoBERTa model. """


import logging
import warnings

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

from .configuration_roberta import RobertaConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
from .modeling_outputs import (
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "RobertaConfig"
_TOKENIZER_FOR_DOC = "RobertaTokenizer"

ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "roberta-base",
    "roberta-large",
    "roberta-large-mnli",
    "distilroberta-base",
    "roberta-base-openai-detector",
    "roberta-large-openai-detector",
    # See all RoBERTa models at https://huggingface.co/models?filter=roberta
]


class RobertaEmbeddings(BertEmbeddings):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        return super().forward(
            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
        )

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.

        :param torch.Tensor inputs_embeds:
        :return torch.Tensor:
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)

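
# A brief illustrative sketch (not used by the model) of the sequential position ids produced
# by create_position_ids_from_inputs_embeds above: assuming RoBERTa's pad_token_id of 1 as the
# padding_idx, positions for a length-T input run from 2 to T + 1, since index 1 is reserved
# for padding. The helper name below is hypothetical.
def _demo_position_ids_from_inputs_embeds(seq_len=5, padding_idx=1):
    # e.g. seq_len=5 -> tensor([2, 3, 4, 5, 6])
    return torch.arange(padding_idx + 1, seq_len + padding_idx + 1, dtype=torch.long)
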

ROBERTA_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the
            model weights.
"""

ROBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.RobertaTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence token in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under
            returned tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned
            tensors for more detail.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
            plain tuple.
"""


@add_start_docstrings(
    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    ROBERTA_START_DOCSTRING,
)
class RobertaModel(BertModel):
    """
    This class overrides :class:`~transformers.BertModel`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.embeddings = RobertaEmbeddings(config)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value


@add_start_docstrings(
    """RoBERTa Model with a `language modeling` head plus the CSS-LM task and domain classification heads on top. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForMaskedLMDomainTask(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)
        #self.classifier = RobertaClassificationHead(config)
        self.classifier = RobertaClassificationHeadandTail(config)
        self.num_labels = config.num_labels

        self.init_weights()
        ###
        #self.domain_layer = torch.nn.Linear(768,768,bias=False)
        #self.task_layer = torch.nn.Linear(768,768,bias=False)
        #torch.nn.init.xavier_uniform_(self.domain_layer.weight)
        #self.act = nn.ReLU()
        #self.layer_out_domainClass = nn.Linear(768,2) #num_class
        #self.layer_out_taskClass = nn.Linear(768,8,bias=True) #num_class
        ###
        self.LeakyReLU = torch.nn.LeakyReLU()
        self.domain_binary_classifier = nn.Linear(768, 2, bias=True)  # in-domain (1) vs. out-of-domain (0)
        self.task_binary_classifier = nn.Linear(768 * 2, 2, bias=True)  # binary decision over concatenated sentence-pair reps
        #self.act = nn.ReLU()
        ###

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        input_ids_org=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        func=None,
        tail_idxs=None,
        in_domain_rep=None,
        out_domain_rep=None,
        sentence_label=None,
        lm_label=None,
        batch_size=None,
        all_in_task_rep_comb=None,
        all_sentence_binary_label=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if func == "in_domain_task_rep":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            rep_head = outputs.last_hidden_state[:, 0, :]  # take <s> token (equiv. to [CLS])
            rep_tail = outputs.last_hidden_state[input_ids_org == 2]  # take </s> token (RoBERTa eos id 2)
            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep_tail, rep_head

        elif func == "in_domain_task_rep_mean":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # Mean-pool over the sequence, ignoring entries whose hidden values are exactly zero.
            rep = outputs.last_hidden_state
            mask = rep != 0
            rep = (rep * mask).sum(dim=1) / mask.sum(dim=1)

            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep, rep

        elif func == "return_task_binary_classifier":
            return self.task_binary_classifier.weight.data, self.task_binary_classifier.bias.data

        elif func == "return_domain_binary_classifier":
            return self.domain_binary_classifier.weight.data, self.domain_binary_classifier.bias.data

        #if func == "task_binary_classifier":

        elif func == "domain_binary_classifier":
            # in-domain: 1, out-of-domain: 0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is not included here, so the pre-computed in_domain_rep has to be passed in.
            loss_fct = CrossEntropyLoss()
            #out_domain_rep = outputs.last_hidden_state[:, 0, :]
            out_domain_rep = outputs.last_hidden_state[input_ids_org == 2]  # </s> token (RoBERTa eos id 2)
            domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0)
            #detach_on
            ###
            #domain_rep = domain_rep.detach()
            ###
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            pos_target = torch.tensor([1] * in_domain_rep.shape[0], device=domain_rep.device)
            neg_target = torch.tensor([0] * out_domain_rep.shape[0], device=domain_rep.device)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)
            return domain_loss, logit


        elif func == "domain_binary_classifier_mean":
            # in-domain: 1, out-of-domain: 0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is not included here, so the pre-computed in_domain_rep has to be passed in.
            loss_fct = CrossEntropyLoss()
            out_domain_rep = outputs.last_hidden_state
            ###
            mask = out_domain_rep != 0
            out_domain_rep = (out_domain_rep * mask).sum(dim=1) / mask.sum(dim=1)
            ###
            domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0)
            #detach
            #domain_rep = domain_rep.detach()
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            pos_target = torch.tensor([1] * in_domain_rep.shape[0], device=domain_rep.device)
            neg_target = torch.tensor([0] * out_domain_rep.shape[0], device=domain_rep.device)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)
            return domain_loss, logit


        elif func == "task_binary_classifier":
            # The query rep is not included here, so the pre-computed pairwise combinations are passed in.
            loss_fct = CrossEntropyLoss()
            #detach_on
            ###
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            ###
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]
            )
            logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1))
            return task_binary_loss, logit


        elif func == "task_binary_classifier_mean":
            # The query rep is not included here, so the pre-computed pairwise combinations are passed in.
            loss_fct = CrossEntropyLoss()
            #detach
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]
            )
            logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1))
            return task_binary_loss, logit

        elif func == "task_class":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #Already including query rep
            loss_fct = CrossEntropyLoss()
            ###
            class_logit = self.classifier(outputs.last_hidden_state, input_ids_org)
            ###
            task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))
            return task_loss, class_logit

        elif func == "mlm":
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            sequence_output = outputs_mlm.last_hidden_state
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
            return masked_lm_loss


        elif func == "task_class and mlm":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #######
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #Already including query rep
            #task loss
            loss_fct = CrossEntropyLoss()
            ###
            '''
            #rep = outputs.last_hidden_state[input_ids==2]
            rep = outputs.last_hidden_state[:, 0, :]
            #rep = rep.detach()
            task_rep = self.task_layer(rep)
            class_logit = self.layer_out_taskClass((self.act(task_rep)))
            '''
            class_logit = self.classifier(outputs.last_hidden_state, input_ids_org)
            ###
            task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))

            #mlm loss
            sequence_output = outputs_mlm.last_hidden_state
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
            return task_loss, masked_lm_loss

        elif func == "gen_rep":
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            return outputs

        #mlm (fall-through when no `func` matched)
        outputs_mlm = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs_mlm[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            #loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs_mlm[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs_mlm.hidden_states,
            attentions=outputs_mlm.attentions,
        )
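
# A minimal usage sketch for the `func`-switched forward above. This is illustrative only:
# the tokenizer, the example labels, and the id of 2 for RoBERTa's </s> token are assumptions
# made for this sketch, not something this module defines; the helper name is hypothetical.
def _demo_domain_task_forward(model, tokenizer, device="cpu"):
    # Two labelled in-domain sentences; `input_ids_org` holds the un-masked token ids.
    batch = tokenizer(["a great movie", "a dull movie"], return_tensors="pt", padding=True)
    input_ids_org = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    sentence_label = torch.tensor([1, 0], device=device)

    # Sentence-level classification over the <s>/</s> representations.
    task_loss, class_logit = model(
        input_ids_org=input_ids_org,
        attention_mask=attention_mask,
        sentence_label=sentence_label,
        func="task_class",
        return_dict=True,
    )

    # Tail (</s>) and head (<s>) representations, e.g. for building a retrieval memory.
    rep_tail, rep_head = model(
        input_ids_org=input_ids_org,
        attention_mask=attention_mask,
        func="in_domain_task_rep",
        return_dict=True,
    )
    return task_loss, class_logit, rep_tail, rep_head
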


@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            #loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
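
# A short sketch of the standard masked-LM use of the class above. Note that, unlike upstream
# transformers, this variant builds its loss with ignore_index=-1, so positions that should not
# contribute to the loss are expected to carry label -1 (an assumption worth checking against
# the training code). The helper name is hypothetical.
def _demo_masked_lm_loss(model, input_ids, attention_mask, labels):
    output = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
        return_dict=True,
    )
    # output.loss averages cross-entropy over positions whose label is not -1.
    return output.loss, output.logits
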

class RobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x


@add_start_docstrings(
    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
    on top of the pooled output) e.g. for GLUE tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForSequenceClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


@add_start_docstrings(
    """Roberta Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForMultipleChoice(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


@add_start_docstrings(
    """Roberta Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForTokenClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        #x = features[input_ids==2]  # take </s> token (equiv. to the last token)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class RobertaClassificationHeadandTail(nn.Module):
    """Head for sentence-level classification over the concatenated <s> (head) and </s> (tail) representations."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size * 2)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size * 2, config.num_labels)

    def forward(self, features, input_ids, **kwargs):
        head = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        tail = features[input_ids == 2]  # take </s> token (equiv. to the last token)
        x = torch.cat((head, tail), -1)  # [batch_size, hidden_size * 2]
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x
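
# A small sketch of how the head-and-tail features above are assembled. It assumes each row of
# `input_ids` contains exactly one </s> token (RoBERTa eos id 2); with more or fewer eos tokens
# per row, the boolean indexing would no longer yield one tail vector per example. The values
# and the helper name are illustrative only.
def _demo_head_and_tail_features(hidden_size=4):
    features = torch.randn(2, 5, hidden_size)       # (batch, seq_len, hidden)
    input_ids = torch.tensor([[0, 7, 8, 2, 1],      # <s> ... </s> <pad>
                              [0, 9, 10, 11, 2]])   # <s> ...     </s>
    head = features[:, 0, :]                        # <s> vectors, shape (2, hidden)
    tail = features[input_ids == 2]                 # </s> vectors, shape (2, hidden)
    return torch.cat((head, tail), -1)              # shape (2, hidden * 2)
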

@add_start_docstrings(
    """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForQuestionAnswering(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx):
    """Replace non-padding symbols with their position numbers. Position numbers begin at
    padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
    `utils.make_positions`.

    :param torch.Tensor input_ids:
    :param int padding_idx:
    :return torch.Tensor:
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx
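
# A brief worked example of the cumulative-sum trick above (illustrative; not used by the model).
# With padding_idx = 1:
#   input_ids           = [[0, 31414, 232, 2, 1, 1]]
#   mask                = [[1, 1,     1,   1, 0, 0]]
#   cumsum(mask) * mask = [[1, 2,     3,   4, 0, 0]]
#   + padding_idx       = [[2, 3,     4,   5, 1, 1]]
# so real tokens get positions 2..5 while padding positions stay at padding_idx.
def _demo_create_position_ids():
    input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])
    return create_position_ids_from_input_ids(input_ids, padding_idx=1)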