# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch RoBERTa model. """


import logging
import warnings

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

from .configuration_roberta import RobertaConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
from .modeling_outputs import (
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "RobertaConfig"
_TOKENIZER_FOR_DOC = "RobertaTokenizer"

ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "roberta-base",
    "roberta-large",
    "roberta-large-mnli",
    "distilroberta-base",
    "roberta-base-openai-detector",
    "roberta-large-openai-detector",
    # See all RoBERTa models at https://huggingface.co/models?filter=roberta
]


class RobertaEmbeddings(BertEmbeddings):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        return super().forward(
            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
        )

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.

        :param torch.Tensor inputs_embeds:
        :return torch.Tensor:
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)

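
# A brief illustrative sketch (not used by the model) of the sequential position ids produced
# by create_position_ids_from_inputs_embeds above: assuming RoBERTa's pad_token_id of 1 as the
# padding_idx, positions for a length-T input run from 2 to T + 1, since index 1 is reserved
# for padding. The helper name below is hypothetical.
def _demo_position_ids_from_inputs_embeds(seq_len=5, padding_idx=1):
    # e.g. seq_len=5 -> tensor([2, 3, 4, 5, 6])
    return torch.arange(padding_idx + 1, seq_len + padding_idx + 1, dtype=torch.long)
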

ROBERTA_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the
            model weights.
"""

ROBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.RobertaTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence token in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under
            returned tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned
            tensors for more detail.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
            plain tuple.
"""


@add_start_docstrings(
    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    ROBERTA_START_DOCSTRING,
)
class RobertaModel(BertModel):
    """
    This class overrides :class:`~transformers.BertModel`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.embeddings = RobertaEmbeddings(config)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value


@add_start_docstrings(
    """RoBERTa Model with a `language modeling` head plus the CSS-LM task and domain classification heads on top. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForMaskedLMDomainTask(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)
        #self.classifier = RobertaClassificationHead(config)
        self.classifier = RobertaClassificationHeadandTail(config)
        self.num_labels = config.num_labels

        self.init_weights()
        ###
        #self.domain_layer = torch.nn.Linear(768,768,bias=False)
        #self.task_layer = torch.nn.Linear(768,768,bias=False)
        #torch.nn.init.xavier_uniform_(self.domain_layer.weight)
        #self.act = nn.ReLU()
        #self.layer_out_domainClass = nn.Linear(768,2) #num_class
        #self.layer_out_taskClass = nn.Linear(768,8,bias=True) #num_class
        ###
        self.LeakyReLU = torch.nn.LeakyReLU()
        self.domain_binary_classifier = nn.Linear(768, 2, bias=True)  # in-domain (1) vs. out-of-domain (0)
        self.task_binary_classifier = nn.Linear(768 * 2, 2, bias=True)  # binary decision over concatenated sentence-pair reps
        #self.act = nn.ReLU()
        ###

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        input_ids_org=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        func=None,
        tail_idxs=None,
        in_domain_rep=None,
        out_domain_rep=None,
        sentence_label=None,
        lm_label=None,
        batch_size=None,
        all_in_task_rep_comb=None,
        all_sentence_binary_label=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if func == "in_domain_task_rep":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            rep_head = outputs.last_hidden_state[:, 0, :]  # take <s> token (equiv. to [CLS])
            rep_tail = outputs.last_hidden_state[input_ids_org == 2]  # take </s> token (RoBERTa eos id 2)
            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep_tail, rep_head

        elif func == "in_domain_task_rep_mean":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # Mean-pool over the sequence, ignoring entries whose hidden values are exactly zero.
            rep = outputs.last_hidden_state
            mask = rep != 0
            rep = (rep * mask).sum(dim=1) / mask.sum(dim=1)

            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep, rep

        elif func == "return_task_binary_classifier":
            return self.task_binary_classifier.weight.data, self.task_binary_classifier.bias.data

        elif func == "return_domain_binary_classifier":
            return self.domain_binary_classifier.weight.data, self.domain_binary_classifier.bias.data

        #if func == "task_binary_classifier":

        elif func == "domain_binary_classifier":
            # in-domain: 1, out-of-domain: 0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is not included here, so the pre-computed in_domain_rep has to be passed in.
            loss_fct = CrossEntropyLoss()
            #out_domain_rep = outputs.last_hidden_state[:, 0, :]
            out_domain_rep = outputs.last_hidden_state[input_ids_org == 2]  # </s> token (RoBERTa eos id 2)
            domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0)
            #detach_on
            ###
            #domain_rep = domain_rep.detach()
            ###
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            pos_target = torch.tensor([1] * in_domain_rep.shape[0], device=domain_rep.device)
            neg_target = torch.tensor([0] * out_domain_rep.shape[0], device=domain_rep.device)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)
            return domain_loss, logit


        elif func == "domain_binary_classifier_mean":
            # in-domain: 1, out-of-domain: 0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is not included here, so the pre-computed in_domain_rep has to be passed in.
            loss_fct = CrossEntropyLoss()
            out_domain_rep = outputs.last_hidden_state
            ###
            mask = out_domain_rep != 0
            out_domain_rep = (out_domain_rep * mask).sum(dim=1) / mask.sum(dim=1)
            ###
            domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0)
            #detach
            #domain_rep = domain_rep.detach()
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            pos_target = torch.tensor([1] * in_domain_rep.shape[0], device=domain_rep.device)
            neg_target = torch.tensor([0] * out_domain_rep.shape[0], device=domain_rep.device)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)
            return domain_loss, logit


        elif func == "task_binary_classifier":
            # The query rep is not included here, so the pre-computed pairwise combinations are passed in.
            loss_fct = CrossEntropyLoss()
            #detach_on
            ###
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            ###
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]
            )
            logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1))
            return task_binary_loss, logit


        elif func == "task_binary_classifier_mean":
            # The query rep is not included here, so the pre-computed pairwise combinations are passed in.
            loss_fct = CrossEntropyLoss()
            #detach
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]
            )
            logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1))
            return task_binary_loss, logit

        elif func == "task_class":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #Already including query rep
            loss_fct = CrossEntropyLoss()
            ###
            class_logit = self.classifier(outputs.last_hidden_state, input_ids_org)
            ###
            task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))
            return task_loss, class_logit

        elif func == "mlm":
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            sequence_output = outputs_mlm.last_hidden_state
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
            return masked_lm_loss


        elif func == "task_class and mlm":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #######
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #Already including query rep
            #task loss
            loss_fct = CrossEntropyLoss()
            ###
            '''
            #rep = outputs.last_hidden_state[input_ids==2]
            rep = outputs.last_hidden_state[:, 0, :]
            #rep = rep.detach()
            task_rep = self.task_layer(rep)
            class_logit = self.layer_out_taskClass((self.act(task_rep)))
            '''
            class_logit = self.classifier(outputs.last_hidden_state, input_ids_org)
            ###
            task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))

            #mlm loss
            sequence_output = outputs_mlm.last_hidden_state
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
            return task_loss, masked_lm_loss

        elif func == "gen_rep":
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            return outputs

        #mlm (fall-through when no `func` matched)
        outputs_mlm = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs_mlm[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            #loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs_mlm[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs_mlm.hidden_states,
            attentions=outputs_mlm.attentions,
        )
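
# A minimal usage sketch for the `func`-switched forward above. This is illustrative only:
# the tokenizer, the example labels, and the id of 2 for RoBERTa's </s> token are assumptions
# made for this sketch, not something this module defines; the helper name is hypothetical.
def _demo_domain_task_forward(model, tokenizer, device="cpu"):
    # Two labelled in-domain sentences; `input_ids_org` holds the un-masked token ids.
    batch = tokenizer(["a great movie", "a dull movie"], return_tensors="pt", padding=True)
    input_ids_org = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    sentence_label = torch.tensor([1, 0], device=device)

    # Sentence-level classification over the <s>/</s> representations.
    task_loss, class_logit = model(
        input_ids_org=input_ids_org,
        attention_mask=attention_mask,
        sentence_label=sentence_label,
        func="task_class",
        return_dict=True,
    )

    # Tail (</s>) and head (<s>) representations, e.g. for building a retrieval memory.
    rep_tail, rep_head = model(
        input_ids_org=input_ids_org,
        attention_mask=attention_mask,
        func="in_domain_task_rep",
        return_dict=True,
    )
    return task_loss, class_logit, rep_tail, rep_head
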


@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            #loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
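
# A short sketch of the standard masked-LM use of the class above. Note that, unlike upstream
# transformers, this variant builds its loss with ignore_index=-1, so positions that should not
# contribute to the loss are expected to carry label -1 (an assumption worth checking against
# the training code). The helper name is hypothetical.
def _demo_masked_lm_loss(model, input_ids, attention_mask, labels):
    output = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
        return_dict=True,
    )
    # output.loss averages cross-entropy over positions whose label is not -1.
    return output.loss, output.logits
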

class RobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x


@add_start_docstrings(
    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
    on top of the pooled output) e.g. for GLUE tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForSequenceClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


@add_start_docstrings(
    """Roberta Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForMultipleChoice(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


@add_start_docstrings(
    """Roberta Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForTokenClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        #x = features[input_ids==2]  # take </s> token (equiv. to the last token)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class RobertaClassificationHeadandTail(nn.Module):
    """Head for sentence-level classification over the concatenated <s> (head) and </s> (tail) representations."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size * 2)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size * 2, config.num_labels)

    def forward(self, features, input_ids, **kwargs):
        head = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        tail = features[input_ids == 2]  # take </s> token (equiv. to the last token)
        x = torch.cat((head, tail), -1)  # [batch_size, hidden_size * 2]
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x
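
# A small sketch of how the head-and-tail features above are assembled. It assumes each row of
# `input_ids` contains exactly one </s> token (RoBERTa eos id 2); with more or fewer eos tokens
# per row, the boolean indexing would no longer yield one tail vector per example. The values
# and the helper name are illustrative only.
def _demo_head_and_tail_features(hidden_size=4):
    features = torch.randn(2, 5, hidden_size)       # (batch, seq_len, hidden)
    input_ids = torch.tensor([[0, 7, 8, 2, 1],      # <s> ... </s> <pad>
                              [0, 9, 10, 11, 2]])   # <s> ...     </s>
    head = features[:, 0, :]                        # <s> vectors, shape (2, hidden)
    tail = features[input_ids == 2]                 # </s> vectors, shape (2, hidden)
    return torch.cat((head, tail), -1)              # shape (2, hidden * 2)
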

@add_start_docstrings(
    """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForQuestionAnswering(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx):
    """Replace non-padding symbols with their position numbers. Position numbers begin at
    padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
    `utils.make_positions`.

    :param torch.Tensor input_ids:
    :param int padding_idx:
    :return torch.Tensor:
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx
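
# A brief worked example of the cumulative-sum trick above (illustrative; not used by the model).
# With padding_idx = 1:
#   input_ids           = [[0, 31414, 232, 2, 1, 1]]
#   mask                = [[1, 1,     1,   1, 0, 0]]
#   cumsum(mask) * mask = [[1, 2,     3,   4, 0, 0]]
#   + padding_idx       = [[2, 3,     4,   5, 1, 1]]
# so real tokens get positions 2..5 while padding positions stay at padding_idx.
def _demo_create_position_ids():
    input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])
    return create_position_ids_from_input_ids(input_ids, padding_idx=1)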