# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch RoBERTa model. """


import logging
import warnings

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

from .configuration_roberta import RobertaConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
from .modeling_outputs import (
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "RobertaConfig"
_TOKENIZER_FOR_DOC = "RobertaTokenizer"

ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "roberta-base",
    "roberta-large",
    "roberta-large-mnli",
    "distilroberta-base",
    "roberta-base-openai-detector",
    "roberta-large-openai-detector",
    # See all RoBERTa models at https://huggingface.co/models?filter=roberta
]


class RobertaEmbeddings(BertEmbeddings):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        return super().forward(
            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
        )

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """ We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.

        :param torch.Tensor inputs_embeds:
        :return torch.Tensor:
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


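# Note on the indexing tweak above (an explanatory sketch added here, not part of the original
# CSS-LM code): because position ids start at ``padding_idx + 1`` rather than 0, RoBERTa configs
# reserve ``max_position_embeddings = 514`` to cover 512 usable positions plus the offset of 2.
#
#   >>> # assuming config.pad_token_id == 1 (the RoBERTa default) and a 4-token input
#   >>> embeddings = RobertaEmbeddings(config)
#   >>> embeddings.create_position_ids_from_inputs_embeds(torch.zeros(1, 4, config.hidden_size))
#   tensor([[2, 3, 4, 5]])

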
ROBERTA_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the
            model weights.
"""

ROBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.RobertaTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
            plain tuple.
"""


@add_start_docstrings(
    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    ROBERTA_START_DOCSTRING,
)
class RobertaModel(BertModel):
    """
    This class overrides :class:`~transformers.BertModel`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.embeddings = RobertaEmbeddings(config)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value


@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class RobertaForMaskedLMDomainTask(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)
        #config.num_labels = 8 # add
        self.classifier = RobertaClassificationHead(config)
        #self.classifier = RobertaClassificationHeadandTail(config)
        self.num_labels = config.num_labels

        self.init_weights()
        ###
        #self.domain_layer = torch.nn.Linear(768,768,bias=False)
        #self.task_layer = torch.nn.Linear(768,768,bias=False)
        #torch.nn.init.xavier_uniform_(self.domain_layer.weight)
        #self.act = nn.ReLU()
        #self.layer_out_domainClass = nn.Linear(768,2) #num_class
        #self.layer_out_taskClass = nn.Linear(768,8,bias=True) #num_class
        ###
        self.LeakyReLU = torch.nn.LeakyReLU()
        # Binary discriminators: domain classification over single sentence representations and
        # task classification over concatenated pairs of sentence representations.
        # NOTE: the original code hard-coded the hidden size to 768; using config.hidden_size
        # keeps this working for any RoBERTa size.
        self.domain_binary_classifier = nn.Linear(config.hidden_size, 2, bias=True)  # num_class
        self.task_binary_classifier = nn.Linear(config.hidden_size * 2, 2, bias=True)  # num_class
        #self.act = nn.ReLU()
        ###

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        input_ids_org=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        func=None,
        tail_idxs=None,
        in_domain_rep=None,
        out_domain_rep=None,
        sentence_label=None,
        lm_label=None,
        batch_size=None,
        all_in_task_rep_comb=None,
        all_sentence_binary_label=None,
        from_query=False,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if func == "in_domain_task_rep":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # <s> representation ("head", equiv. to [CLS]) and </s> representation ("tail", token id 2)
            rep_head = outputs.last_hidden_state[:, 0, :]
            rep_tail = outputs.last_hidden_state[input_ids_org == 2]
            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep_tail, rep_head

        elif func == "in_domain_task_rep_mean":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # Mean-pool the non-zero hidden states as a single sentence representation.
            rep = outputs.last_hidden_state
            mask = rep != 0
            rep = (rep * mask).sum(dim=1) / mask.sum(dim=1)

            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep, rep

        elif func == "return_task_binary_classifier":
            return self.task_binary_classifier.weight.data, self.task_binary_classifier.bias.data

        elif func == "return_domain_binary_classifier":
            return self.domain_binary_classifier.weight.data, self.domain_binary_classifier.bias.data

        #if func == "task_binary_classifier":

        elif func == "domain_binary_classifier":
            # in-domain: 1, out-of-domain: 0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is not included here, so the caller has to pass in_domain_rep in.
            loss_fct = CrossEntropyLoss()
            out_domain_rep_head = outputs.last_hidden_state[:, 0, :]
            out_domain_rep_tail = outputs.last_hidden_state[input_ids_org == 2]
            domain_rep = torch.cat([in_domain_rep, out_domain_rep_tail], 0)
            #detach
            #domain_rep = domain_rep.detach()
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            # NOTE: the original code built the targets with .to("cuda"); creating them on the
            # representation's device keeps this branch usable on CPU as well.
            pos_target = torch.tensor([1] * in_domain_rep.shape[0], device=in_domain_rep.device)
            neg_target = torch.tensor([0] * out_domain_rep_tail.shape[0], device=in_domain_rep.device)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)

            return domain_loss, logit, out_domain_rep_head, out_domain_rep_tail

        elif func == "domain_binary_classifier_mean":
            # in-domain: 1, out-of-domain: 0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is not included here, so the caller has to pass in_domain_rep in.
            loss_fct = CrossEntropyLoss()
            out_domain_rep = outputs.last_hidden_state
            ###
            mask = out_domain_rep != 0
            out_domain_rep = (out_domain_rep * mask).sum(dim=1) / mask.sum(dim=1)
            ###
            domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0)
            #detach
            #domain_rep = domain_rep.detach()
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            pos_target = torch.tensor([1] * in_domain_rep.shape[0], device=in_domain_rep.device)
            neg_target = torch.tensor([0] * out_domain_rep.shape[0], device=in_domain_rep.device)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)
            return domain_loss, logit

        elif func == "task_binary_classifier":
            # The query rep is not included here; the caller builds all_in_task_rep_comb beforehand.
            loss_fct = CrossEntropyLoss()
            #detach
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]
            )
            logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1))
            return task_binary_loss, logit

        elif func == "task_binary_classifier_mean":
            # The query rep is not included here; the caller builds all_in_task_rep_comb beforehand.
            loss_fct = CrossEntropyLoss()
            #detach
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]
            )
            logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1))
            return task_binary_loss, logit

        elif func == "task_class":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is already included in this batch.
            loss_fct = CrossEntropyLoss()
            ###
            #class_logit = self.classifier(outputs.last_hidden_state, input_ids_org)
            class_logit = self.classifier(outputs.last_hidden_state)
            task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))

            if from_query:
                query_rep_head = outputs.last_hidden_state[:, 0, :]
                query_rep_tail = outputs.last_hidden_state[input_ids_org == 2]
                return task_loss, class_logit, query_rep_head, query_rep_tail
            else:
                return task_loss, class_logit

        elif func == "mlm":
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            #sequence_output = outputs_mlm.last_hidden_state
            sequence_output = outputs_mlm[0]
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
            return masked_lm_loss

        elif func == "task_class and mlm":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #######
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # The query rep is already included in this batch.
            # task loss
            loss_fct = CrossEntropyLoss()
            ###
            '''
            #rep = outputs.last_hidden_state[input_ids==2]
            rep = outputs.last_hidden_state[:, 0, :]
            #rep = rep.detach()
            task_rep = self.task_layer(rep)
            class_logit = self.layer_out_taskClass((self.act(task_rep)))
            '''
            class_logit = self.classifier(outputs.last_hidden_state)
            ###
            # NOTE: the original code hard-coded 8 classes here; self.num_labels keeps this
            # consistent with the config, as in the "task_class" branch above.
            task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))

            # mlm loss
            sequence_output = outputs_mlm.last_hidden_state
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
            return task_loss, masked_lm_loss

        elif func == "gen_rep":
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            return outputs

        '''
        #mlm
        outputs_mlm = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs_mlm[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            #loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        '''


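# Hedged usage sketch for RobertaForMaskedLMDomainTask (added for clarity; the names `batch_ids`,
# `batch_mask`, `masked_ids`, `mlm_labels` and `labels` are assumptions, not part of the original
# training scripts). The `func` argument selects which CSS-LM objective a forward pass computes:
#
#   >>> model = RobertaForMaskedLMDomainTask.from_pretrained("roberta-base")
#   >>> # contrastive/domain representations of labelled in-domain sentences (</s> and <s> reps)
#   >>> rep_tail, rep_head = model(input_ids_org=batch_ids, attention_mask=batch_mask,
#   ...                            func="in_domain_task_rep")
#   >>> # masked-LM loss on retrieved (possibly out-of-domain) sentences
#   >>> mlm_loss = model(input_ids=masked_ids, attention_mask=batch_mask,
#   ...                  lm_label=mlm_labels, func="mlm")
#   >>> # task classification loss on labelled sentences
#   >>> task_loss, logits = model(input_ids_org=batch_ids, attention_mask=batch_mask,
#   ...                           sentence_label=labels, func="task_class")

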
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            #loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class RobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x


@add_start_docstrings(
    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
    on top of the pooled output) e.g. for GLUE tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForSequenceClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        #config.num_labels = 8 # add (can remove)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        #self.classifier = RobertaClassificationHeadandTail(config)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        #####
        #return outputs
        #####

        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


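# Minimal fine-tuning sketch for RobertaForSequenceClassification (illustrative only; assumes a
# GLUE-style list of `texts` and a LongTensor of `labels` exist, and that RobertaTokenizer is
# importable from this package as in the upstream transformers layout).
#
#   >>> from .tokenization_roberta import RobertaTokenizer
#   >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   >>> model = RobertaForSequenceClassification.from_pretrained("roberta-base")
#   >>> inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
#   >>> outputs = model(**inputs, labels=labels, return_dict=True)
#   >>> outputs.loss.backward()

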
@add_start_docstrings(
    """Roberta Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForMultipleChoice(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


@add_start_docstrings(
    """Roberta Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForTokenClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        #x = features[input_ids==2]  # take </s> token (equiv. to the last token)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class RobertaClassificationHeadandTail(nn.Module):
    """Head for sentence-level classification tasks, using both the <s> and </s> token representations."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size * 2)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size * 2, config.num_labels)
        self.num_labels = config.num_labels

    def forward(self, features, input_ids, **kwargs):
        head = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        tail = features[input_ids == 2]  # take </s> token (equiv. to the last token)
        x = torch.cat((head, tail), -1)  # (batch_size, hidden_size * 2)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

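# How the head-and-tail variant is meant to be called (a sketch added for clarity, assuming
# RoBERTa's </s> token id of 2): unlike RobertaClassificationHead, it needs the input ids so it
# can locate the </s> token, and it expects exactly one </s> per sequence so that the head and
# tail tensors line up for concatenation.
#
#   >>> head = RobertaClassificationHeadandTail(config)
#   >>> logits = head(last_hidden_state, input_ids)  # last_hidden_state: (batch, seq_len, hidden)
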
@add_start_docstrings(
    """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForQuestionAnswering(BertPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="roberta-base",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx):
    """ Replace non-padding symbols with their position numbers. Position numbers begin at
    padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
    `utils.make_positions`.

    :param torch.Tensor input_ids:
    :return torch.Tensor:
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx
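
# Worked example for create_position_ids_from_input_ids (added for illustration; assumes RoBERTa's
# pad token id of 1 and the usual "Hello world" encoding):
#
#   >>> input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])  # <s> Hello world </s> <pad> <pad>
#   >>> create_position_ids_from_input_ids(input_ids, padding_idx=1)
#   tensor([[2, 3, 4, 5, 1, 1]])  # non-pad tokens count up from padding_idx + 1; pads stay at padding_idx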