# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model. """


import logging
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

from .activations import gelu, gelu_new, swish
from .configuration_bert import BertConfig
from .file_utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_callable,
    replace_return_docstrings,
)
from .modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    CausalLMOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "BertConfig"
_TOKENIZER_FOR_DOC = "BertTokenizer"

BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "bert-base-uncased",
    "bert-large-uncased",
    "bert-base-cased",
    "bert-large-cased",
    "bert-base-multilingual-uncased",
    "bert-base-multilingual-cased",
    "bert-base-chinese",
    "bert-base-german-cased",
    "bert-large-uncased-whole-word-masking",
    "bert-large-cased-whole-word-masking",
    "bert-large-uncased-whole-word-masking-finetuned-squad",
    "bert-large-cased-whole-word-masking-finetuned-squad",
    "bert-base-cased-finetuned-mrpc",
    "bert-base-german-dbmdz-cased",
    "bert-base-german-dbmdz-uncased",
    "cl-tohoku/bert-base-japanese",
    "cl-tohoku/bert-base-japanese-whole-word-masking",
    "cl-tohoku/bert-base-japanese-char",
    "cl-tohoku/bert-base-japanese-char-whole-word-masking",
    "TurkuNLP/bert-base-finnish-cased-v1",
    "TurkuNLP/bert-base-finnish-uncased-v1",
    "wietsedv/bert-base-dutch-cased",
    # See all BERT models at https://huggingface.co/models?filter=bert
]

def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model

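# Illustrative sketch of how `load_tf_weights_in_bert` is typically driven (the file
# paths below are placeholders, not shipped with this module):
#
#     config = BertConfig.from_json_file("/path/to/bert_config.json")
#     model = BertForPreTraining(config)
#     load_tf_weights_in_bert(model, config, "/path/to/bert_model.ckpt")
#     torch.save(model.state_dict(), "/path/to/pytorch_model.bin")
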
def mish(x):
    return x * torch.tanh(nn.functional.softplus(x))


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish}


BertLayerNorm = torch.nn.LayerNorm

class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

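# A minimal sketch of what BertEmbeddings produces (values are placeholders; assumes a
# default BertConfig with hidden_size=768):
#
#     config = BertConfig()
#     emb = BertEmbeddings(config)
#     input_ids = torch.tensor([[101, 7592, 102]])       # (batch_size=1, seq_len=3)
#     out = emb(input_ids=input_ids)                     # word + position + token_type, then LayerNorm + dropout
#     assert out.shape == (1, 3, config.hidden_size)     # (1, 3, 768)
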
class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
    ):
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        if encoder_hidden_states is not None:
            mixed_key_layer = self.key(encoder_hidden_states)
            mixed_value_layer = self.value(encoder_hidden_states)
            attention_mask = encoder_attention_mask
        else:
            mixed_key_layer = self.key(hidden_states)
            mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs

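# Shape walkthrough for the self-attention above, assuming the default 12 heads and
# hidden_size=768 (head_size = 768 / 12 = 64); tensor names mirror the code:
#
#     hidden_states:    (batch, seq_len, 768)
#     query/key/value:  (batch, seq_len, 768) -> transpose_for_scores -> (batch, 12, seq_len, 64)
#     attention_scores: (batch, 12, seq_len, seq_len) = Q @ K^T / sqrt(64)
#     context_layer:    (batch, 12, seq_len, 64) -> permute + view -> (batch, seq_len, 768)
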
class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

class BertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
    ):
        self_outputs = self.self(
            hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs

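# Illustrative use of head pruning at the model level (layer/head indices are arbitrary
# examples); BertModel._prune_heads further below dispatches to BertAttention.prune_heads:
#
#     model = BertModel(BertConfig())
#     model.prune_heads({0: [0, 2], 5: [7]})   # drop heads 0 and 2 of layer 0, head 7 of layer 5
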
class BertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states

class BertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

class BertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.is_decoder = config.is_decoder
        if self.is_decoder:
            self.crossattention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
    ):
        self_attention_outputs = self.attention(
            hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if self.is_decoder and encoder_hidden_states is not None:
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights

        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        outputs = (layer_output,) + outputs
        return outputs

class BertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=False,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if getattr(self.config, "gradient_checkpointing", False):

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    head_mask[i],
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    head_mask[i],
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions,
                )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )

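# Gradient checkpointing above is toggled purely through the config attribute read by
# getattr; a minimal sketch of enabling it (trades recomputation for activation memory):
#
#     config = BertConfig()
#     config.gradient_checkpointing = True
#     encoder = BertEncoder(config)
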
class BertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states

class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states

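# The `decoder` above is what get_output_embeddings() returns in the *ForMaskedLM /
# *ForPreTraining classes below; PreTrainedModel ties it to the input word embeddings
# (when config.tie_word_embeddings is left at its default), so the output projection
# reuses the embedding matrix. A rough sketch of the effect (illustrative, not executed here):
#
#     model = BertForMaskedLM(BertConfig())
#     assert model.get_output_embeddings().weight.data_ptr() == \
#         model.get_input_embeddings().weight.data_ptr()
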
class BertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score

class BertPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score

class BertPreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    config_class = BertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"
    authorized_missing_keys = [r"position_ids"]

    def _init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

@dataclass
class BertForPretrainingOutput(ModelOutput):
    """
    Output type of :class:`~transformers.BertForPreTraining`.

    Args:
        loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False
            continuation before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: torch.FloatTensor = None
    seq_relationship_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

BERT_START_DOCSTRING = r"""
    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

BERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence token in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            if the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
            is used in the cross-attention if the model is configured as a decoder.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
            plain tuple.
"""

@add_start_docstrings(
    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
    BERT_START_DOCSTRING,
)
class BertModel(BertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well
    as a decoder, in which case a layer of cross-attention is added between
    the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
    Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the
    :obj:`is_decoder` argument of the configuration set to :obj:`True`; an
    :obj:`encoder_hidden_states` is then expected as an input to the forward pass.

    .. _`Attention is all you need`:
        https://arxiv.org/abs/1706.03762

    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)

        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="bert-base-uncased",
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

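# Minimal end-to-end sketch for the bare BertModel (checkpoint name mirrors the one used
# in the docstring decorators above):
#
#     from transformers import BertTokenizer
#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     model = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
#     inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#     outputs = model(**inputs)
#     sequence_output = outputs.last_hidden_state   # (1, seq_len, 768)
#     pooled_output = outputs.pooler_output         # (1, 768)
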
@add_start_docstrings(
    """Domain-Task Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
    a `next sentence prediction (classification)` head. """,
    BERT_START_DOCSTRING,
)
class BertForPreTrainingDomainTask(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)
        ###
        self.in_domain_layer = torch.nn.Linear(768, 768, bias=False)
        torch.nn.init.xavier_uniform_(self.in_domain_layer.weight)
        self.out_domain_layer = torch.nn.Linear(768, 768, bias=False)
        torch.nn.init.xavier_uniform_(self.out_domain_layer.weight)
        self.act = nn.ReLU()
        self.layer_out = nn.Linear(768, 2)  # num_class
        ###
        self.init_weights()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @replace_return_docstrings(output_type=BertForPretrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        next_sentence_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        tail_idxs=None,
        # in_domain_rep_batch=None,
        in_domain_rep=None,
        out_domain_rep=None,
        func=None,
        **kwargs
    ):
        r"""
        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with
            labels in ``[0, ..., config.vocab_size]``.

        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see :obj:`input_ids` docstring). Indices should be in ``[0, 1]``.
            ``0`` indicates sequence B is a continuation of sequence A,
            ``1`` indicates sequence B is a random sequence.

        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

        Examples::

            >>> from transformers import BertTokenizer, BertForPreTraining
            >>> import torch

            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            >>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> prediction_logits = outputs.prediction_logits
            >>> seq_relationship_logits = outputs.seq_relationship_logits
        """

        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if func == "in_domain_rep":
            # return outputs.hidden_states[0]
            '''
            in_domain_rep = list()
            for id, idx in enumerate(tail_idxs):
                in_domain_rep.append(outputs.hidden_states[0][id, idx, :])
            in_domain_rep = torch.stack(in_domain_rep)
            in_domain_rep = self.in_domain_layer(in_domain_rep)
            '''
            in_domain_rep = self.in_domain_layer(outputs.hidden_states[0][:, 0, :])
            return in_domain_rep

        elif func == "domain_class":
            # in_domain_rep = in_domain_rep_batch.squeeze(0)
            # print(in_domain_rep.shape)
            # exit()
            loss_fct = CrossEntropyLoss()
            '''
            out_domain_rep = list()
            for id, idx in enumerate(tail_idxs):
                out_domain_rep.append(outputs.hidden_states[0][id, idx, :])
            out_domain_rep = torch.stack(out_domain_rep)
            out_domain_rep = self.out_domain_layer(out_domain_rep)
            '''
            out_domain_rep = self.out_domain_layer(outputs.hidden_states[0][:, 0, :])
            pos_rep = self.layer_out(self.act(in_domain_rep))
            pos_target = torch.tensor([1] * pos_rep.shape[0]).to("cuda")
            neg_rep = self.layer_out(self.act(out_domain_rep))
            neg_target = torch.tensor([0] * neg_rep.shape[0]).to("cuda")
            rep = torch.cat([pos_rep, neg_rep], 0)
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(rep, target)
            return domain_loss
        else:
            pass

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
        elif labels is not None:
            # loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            total_loss = masked_lm_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return BertForPretrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

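# Rough sketch of how the `func` switch above is exercised during CSS-LM style training
# (tensor names are placeholders; `batch` stands for an already-tokenized batch on GPU):
#
#     model = BertForPreTrainingDomainTask.from_pretrained("bert-base-uncased")
#     in_rep = model(input_ids=batch["in_domain_ids"], output_hidden_states=True,
#                    return_dict=True, func="in_domain_rep")
#     loss = model(input_ids=batch["open_domain_ids"], output_hidden_states=True,
#                  return_dict=True, in_domain_rep=in_rep, func="domain_class")
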
@add_start_docstrings(
    """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
    a `next sentence prediction (classification)` head. """,
    BERT_START_DOCSTRING,
)
class BertForPreTraining(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @replace_return_docstrings(output_type=BertForPretrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        next_sentence_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with
            labels in ``[0, ..., config.vocab_size]``.

        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see :obj:`input_ids` docstring). Indices should be in ``[0, 1]``.
            ``0`` indicates sequence B is a continuation of sequence A,
            ``1`` indicates sequence B is a random sequence.

        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

        Examples::

            >>> from transformers import BertTokenizer, BertForPreTraining
            >>> import torch

            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            >>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> prediction_logits = outputs.prediction_logits
            >>> seq_relationship_logits = outputs.seq_relationship_logits
        """

        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
        elif labels is not None:
            # loss_fct = CrossEntropyLoss()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            total_loss = masked_lm_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return BertForPretrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@add_start_docstrings(
    """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, BERT_START_DOCSTRING
)
class BertLMHeadModel(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        assert config.is_decoder, "If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True`."

        self.bert = BertModel(config)
        self.cls = BertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @replace_return_docstrings(output_type=CausalLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the left-to-right language modeling loss (next word prediction).
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with
            labels in ``[0, ..., config.vocab_size]``.
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

        Example::

            >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
            >>> import torch

            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            >>> config = BertConfig.from_pretrained("bert-base-cased")
            >>> config.is_decoder = True
            >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> prediction_logits = outputs.logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutput(
            loss=lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape

        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class BertForMaskedLM(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        assert (
            not config.is_decoder
        ), "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention."

        self.bert = BertModel(config)
        self.cls = BertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="bert-base-uncased",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        """
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary (see the token preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`).
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            type indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked); the loss
            is only computed for the labels set in [0, ..., vocab_size].
        `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
            with indices selected in [0, 1].
            0 => next sentence is the continuation, 1 => next sentence is a random sentence.
        """

        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with
            labels in ``[0, ..., config.vocab_size]``.
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert "lm_labels" not in kwargs, "Use `BertWithLMHead` for autoregressive language modeling task."
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # add a dummy token
        assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

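# Illustrative masked-LM round trip with the head above (mask position handling and the
# decoded token are examples, not guaranteed outputs):
#
#     from transformers import BertTokenizer
#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     model = BertForMaskedLM.from_pretrained("bert-base-uncased", return_dict=True)
#     inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
#     logits = model(**inputs).logits                        # (1, seq_len, vocab_size)
#     mask_pos = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero()[0, 1]
#     predicted_id = logits[0, mask_pos].argmax(-1).item()
#     print(tokenizer.decode([predicted_id]))                # e.g. "paris"
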
class BertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        # x = features[input_ids == 2]  # take </s> token (equiv. to the last token)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class BertClassificationTail(nn.Module):
    """Tail ([SEP]-token) head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, input_ids, **kwargs):
        # x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        # x = features[input_ids == 2]  # take </s> token (equiv. to the last token)
        x = features[input_ids == 102]  # take the [SEP] token (id 102, equiv. to the last token)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class BertClassificationHeadandTail(nn.Module):
    """Combined head ([CLS]) and tail ([SEP]) head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size * 2)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size * 2, config.num_labels)
        self.num_labels = config.num_labels

    def forward(self, features, input_ids, **kwargs):
        head = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        # tail = features[input_ids == 2]  # take </s> token (equiv. to the last token)
        tail = features[input_ids == 102]  # take the [SEP] token (id 102, equiv. to the last token)
        x = torch.cat((head, tail), -1)  # [batch_size, 768 * 2]
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

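# Shape sketch for the head-and-tail classifier above (assumes hidden_size=768 and exactly
# one [SEP] (id 102) per sequence, so `head` and `tail` are both (batch_size, 768)):
#
#     features = torch.randn(4, 128, 768)                     # encoder output
#     input_ids = torch.zeros(4, 128, dtype=torch.long)
#     input_ids[:, -1] = 102
#     clf = BertClassificationHeadandTail(BertConfig(num_labels=2))
#     logits = clf(features, input_ids)                        # (4, 2) after the 768*2 concat
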
1432@add_start_docstrings("""BERT Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
1433class BertForMaskedLMDomainTask(BertPreTrainedModel):
1434#config_class = RobertaConfig
1435#base_model_prefix = "roberta"
1436
1437def __init__(self, config):
1438super().__init__(config)
1439
1440self.bert = BertModel(config)
1441self.cls = BertOnlyMLMHead(config)
1442self.classifier_Task = BertClassificationHead(config)
1443self.classifier_Domain = BertClassificationTail(config)
1444self.classifier_DomainandTask = BertClassificationHeadandTail(config)
1445self.num_labels = config.num_labels
1446
1447self.init_weights()
1448self.LeakyReLU = torch.nn.LeakyReLU()
1449self.domain_binary_classifier = nn.Linear(768*2,2,bias=True) #num_class
1450self.task_binary_classifier = nn.Linear(768*2,2,bias=True) #num_class
1451self.domain_task_binary_classifier = nn.Linear(768*4,2,bias=True) #num_class
1452
1453def get_output_embeddings(self):
1454return self.cls.predictions.decoder
1455
1456@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
1457@add_code_sample_docstrings(
1458tokenizer_class=_TOKENIZER_FOR_DOC,
1459checkpoint="bert-base-uncased",
1460output_type=MaskedLMOutput,
1461config_class=_CONFIG_FOR_DOC,
1462)
1463
1464def forward(
1465self,
1466input_ids=None,
1467input_ids_org=None,
1468attention_mask=None,
1469token_type_ids=None,
1470position_ids=None,
1471head_mask=None,
1472inputs_embeds=None,
1473labels=None,
1474output_attentions=None,
1475output_hidden_states=None,
1476return_dict=None,
1477func=None,
1478tail_idxs=None,
1479in_domain_rep=None,
1480out_domain_rep=None,
1481sentence_label=None,
1482lm_label=None,
1483batch_size=None,
1484all_in_task_rep_comb=None,
1485all_sentence_binary_label=None,
1486from_query=False,
1487task_loss_org=None,
1488task_loss_cotrain=None,
1489domain_id=None,
1490use_detach=False,
1491**kwargs
1492):
1493r"""
1494labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
1495Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``.
1499kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
1500Used to hide legacy arguments that have been deprecated.
1501"""
1502if "masked_lm_labels" in kwargs:
1503warnings.warn(
1504"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
1505FutureWarning,
1506)
1507labels = kwargs.pop("masked_lm_labels")
1508assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
1509return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1510
1511
1512if func == "in_domain_task_rep":
1513#######
1514outputs = self.bert(
1515input_ids=input_ids_org,
1516attention_mask=attention_mask,
1517token_type_ids=token_type_ids,
1518position_ids=position_ids,
1519head_mask=head_mask,
1520inputs_embeds=inputs_embeds,
1521output_attentions=output_attentions,
1522output_hidden_states=output_hidden_states,
1523return_dict=return_dict,
1524)
1525#######
rep_head = outputs.last_hidden_state[:, 0, :]              # [CLS] representation
#rep_tail = outputs.last_hidden_state[input_ids_org==2]    # RoBERTa variant: </s> token (id 2)
rep_tail = outputs.last_hidden_state[input_ids_org==102]   # [SEP] (id 102) representation
if rep_tail.shape[0] != input_ids_org.shape[0]:
# Some sequence contains no [SEP] (id 102); fall back to the indices supplied in tail_idxs.
logger.warning("No [SEP] (id 102) token found in some sequences; falling back to tail_idxs.")
rep_tail = outputs.last_hidden_state[input_ids_org==tail_idxs]
1540
1541
1542
1543#detach
1544#rep = rep.detach()
1545'''
1546in_domain_rep = self.domain_layer(rep)
1547in_task_rep = self.task_layer(rep)
1548return in_domain_rep, in_task_rep
1549'''
1550return rep_tail, rep_head
1551
1552
1553elif func == "return_task_binary_classifier":
1554return self.task_binary_classifier.weight.data, self.task_binary_classifier.bias.data
1555
1556elif func == "return_domain_binary_classifier":
1557return self.domain_binary_classifier.weight.data, self.domain_binary_classifier.bias.data
1558
1559elif func == "return_domain_task_binary_classifier":
1560return self.domain_task_binary_classifier.weight.data, self.domain_task_binary_classifier.bias.data
1561
1562#if func == "task_binary_classifier":
1563
1564elif func == "domain_binary_classifier":
# in-domain examples get label 1; out-of-domain examples use the labels carried in domain_id
1566#Need to fix
1567#######
1568
1569loss_fct = CrossEntropyLoss()
1570domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0)
1571if use_detach==True:
1572domain_rep = domain_rep.detach()
1573logit = self.domain_binary_classifier(domain_rep)
pos_target = torch.tensor([1]*in_domain_rep.shape[0]).to("cuda")
unknown_target = domain_id.to("cuda")
target = torch.cat([pos_target, unknown_target], 0)
1577domain_loss = loss_fct(logit, target)
1578
1579
1580#return domain_loss, logit, out_domain_rep_head, out_domain_rep_tail
1581return domain_loss, logit
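# Shape sketch for the branch above (assumption: in_domain_rep / out_domain_rep are
# (N_in, 2*hidden) and (N_out, 2*hidden) representations built by the caller):
#   domain_rep: (N_in + N_out, 2*hidden)  ->  logit: (N_in + N_out, 2)
#   target:     N_in ones (in-domain) followed by the labels carried in domain_id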
1582
1583
1584elif func == "task_binary_classifier":
# The query rep is not included yet, so in_domain_rep needs to be added here
1586loss_fct = CrossEntropyLoss()
1587#detach
1588#all_in_task_rep_comb = all_in_task_rep_comb.detach()
1589if use_detach==True:
1590all_in_task_rep_comb = all_in_task_rep_comb.detach()
1591logit = self.task_binary_classifier(all_in_task_rep_comb)
1592#logit = self.LeakyReLU(logit)
1593all_sentence_binary_label = all_sentence_binary_label.reshape(all_sentence_binary_label.shape[0]*all_sentence_binary_label.shape[1])
1594logit = logit.reshape(logit.shape[0]*logit.shape[1],logit.shape[2])
1595task_binary_loss = loss_fct(logit.view(-1,2), all_sentence_binary_label.view(-1))
1596return task_binary_loss, logit
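# Shape sketch for the branch above (assumption: all_in_task_rep_comb is
# (batch, n_pairs, 2*hidden) and all_sentence_binary_label is (batch, n_pairs)):
#   logit:  (batch, n_pairs, 2)  ->  flattened to (batch * n_pairs, 2)
#   labels: flattened to (batch * n_pairs,) before the cross-entropy loss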
1597
1598
1599elif func == "domain_task_binary_classifier":
# The query rep is not included yet, so in_domain_rep needs to be added here
1601loss_fct = CrossEntropyLoss()
1602#detach
1603#all_in_task_rep_comb = all_in_task_rep_comb.detach()
1604logit = self.domain_task_binary_classifier(all_in_task_rep_comb)
1605#logit = self.LeakyReLU(logit)
1606all_sentence_binary_label = all_sentence_binary_label.reshape(all_sentence_binary_label.shape[0]*all_sentence_binary_label.shape[1])
1607logit = logit.reshape(logit.shape[0]*logit.shape[1],logit.shape[2])
domain_task_binary_loss = loss_fct(logit.view(-1,2), all_sentence_binary_label.view(-1))
return domain_task_binary_loss, logit
1610
1611
1612elif func == "task_class":
1613#######
1614outputs = self.bert(
1615input_ids=input_ids_org,
1616attention_mask=attention_mask,
1617token_type_ids=token_type_ids,
1618position_ids=position_ids,
1619head_mask=head_mask,
1620inputs_embeds=inputs_embeds,
1621output_attentions=output_attentions,
1622output_hidden_states=output_hidden_states,
1623return_dict=return_dict,
1624)
1625#######
1626#Already including query rep
1627loss_fct = CrossEntropyLoss()
1628###
1629class_logit = self.classifier_DomainandTask(outputs.last_hidden_state, input_ids_org)
1630task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))
1631
if from_query:
query_rep_head = outputs.last_hidden_state[:,0,:]              # [CLS] representation
query_rep_tail = outputs.last_hidden_state[input_ids_org==102] # [SEP] (id 102) representation

if query_rep_tail.shape[0] != input_ids_org.shape[0]:
# Some sequence contains no [SEP] (id 102); fall back to the indices supplied in tail_idxs.
logger.warning("No [SEP] (id 102) token found in some sequences; falling back to tail_idxs.")
query_rep_tail = outputs.last_hidden_state[input_ids_org==tail_idxs]
1643return task_loss, class_logit, query_rep_head, query_rep_tail
1644else:
1645return task_loss, class_logit
1646
1647
1648elif func == "task_class_domain":
1649#######
1650outputs = self.bert(
1651input_ids=input_ids_org,
1652attention_mask=attention_mask,
1653token_type_ids=token_type_ids,
1654position_ids=position_ids,
1655head_mask=head_mask,
1656inputs_embeds=inputs_embeds,
1657output_attentions=output_attentions,
1658output_hidden_states=output_hidden_states,
1659return_dict=return_dict,
1660)
1661#######
1662#Already including query rep
1663loss_fct = CrossEntropyLoss()
1664###
1665class_logit = self.classifier_Domain(outputs.last_hidden_state, input_ids_org)
1666task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))
1667
if from_query:
query_rep_head = outputs.last_hidden_state[:,0,:]              # [CLS] representation
query_rep_tail = outputs.last_hidden_state[input_ids_org==102] # [SEP] (id 102) representation
if query_rep_tail.shape[0] != input_ids_org.shape[0]:
# Some sequence contains no [SEP] (id 102); fall back to the indices supplied in tail_idxs.
logger.warning("No [SEP] (id 102) token found in some sequences; falling back to tail_idxs.")
query_rep_tail = outputs.last_hidden_state[input_ids_org==tail_idxs]
1678return task_loss, class_logit, query_rep_head, query_rep_tail
1679else:
1680return task_loss, class_logit
1681
1682
1683elif func == "task_class_nodomain":
1684#######
1685outputs = self.bert(
1686input_ids=input_ids_org,
1687attention_mask=attention_mask,
1688token_type_ids=token_type_ids,
1689position_ids=position_ids,
1690head_mask=head_mask,
1691inputs_embeds=inputs_embeds,
1692output_attentions=output_attentions,
1693output_hidden_states=output_hidden_states,
1694return_dict=return_dict,
1695)
1696#######
1697#Already including query rep
1698loss_fct = CrossEntropyLoss()
1699###
1700class_logit = self.classifier_Task(outputs.last_hidden_state)
1701task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1))
1702
if from_query:
query_rep_head = outputs.last_hidden_state[:,0,:]              # [CLS] representation
query_rep_tail = outputs.last_hidden_state[input_ids_org==102] # [SEP] (id 102) representation
if query_rep_tail.shape[0] != input_ids_org.shape[0]:
# Some sequence contains no [SEP] (id 102); fall back to the indices supplied in tail_idxs.
logger.warning("No [SEP] (id 102) token found in some sequences; falling back to tail_idxs.")
query_rep_tail = outputs.last_hidden_state[input_ids_org==tail_idxs]
1713return task_loss, class_logit, query_rep_head, query_rep_tail
1714else:
1715return task_loss, class_logit
1716
1717
1718elif func == "mlm":
1719outputs_mlm = self.bert(
1720input_ids=input_ids,
1721attention_mask=attention_mask,
1722token_type_ids=token_type_ids,
1723position_ids=position_ids,
1724head_mask=head_mask,
1725inputs_embeds=inputs_embeds,
1726output_attentions=output_attentions,
1727output_hidden_states=output_hidden_states,
1728return_dict=return_dict,
1729)
1730
sequence_output = outputs_mlm.last_hidden_state
prediction_scores = self.cls(sequence_output)
# Positions where lm_label == -1 are ignored when computing the MLM loss.
loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
1738return masked_lm_loss
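# Sketch of an lm_label tensor compatible with the branch above (an assumption about the
# data pipeline, not taken from this repository):
#   lm_label = input_ids_org.clone()
#   lm_label[~masked_positions] = -1   # positions with -1 are ignored by the loss above
# where masked_positions marks the tokens that were replaced by [MASK] in input_ids.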
1739
1740
1741elif func == "task_class and mlm":
1742#######
1743outputs = self.bert(
1744input_ids=input_ids_org,
1745attention_mask=attention_mask,
1746token_type_ids=token_type_ids,
1747position_ids=position_ids,
1748head_mask=head_mask,
1749inputs_embeds=inputs_embeds,
1750output_attentions=output_attentions,
1751output_hidden_states=output_hidden_states,
1752return_dict=return_dict,
1753)
1754#######
1755#######
1756outputs_mlm = self.bert(
1757input_ids=input_ids,
1758attention_mask=attention_mask,
1759token_type_ids=token_type_ids,
1760position_ids=position_ids,
1761head_mask=head_mask,
1762inputs_embeds=inputs_embeds,
1763output_attentions=output_attentions,
1764output_hidden_states=output_hidden_states,
1765return_dict=return_dict,
1766)
1767#######
1768#Already including query rep
1769#task loss
1770loss_fct = CrossEntropyLoss()
1771###
1772'''
1773#rep = outputs.last_hidden_state[input_ids==2]
1774rep = outputs.last_hidden_state[:, 0, :]
1775#rep = rep.detach()
1776task_rep = self.task_layer(rep)
1777class_logit = self.layer_out_taskClass((self.act(task_rep)))
1778'''
# NOTE: this model defines classifier_Task / classifier_Domain / classifier_DomainandTask,
# but no self.classifier; use the task classification head here.
class_logit = self.classifier_Task(outputs.last_hidden_state)
###
task_loss = loss_fct(class_logit.view(-1, 8), sentence_label.view(-1))  # NOTE: the number of classes is hard-coded to 8 in this branch
1782
1783#mlm loss
1784sequence_output = outputs_mlm.last_hidden_state
1785#prediction_scores = self.lm_head(sequence_output)
1786prediction_scores = self.cls(sequence_output)
1787loss_fct = CrossEntropyLoss(ignore_index=-1)
1788masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1))
1789return task_loss, masked_lm_loss
1790
1791elif func == "gen_rep":
1792outputs = self.bert(
1793input_ids=input_ids_org,
1794attention_mask=attention_mask,
1795token_type_ids=token_type_ids,
1796position_ids=position_ids,
1797head_mask=head_mask,
1798inputs_embeds=inputs_embeds,
1799output_attentions=output_attentions,
1800output_hidden_states=output_hidden_states,
1801return_dict=return_dict,
1802)
1803return outputs
1804
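# Illustrative usage sketch (not called anywhere in this file): forward() above is a
# multi-purpose entry point dispatched on `func`. The `model` (a BertForMaskedLMDomainTask)
# and `tokenizer` (a BertTokenizer) arguments are assumptions supplied by the caller.
def _example_domain_task_forward(model, tokenizer, device="cpu"):
    enc = tokenizer(["an example sentence"], return_tensors="pt", padding=True).to(device)
    # [SEP]/[CLS] representations, e.g. for retrieval:
    rep_tail, rep_head = model(input_ids_org=enc["input_ids"],
                               attention_mask=enc["attention_mask"],
                               func="in_domain_task_rep")
    # Supervised task loss with the head+tail classifier:
    task_loss, class_logit = model(input_ids_org=enc["input_ids"],
                                   attention_mask=enc["attention_mask"],
                                   sentence_label=torch.tensor([0]).to(device),
                                   func="task_class")
    return rep_head, rep_tail, task_loss, class_logit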
1805
1806
1807
1808
1809
1810
1811@add_start_docstrings(
1812"""Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
1813)
1814class BertForNextSentencePrediction(BertPreTrainedModel):
1815def __init__(self, config):
1816super().__init__(config)
1817
1818self.bert = BertModel(config)
1819self.cls = BertOnlyNSPHead(config)
1820
1821self.init_weights()
1822
1823@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
1824@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
1825def forward(
1826self,
1827input_ids=None,
1828attention_mask=None,
1829token_type_ids=None,
1830position_ids=None,
1831head_mask=None,
1832inputs_embeds=None,
1833next_sentence_label=None,
1834output_attentions=None,
1835output_hidden_states=None,
1836return_dict=None,
1837):
1838r"""
1839next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring).
1841Indices should be in ``[0, 1]``.
1842``0`` indicates sequence B is a continuation of sequence A,
1843``1`` indicates sequence B is a random sequence.
1844
1845Returns:
1846
1847Example::
1848
1849>>> from transformers import BertTokenizer, BertForNextSentencePrediction
1850>>> import torch
1851
1852>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1853>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
1854
1855>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
1856>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
1857>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
1858
1859>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
1860>>> logits = outputs.logits
1861>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
1862"""
1863return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1864
1865outputs = self.bert(
1866input_ids,
1867attention_mask=attention_mask,
1868token_type_ids=token_type_ids,
1869position_ids=position_ids,
1870head_mask=head_mask,
1871inputs_embeds=inputs_embeds,
1872output_attentions=output_attentions,
1873output_hidden_states=output_hidden_states,
1874return_dict=return_dict,
1875)
1876
1877pooled_output = outputs[1]
1878
1879seq_relationship_scores = self.cls(pooled_output)
1880
1881next_sentence_loss = None
1882if next_sentence_label is not None:
1883loss_fct = CrossEntropyLoss()
1884next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), next_sentence_label.view(-1))
1885
1886if not return_dict:
1887output = (seq_relationship_scores,) + outputs[2:]
1888return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
1889
1890return NextSentencePredictorOutput(
1891loss=next_sentence_loss,
1892logits=seq_relationship_scores,
1893hidden_states=outputs.hidden_states,
1894attentions=outputs.attentions,
1895)
1896
1897
1898@add_start_docstrings(
1899"""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
1900the pooled output) e.g. for GLUE tasks. """,
1901BERT_START_DOCSTRING,
1902)
1903class BertForSequenceClassification(BertPreTrainedModel):
1904def __init__(self, config):
1905super().__init__(config)
1906self.num_labels = config.num_labels
1907
1908self.bert = BertModel(config)
1909self.dropout = nn.Dropout(config.hidden_dropout_prob)
1910self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1911
1912self.init_weights()
1913
1914
1915@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
1916@add_code_sample_docstrings(
1917tokenizer_class=_TOKENIZER_FOR_DOC,
1918checkpoint="bert-base-uncased",
1919output_type=SequenceClassifierOutput,
1920config_class=_CONFIG_FOR_DOC,
1921)
1922def forward(
1923self,
1924input_ids=None,
1925attention_mask=None,
1926token_type_ids=None, #segment_ids
1927position_ids=None, #
1928head_mask=None,
1929inputs_embeds=None,
1930labels=None,
1931output_attentions=None,
1932output_hidden_states=None,
1933return_dict=None,
1934):
1935r"""
1936labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1937Labels for computing the sequence classification/regression loss.
1938Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
1939If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1940If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1941"""
1942return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1943
1944outputs = self.bert(
1945input_ids,
1946attention_mask=attention_mask,
1947token_type_ids=token_type_ids,
1948position_ids=position_ids,
1949head_mask=head_mask,
1950inputs_embeds=inputs_embeds,
1951output_attentions=output_attentions,
1952output_hidden_states=output_hidden_states,
1953return_dict=return_dict,
1954)
1955
1956pooled_output = outputs[1]
1957
1958#pooled_output = pooled_output.detach()
1959pooled_output = self.dropout(pooled_output)
1960logits = self.classifier(pooled_output)
1961
1962loss = None
1963if labels is not None:
1964if self.num_labels == 1:
1965# We are doing regression
1966loss_fct = MSELoss()
1967loss = loss_fct(logits.view(-1), labels.view(-1))
1968else:
1969loss_fct = CrossEntropyLoss()
1970loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1971
1972
1973if not return_dict:
1974output = (logits,) + outputs[2:]
1975return ((loss,) + output) if loss is not None else output
1976
1977return SequenceClassifierOutput(
1978loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
1979)
1980
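# Usage note for BertForSequenceClassification above: with config.num_labels == 1 the labels
# are treated as regression targets (MSELoss); with num_labels > 1 they are class indices
# (CrossEntropyLoss). A minimal sketch, assuming a BertTokenizer instance `tokenizer`:
#   model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3, return_dict=True)
#   outputs = model(**tokenizer("some text", return_tensors="pt"), labels=torch.tensor([2]))
#   outputs.loss, outputs.logits   # cross-entropy loss and logits of shape (1, 3)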
1981
1982@add_start_docstrings(
1983"""Bert Model with a multiple choice classification head on top (a linear layer on top of
1984the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
1985BERT_START_DOCSTRING,
1986)
1987class BertForMultipleChoice(BertPreTrainedModel):
1988def __init__(self, config):
1989super().__init__(config)
1990
1991self.bert = BertModel(config)
1992self.dropout = nn.Dropout(config.hidden_dropout_prob)
1993self.classifier = nn.Linear(config.hidden_size, 1)
1994
1995self.init_weights()
1996
1997@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
1998@add_code_sample_docstrings(
1999tokenizer_class=_TOKENIZER_FOR_DOC,
2000checkpoint="bert-base-uncased",
2001output_type=MultipleChoiceModelOutput,
2002config_class=_CONFIG_FOR_DOC,
2003)
2004def forward(
2005self,
2006input_ids=None,
2007attention_mask=None,
2008token_type_ids=None,
2009position_ids=None,
2010head_mask=None,
2011inputs_embeds=None,
2012labels=None,
2013output_attentions=None,
2014output_hidden_states=None,
2015return_dict=None,
2016):
2017r"""
2018labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
2019Labels for computing the multiple choice classification loss.
2020Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
2021of the input tensors. (see `input_ids` above)
2022"""
2023return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2024num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
2025
2026input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
2027attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
2028token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
2029position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
2030inputs_embeds = (
2031inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
2032if inputs_embeds is not None
2033else None
2034)
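# Shape bookkeeping: each (batch_size, num_choices, seq_len) input has just been flattened
# to (batch_size * num_choices, seq_len) so BERT scores every choice as its own sequence;
# the per-choice logits are reshaped back to (batch_size, num_choices) further below.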
2035
2036outputs = self.bert(
2037input_ids,
2038attention_mask=attention_mask,
2039token_type_ids=token_type_ids,
2040position_ids=position_ids,
2041head_mask=head_mask,
2042inputs_embeds=inputs_embeds,
2043output_attentions=output_attentions,
2044output_hidden_states=output_hidden_states,
2045return_dict=return_dict,
2046)
2047
2048pooled_output = outputs[1]
2049
2050pooled_output = self.dropout(pooled_output)
2051logits = self.classifier(pooled_output)
2052reshaped_logits = logits.view(-1, num_choices)
2053
2054loss = None
2055if labels is not None:
2056loss_fct = CrossEntropyLoss()
2057loss = loss_fct(reshaped_logits, labels)
2058
2059if not return_dict:
2060output = (reshaped_logits,) + outputs[2:]
2061return ((loss,) + output) if loss is not None else output
2062
2063return MultipleChoiceModelOutput(
2064loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
2065)
2066
2067
2068@add_start_docstrings(
2069"""Bert Model with a token classification head on top (a linear layer on top of
2070the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
2071BERT_START_DOCSTRING,
2072)
2073class BertForTokenClassification(BertPreTrainedModel):
2074def __init__(self, config):
2075super().__init__(config)
2076self.num_labels = config.num_labels
2077
2078self.bert = BertModel(config)
2079self.dropout = nn.Dropout(config.hidden_dropout_prob)
2080self.classifier = nn.Linear(config.hidden_size, config.num_labels)
2081
2082self.init_weights()
2083
2084@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
2085@add_code_sample_docstrings(
2086tokenizer_class=_TOKENIZER_FOR_DOC,
2087checkpoint="bert-base-uncased",
2088output_type=TokenClassifierOutput,
2089config_class=_CONFIG_FOR_DOC,
2090)
2091def forward(
2092self,
2093input_ids=None,
2094attention_mask=None,
2095token_type_ids=None,
2096position_ids=None,
2097head_mask=None,
2098inputs_embeds=None,
2099labels=None,
2100output_attentions=None,
2101output_hidden_states=None,
2102return_dict=None,
2103):
2104r"""
2105labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
2106Labels for computing the token classification loss.
2107Indices should be in ``[0, ..., config.num_labels - 1]``.
2108"""
2109return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2110
2111outputs = self.bert(
2112input_ids,
2113attention_mask=attention_mask,
2114token_type_ids=token_type_ids,
2115position_ids=position_ids,
2116head_mask=head_mask,
2117inputs_embeds=inputs_embeds,
2118output_attentions=output_attentions,
2119output_hidden_states=output_hidden_states,
2120return_dict=return_dict,
2121)
2122
2123sequence_output = outputs[0]
2124
2125sequence_output = self.dropout(sequence_output)
2126logits = self.classifier(sequence_output)
2127
2128loss = None
2129if labels is not None:
2130loss_fct = CrossEntropyLoss()
2131# Only keep active parts of the loss
2132if attention_mask is not None:
2133active_loss = attention_mask.view(-1) == 1
2134active_logits = logits.view(-1, self.num_labels)
2135active_labels = torch.where(
2136active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
2137)
2138loss = loss_fct(active_logits, active_labels)
2139else:
2140loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
2141
2142if not return_dict:
2143output = (logits,) + outputs[2:]
2144return ((loss,) + output) if loss is not None else output
2145
2146return TokenClassifierOutput(
2147loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
2148)
2149
2150
2151@add_start_docstrings(
2152"""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
2153layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
2154BERT_START_DOCSTRING,
2155)
2156class BertForQuestionAnswering(BertPreTrainedModel):
2157def __init__(self, config):
2158super().__init__(config)
2159self.num_labels = config.num_labels
2160
2161self.bert = BertModel(config)
2162self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
2163
2164self.init_weights()
2165
2166@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
2167@add_code_sample_docstrings(
2168tokenizer_class=_TOKENIZER_FOR_DOC,
2169checkpoint="bert-base-uncased",
2170output_type=QuestionAnsweringModelOutput,
2171config_class=_CONFIG_FOR_DOC,
2172)
2173def forward(
2174self,
2175input_ids=None,
2176attention_mask=None,
2177token_type_ids=None,
2178position_ids=None,
2179head_mask=None,
2180inputs_embeds=None,
2181start_positions=None,
2182end_positions=None,
2183output_attentions=None,
2184output_hidden_states=None,
2185return_dict=None,
2186):
2187r"""
2188start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
2189Labels for position (index) of the start of the labelled span for computing the token classification loss.
2190Positions are clamped to the length of the sequence (`sequence_length`).
2191Position outside of the sequence are not taken into account for computing the loss.
2192end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
2193Labels for position (index) of the end of the labelled span for computing the token classification loss.
2194Positions are clamped to the length of the sequence (`sequence_length`).
2195Position outside of the sequence are not taken into account for computing the loss.
2196"""
2197return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2198
2199outputs = self.bert(
2200input_ids,
2201attention_mask=attention_mask,
2202token_type_ids=token_type_ids,
2203position_ids=position_ids,
2204head_mask=head_mask,
2205inputs_embeds=inputs_embeds,
2206output_attentions=output_attentions,
2207output_hidden_states=output_hidden_states,
2208return_dict=return_dict,
2209)
2210
2211sequence_output = outputs[0]
2212
2213logits = self.qa_outputs(sequence_output)
2214start_logits, end_logits = logits.split(1, dim=-1)
2215start_logits = start_logits.squeeze(-1)
2216end_logits = end_logits.squeeze(-1)
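# Shape bookkeeping: qa_outputs maps (batch_size, seq_len, hidden) to (batch_size, seq_len, 2)
# when num_labels == 2; split(1, dim=-1) yields two (batch_size, seq_len, 1) tensors and
# squeeze(-1) gives the (batch_size, seq_len) start/end logits used below.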
2217
2218total_loss = None
2219if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, the position tensors may carry an extra dimension; squeeze it away
2221if len(start_positions.size()) > 1:
2222start_positions = start_positions.squeeze(-1)
2223if len(end_positions.size()) > 1:
2224end_positions = end_positions.squeeze(-1)
# Sometimes the start/end positions fall outside our model inputs; we ignore these terms
2226ignored_index = start_logits.size(1)
2227start_positions.clamp_(0, ignored_index)
2228end_positions.clamp_(0, ignored_index)
2229
2230loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
2231start_loss = loss_fct(start_logits, start_positions)
2232end_loss = loss_fct(end_logits, end_positions)
2233total_loss = (start_loss + end_loss) / 2
2234
2235if not return_dict:
2236output = (start_logits, end_logits) + outputs[2:]
2237return ((total_loss,) + output) if total_loss is not None else output
2238
2239return QuestionAnsweringModelOutput(
2240loss=total_loss,
2241start_logits=start_logits,
2242end_logits=end_logits,
2243hidden_states=outputs.hidden_states,
2244attentions=outputs.attentions,
2245)
2246