# CSS-LM
import argparse
import logging
import random
import numpy as np
import os
import json

import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
#from transformers.modeling_roberta import RobertaForMaskedLMDomainTask
from transformers.modeling_roberta_updateRep import RobertaForMaskedLMDomainTask
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels), outputs
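
# Illustrative example (not part of the original script):
#   >>> out = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
#   >>> labels = np.array([1, 0, 0])
#   >>> accuracy(out, labels)
#   (2, array([1, 0, 1]))
# i.e. the count of correct argmax predictions plus the predicted class ids.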
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids=None, attention_mask=None, segment_ids=None, label_id=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

class InputExample(object):
    """A single training/test example for simple sequence classification."""
    def __init__(self, guid, sentence, aspect, sentiment=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            sentence: string. The untokenized text of the sentence.
            aspect: string. The aspect label of the example.
            sentiment: (Optional) string. The sentiment label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.sentence = sentence
        self.aspect = aspect
        self.sentiment = sentiment

class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_json(cls, input_file):
        with open(input_file, "r", encoding='utf-8') as f:
            return json.loads(f.read())

class Processor_1(DataProcessor):
    """Processor for aspect/sentiment classification data stored as JSON."""

    def get_train_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "train.json")), "train")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_dev_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "dev.json")), "dev")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_test_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "test.json")), "test")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_labels(self):
        """Not used; the label sets are derived from the data instead."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)

            sentence = line["sentence"]
            aspect = line["aspect"]
            sentiment = line["sentiment"]

            examples.append(
                InputExample(guid=guid, sentence=sentence, aspect=aspect, sentiment=sentiment))
        return examples

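# The processor above assumes each of train.json / dev.json / test.json holds a
# list of records with "sentence", "aspect" and "sentiment" fields, e.g.
# (hypothetical record, not taken from the repo's data):
#   [
#     {"sentence": "The pasta was cold.", "aspect": "food", "sentiment": "negative"},
#     ...
#   ]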
def convert_examples_to_features(examples, aspect_list, sentiment_list, max_seq_length, tokenizer, task_n):

    """Loads a data file into a list of `InputBatch`s."""

    # Task_1: sentence --> aspect
    # Task_2: aspect+sentence --> sentiment
    if task_n == 1:
        label_list = sorted(aspect_list)
    elif task_n == 2:
        label_list = sorted(sentiment_list)
    else:
        print("Wrong task")
    '''
    for w in label_list:
        print(w, tokenizer.encode(w))
    exit()
    '''
    label_map = {label : i for i, label in enumerate(label_list)}
    print("=======")
    print(label_map)
    print("=======")


    features = []
    for (ex_index, example) in enumerate(examples):
        # Add new special tokens
        '''
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')
        special_tokens_dict = {'cls_token': '<CLS>'}
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))
        '''

        '''
        print(tokenizer.all_special_tokens)
        print(tokenizer.encode(tokenizer.all_special_tokens))
        #['[PAD]', '[SEP]', '[CLS]', '[MASK]', '[UNK]']
        #[ 0, 102, 101, 103, 100]
        '''

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0        0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.

        ###
        # encode() already adds the special tokens. For the RoBERTa tokenizer used
        # here these are <s> (id 0) and </s> (id 2) rather than BERT's [CLS]/[SEP]
        # (ids 101, 102). Note that no truncation is applied here, so sentences
        # longer than max_seq_length will trip the length asserts below.
        input_ids = tokenizer.encode(example.sentence, add_special_tokens=True)
        segment_ids = [0] * len(input_ids)

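        # Illustrative check (assumes the roberta-base vocabulary; not part of
        # the original script):
        #   >>> ids = tokenizer.encode("the food was great", add_special_tokens=True)
        #   >>> ids[0], ids[-1]        # <s> and </s>
        #   (0, 2)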
        '''
        if task_n==2:
            #"[SEP]"
            input_ids += input_ids + [102]
            #sentiment: word (Next sentence)
            #segment_ids += [1] * (len(tokens_b) + 1)
        '''

        # The attention mask is simply an array of 1s and 0s indicating which
        # tokens are padding and which aren't (including special tokens).
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        attention_mask += padding
        segment_ids += padding
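
        # Illustrative example (not part of the original script): with
        # max_seq_length = 8 and an encoded sentence [0, a, b, c, 2]
        # (<s> ... </s>), the padded features become
        #   input_ids      = [0, a, b, c, 2, 0, 0, 0]
        #   attention_mask = [1, 1, 1, 1, 1, 0, 0, 0]
        #   segment_ids    = [0, 0, 0, 0, 0, 0, 0, 0]
        # RoBERTa's actual <pad> id is 1, not 0, but the attention mask keeps the
        # padded positions from being attended to either way.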

        assert len(input_ids) == max_seq_length
        assert len(attention_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if task_n == 1:
            label_id = label_map[example.aspect]
        elif task_n == 2:
            label_id = label_map[example.sentiment]
        else:
            print("Wrong task")

        if task_n == 1:
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              segment_ids=None,
                              label_id=label_id))
        elif task_n == 2:
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
        else:
            print("Wrong in convert_examples")

    return features

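# Hypothetical usage sketch (mirrors what main() below does; the data path and
# parameter values are placeholders, not from the repo):
#   tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   processor = Processor_1()
#   examples, aspects, sentiments = processor.get_test_examples("./data")
#   features = convert_examples_to_features(examples, aspects, sentiments,
#                                            max_seq_length=128,
#                                            tokenizer=tokenizer, task_n=1)
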
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None, type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level',
                        type=str,
                        default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None,
                        type=int,
                        required=True,
                        help="Choose Task")
    ###############
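
    # Hypothetical invocation (script name, paths and label counts are
    # placeholders, not from the repo):
    #   python finetune_roberta_eval.py \
    #       --data_dir ./data --output_dir ./out \
    #       --pretrain_model roberta-base \
    #       --num_labels_task 3 --task 1 --do_eval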
    args = parser.parse_args()
    #print(args.do_train, args.do_eval)
    #exit()

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))



    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_eval:
        raise ValueError("This script only runs evaluation, so `do_eval` must be True.")

    '''
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    '''
    os.makedirs(args.output_dir, exist_ok=True)



    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    num_labels = num_labels
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)

    filenames = os.listdir(args.output_dir)
    filenames = [x for x in filenames if "pytorch_model.bin_" in x]
    print(filenames)

    file_mark = []
    model_performace = dict()
    for x in filenames:
        file_mark.append([x, True])
        #file_mark.append([x, False])

    ####
    ####
    test_examples, aspect_list, sentiment_list = processor.get_test_examples(args.data_dir)
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        print("What's task?")
        exit()
    dev = convert_examples_to_features(
        test_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
    eval_examples = test_examples
    ###

    for x, mark in file_mark:
        print(x, mark)
        output_model_file = os.path.join(args.output_dir, x)

        #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
        model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
        model.load_state_dict(torch.load(output_model_file), strict=False)
        #strict False: ignore non-matching keys

        #param_optimizer = [para[0] for para in model.named_parameters()]
        #param_optimizer = [para for para in model.named_parameters()][-2]
        #print(param_optimizer)

        model.to(device)
        if mark:
            eval_features = dev
        else:
            # Note: `test` is never defined in this script; this branch is only
            # reachable if the commented-out `file_mark.append([x, False])` above
            # is re-enabled.
            eval_features = test
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in eval_features], dtype=torch.long)
        if args.task == 1:
            print("Executing task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        else:
            print("Wrong here2")

        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        if args.task == 1:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids)
        elif args.task == 2:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            eval_sampler = RandomSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
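
        # For reference: with task 1 each batch yielded below is a 3-tuple of
        # tensors shaped (batch_size, max_seq_length) for input_ids and
        # attention_mask and (batch_size,) for label_ids; task 2 additionally
        # carries segment_ids with shape (batch_size, max_seq_length).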
        if mark:
            output_eval_file = os.path.join(args.output_dir, "eval_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1]))
            output_file_glod = os.path.join(args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1]))
        else:
            output_eval_file = os.path.join(args.output_dir, "test_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1]))
            output_file_glod = os.path.join(args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1]))

        fpred = open(output_file_pred, "w")
        fgold = open(output_file_glod, "w")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
            #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
            batch = tuple(t.to(device) for i, t in enumerate(batch))

            if args.task == 1:
                input_ids, attention_mask, label_ids = batch
            elif args.task == 2:
                input_ids, attention_mask, segment_ids, label_ids = batch
            else:
                print("Wrong here3")

            if args.task == 1:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                #logits = output.logits
                #tmp_eval_loss = output.loss
            elif args.task == 2:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                #exit()
                #logits = output.logits
                #tmp_eval_loss = output.loss
            else:
                print("Wrong!!")

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy, pred = accuracy(logits, label_ids)
            for a, b in zip(pred, label_ids):
                fgold.write("{}\n".format(b))
                fpred.write("{}\n".format(a))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy
                  }

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # Close the per-checkpoint prediction/gold files before moving on.
        fpred.close()
        fgold.close()

        model_performace[x] = eval_accuracy
    #################
    #################
    model_name_best = 0
    score_best = 0
    for model_name, score in model_performace.items():
        if score > score_best:
            score_best = score
            model_name_best = model_name

    model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
    model_name_best = os.path.join(args.output_dir, model_name_best)
    model.load_state_dict(torch.load(model_name_best), strict=False)
    # Save a trained model
    logger.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_best")
    torch.save(model_to_save.state_dict(), output_model_file)

if __name__ == "__main__":
    main()