CSS-LM

eval_roberta_useMLMCLASS.py 
565 lines · 22.0 KB

import argparse
import logging
import random
import numpy as np
import os
import json

import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
from transformers.modeling_roberta import RobertaForMaskedLMDomainTask
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels), outputs


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids=None, attention_mask=None, segment_ids=None, label_id=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, sentence, aspect, sentiment=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            sentence: string. The untokenized text of the sequence.
            aspect: string. The aspect label of the example.
            sentiment: (Optional) string. The sentiment label of the example.
        """
        self.guid = guid
        self.sentence = sentence
        self.aspect = aspect
        self.sentiment = sentiment


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_json(cls, input_file):
        with open(input_file, "r", encoding='utf-8') as f:
            return json.loads(f.read())


class Processor_1(DataProcessor):
    """Processor for the aspect/sentiment classification data set (JSON files)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "train.json")), "train")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_dev_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "dev.json")), "dev")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_test_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "test.json")), "test")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_labels(self):
        """Unused for this task."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)

            sentence = line["sentence"]
            aspect = line["aspect"]
            sentiment = line["sentiment"]

            examples.append(
                InputExample(guid=guid, sentence=sentence, aspect=aspect, sentiment=sentiment))
        return examples

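
# Note on the input format (inferred from _create_examples above; the values
# below are only illustrative): each of train.json / dev.json / test.json is a
# JSON list of objects with "sentence", "aspect" and "sentiment" keys, e.g.
#   [
#     {"sentence": "The battery lasts all day.",
#      "aspect": "battery",
#      "sentiment": "positive"}
#   ]

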
def convert_examples_to_features(examples, aspect_list, sentiment_list, max_seq_length, tokenizer, task_n):
    """Loads a data file into a list of `InputBatch`s."""

    # Task_1: sentence --> aspect
    # Task_2: aspect+sentence --> sentiment
    if task_n == 1:
        label_list = sorted(aspect_list)
    elif task_n == 2:
        label_list = sorted(sentiment_list)
    else:
        print("Wrong task")
    '''
    for w in label_list:
        print(w, tokenizer.encode(w))
    exit()
    '''
    label_map = {label: i for i, label in enumerate(label_list)}
    print("=======")
    print(label_map)
    print("=======")

    features = []
    for (ex_index, example) in enumerate(examples):

        # Add new special tokens
        '''
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')
        special_tokens_dict = {'cls_token': '<CLS>'}
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))
        '''

        '''
        print(tokenizer.all_special_tokens)
        print(tokenizer.encode(tokenizer.all_special_tokens))
        #['[PAD]', '[SEP]', '[CLS]', '[MASK]', '[UNK]']
        #[ 0, 102, 101, 103, 100]
        '''

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.

        # encode() already adds the special tokens; for RoBERTa these are
        # <s> (id 0) and </s> (id 2) rather than BERT's [CLS]/[SEP] (101/102).
        input_ids = tokenizer.encode(example.sentence, add_special_tokens=True)
        segment_ids = [0] * len(input_ids)

        if task_n == 2:
            # "[SEP]": note that 102 is BERT's separator id (RoBERTa uses 2) and
            # that this line repeats the sentence as the second segment.
            input_ids += input_ids + [102]
            # sentiment: word (Next sentence)
            # segment_ids += [1] * (len(tokens_b) + 1)

        # The attention mask has 1 for real tokens (including special tokens)
        # and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        attention_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(attention_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if task_n == 1:
            label_id = label_map[example.aspect]
        elif task_n == 2:
            label_id = label_map[example.sentiment]
        else:
            print("Wrong task")

        if task_n == 1:
            features.append(
                    InputFeatures(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  segment_ids=None,
                                  label_id=label_id))
        elif task_n == 2:
            features.append(
                    InputFeatures(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  segment_ids=segment_ids,
                                  label_id=label_id))
        else:
            print("Wrong in convert_examples")

    return features

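
# A rough illustration of the features produced above for task 1 (the values
# are hypothetical): with max_seq_length = 8 and a sentence whose RoBERTa
# encoding is [0, w1, w2, w3, 2], the stored InputFeatures would hold
#   input_ids      = [0, w1, w2, w3, 2, 0, 0, 0]
#   attention_mask = [1,  1,  1,  1, 1, 0, 0, 0]
#   segment_ids    = None  (only task 2 keeps segment ids)
#   label_id       = label_map[example.aspect]

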
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .json files (train.json, dev.json, test.json) for the task.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None, type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level',
                        type=str,
                        default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None,
                        type=int,
                        required=True,
                        help="Which task to evaluate: 1 = aspect classification, 2 = sentiment classification.")
    ###############

    args = parser.parse_args()
    #print(args.do_train, args.do_eval)
    #exit()

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))


    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    '''
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    '''
    os.makedirs(args.output_dir, exist_ok=True)


    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)


    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)

    filenames = os.listdir(args.output_dir)
    filenames = [x for x in filenames if "pytorch_model.bin_" in x]
    print(filenames)

    file_mark = []
    for x in filenames:
        file_mark.append([x, True])
        file_mark.append([x, False])

    ####
    ####
    test_examples, aspect_list, sentiment_list = processor.get_test_examples(args.data_dir)
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        print("What's task?")
        exit()
    test = convert_examples_to_features(
        test_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
    # Dev features are needed below whenever a checkpoint is evaluated with
    # mark == True, so build them the same way as the test features.
    dev_examples, dev_aspect_list, dev_sentiment_list = processor.get_dev_examples(args.data_dir)
    dev = convert_examples_to_features(
        dev_examples, dev_aspect_list, dev_sentiment_list, args.max_seq_length, tokenizer, args.task)
    eval_examples = test_examples
    ###


    for x, mark in file_mark:
        print(x, mark)
        output_model_file = os.path.join(args.output_dir, x)
        print(output_model_file)

        #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
        model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
        # strict=False: ignore non-matching keys
        model.load_state_dict(torch.load(output_model_file), strict=False)


        #param_optimizer = [para[0] for para in model.named_parameters()]
        #param_optimizer = [para for para in model.named_parameters()][-2]
        #print(param_optimizer)

        model.to(device)
        if mark:
            eval_features = dev
        else:
            eval_features = test

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.eval_batch_size)


        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in eval_features], dtype=torch.long)
        if args.task == 1:
            print("Executing task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        else:
            print("Wrong here2")

        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        if args.task == 1:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids)
        elif args.task == 2:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            eval_sampler = RandomSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        if mark:
            output_eval_file = os.path.join(args.output_dir, "eval_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1]))
            output_file_gold = os.path.join(args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1]))
        else:
            output_eval_file = os.path.join(args.output_dir, "test_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1]))
            output_file_gold = os.path.join(args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1]))

        fpred = open(output_file_pred, "w")
        fgold = open(output_file_gold, "w")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0


        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
            #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
            batch = tuple(t.to(device) for t in batch)

            if args.task == 1:
                input_ids, attention_mask, label_ids = batch
            elif args.task == 2:
                input_ids, attention_mask, segment_ids, label_ids = batch
            else:
                print("Wrong here3")


            if args.task == 1:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                #logits = output.logits
                #tmp_eval_loss = output.loss
            elif args.task == 2:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                #exit()
                #logits = output.logits
                #tmp_eval_loss = output.loss
            else:
                print("Wrong!!")


            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy, pred = accuracy(logits, label_ids)
            for a, b in zip(pred, label_ids):
                fgold.write("{}\n".format(b))
                fpred.write("{}\n".format(a))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy
                  }

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # Close the per-checkpoint prediction/gold files.
        fpred.close()
        fgold.close()


if __name__ == "__main__":
    main()
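

# Example invocation (the paths and values below are hypothetical; checkpoint
# files in --output_dir must contain "pytorch_model.bin_" in their names, as
# required by the filename filter above):
#
#   python eval_roberta_useMLMCLASS.py \
#       --data_dir data/restaurant \
#       --output_dir output/restaurant \
#       --pretrain_model roberta-base \
#       --num_labels_task 3 \
#       --max_seq_length 128 \
#       --eval_batch_size 32 \
#       --task 1 \
#       --do_eval
#
# --task 1 evaluates aspect classification and --task 2 sentiment
# classification; per-checkpoint results are written to eval_results_*.txt /
# test_results_*.txt (plus the *_pred_* and *_gold_* files) in --output_dir.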