CSS-LM

eval_roberta_sentiment_class.py
564 строки · 21.8 Кб
Перенос по словам
1
import argparse
2
import logging
3
import random
4
import numpy as np
5
import os
6
import json
7

8
import torch
9
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
10
from transformers.modeling_roberta import RobertaForMaskedLMDomainTask
11
from tqdm import tqdm, trange
12
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
13
from torch.utils.data.distributed import DistributedSampler
14
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
15
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
16

17

18

19
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
20
                    datefmt = '%m/%d/%Y %H:%M:%S',
21
                    level = logging.INFO)
22
logger = logging.getLogger(__name__)
23

24
def accuracy(out, labels):
25
    outputs = np.argmax(out, axis=1)
26
    return np.sum(outputs == labels), outputs
27

28
class InputFeatures(object):
29
    """A single set of features of data."""
30

31
    def __init__(self, input_ids=None, attention_mask=None, segment_ids=None, label_id=None):
32
        self.input_ids = input_ids
33
        self.attention_mask = attention_mask
34
        self.segment_ids = segment_ids
35
        self.label_id = label_id
36

37
class InputExample(object):
38
    """A single training/test example for simple sequence classification."""
39
    def __init__(self, guid, sentence, aspect, sentiment=None):
40
        """Constructs a InputExample.
41

42
        Args:
43
            guid: Unique id for the example.
44
            text_a: string. The untokenized text of the first sequence. For single
45
            sequence tasks, only this sequence must be specified.
46
            text_b: (Optional) string. The untokenized text of the second sequence.
47
            Only must be specified for sequence pair tasks.
48
            label: (Optional) string. The label of the example. This should be
49
            specified for train and dev examples, but not for test examples.
50
        """
51
        self.guid = guid
52
        self.sentence = sentence
53
        self.aspect = aspect
54
        self.sentiment = sentiment
55

56

57
class DataProcessor(object):
58
    """Base class for data converters for sequence classification data sets."""
59

60
    def get_train_examples(self, data_dir):
61
        """Gets a collection of `InputExample`s for the train set."""
62
        raise NotImplementedError()
63

64
    def get_dev_examples(self, data_dir):
65
        """Gets a collection of `InputExample`s for the dev set."""
66
        raise NotImplementedError()
67

68
    def get_labels(self):
69
        """Gets the list of labels for this data set."""
70
        raise NotImplementedError()
71

72
    @classmethod
73
    def _read_json(cls, input_file):
74
        with open(input_file, "r", encoding='utf-8') as f:
75
            return json.loads(f.read())
76

77

78
class Processor_1(DataProcessor):
79
    """Processor for the CoLA data set (GLUE version)."""
80

81
    def get_train_examples(self, data_dir):
82
        """See base class."""
83
        examples = self._create_examples(
84
            self._read_json(os.path.join(data_dir, "train.json")), "train")
85
        aspect = set([x.aspect for x in examples])
86
        sentiment = set([x.sentiment for x in examples])
87
        return examples, list(aspect), list(sentiment)
88

89
    def get_dev_examples(self, data_dir):
90
        """See base class."""
91
        examples = self._create_examples(
92
            self._read_json(os.path.join(data_dir, "dev.json")), "dev")
93
        aspect = set([x.aspect for x in examples])
94
        sentiment = set([x.sentiment for x in examples])
95
        return examples, list(aspect), list(sentiment)
96

97
    def get_test_examples(self, data_dir):
98
        """See base class."""
99
        examples = self._create_examples(
100
            self._read_json(os.path.join(data_dir, "test.json")), "test")
101
        aspect = set([x.aspect for x in examples])
102
        sentiment = set([x.sentiment for x in examples])
103
        return examples, list(aspect), list(sentiment)
104

105
    def get_labels(self):
106
        """Useless"""
107
        return ["0", "1"]
108

109
    def _create_examples(self, lines, set_type):
110
        """Creates examples for the training and dev sets."""
111
        examples = []
112
        for (i, line) in enumerate(lines):
113
            guid = "%s-%s" % (set_type, i)
114

115
            sentence = line["sentence"]
116
            aspect = line["aspect"]
117
            sentiment = line["sentiment"]
118

119
            examples.append(
120
                InputExample(guid=guid, sentence=sentence, aspect=aspect, sentiment=sentiment))
121
        return examples
122

123
def convert_examples_to_features(examples, aspect_list, sentiment_list, max_seq_length, tokenizer, task_n):
124

125
    """Loads a data file into a list of `InputBatch`s."""
126

127
    #Task_1: sentence --> aspect
128
    #Task_2: aspect+sentence --> sentiment
129
    if task_n == 1:
130
        label_list = sorted(aspect_list)
131
    elif task_n == 2:
132
        label_list = sorted(sentiment_list)
133
    else:
134
        print("Wrong task")
135
    '''
136
    for w in label_list:
137
        print(w,tokenizer.encode(w))
138
    exit()
139
    '''
140
    label_map = {label : i for i, label in enumerate(label_list)}
141

142

143
    features = []
144
    for (ex_index, example) in enumerate(examples):
145

146
        #Add new special tokens
147
        '''
148
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
149
model = GPT2Model.from_pretrained('gpt2')
150
        special_tokens_dict = {'cls_token': '<CLS>'}
151
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
152
        print('We have added', num_added_toks, 'tokens')
153
        model.resize_token_embeddings(len(tokenizer))
154
        '''
155

156
        '''
157
        print(tokenizer.all_special_tokens)
158
        print(tokenizer.encode(tokenizer.all_special_tokens))
159
        #['[PAD]', '[SEP]', '[CLS]', '[MASK]', '[UNK]']
160
        #[ 0, 102, 101, 103, 100]
161
        '''
162

163

164
        # The convention in BERT is:
165
        # (a) For sequence pairs:
166
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
167
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
168
        # (b) For single sequences:
169
        #  tokens:   [CLS] the dog is hairy . [SEP]
170
        #  type_ids: 0   0   0   0  0     0 0
171
        #
172
        # Where "type_ids" are used to indicate whether this is the first
173
        # sequence or the second sequence. The embedding vectors for `type=0` and
174
        # `type=1` were learned during pre-training and are added to the wordpiece
175
        # embedding vector (and position vector). This is not *strictly* necessary
176
        # since the [SEP] token unambigiously separates the sequences, but it makes
177
        # it easier for the model to learn the concept of sequences.
178
        #
179
        # For classification tasks, the first vector (corresponding to [CLS]) is
180
        # used as as the "sentence vector". Note that this only makes sense because
181
        # the entire model is fine-tuned.
182

183
        ###
184
        #Already add [CLS] and [SEP]
185
        #101, 102
186
        input_ids = tokenizer.encode(example.sentence,add_special_tokens=True)
187
        ###
188
        next_input = tokenizer.encode(example.aspect,add_special_tokens=False)
189
        #next_input = tokenizer.encode("restaurant",add_special_tokens=False)
190
        next_input = [3] + next_input + [2]
191
        input_ids+=next_input
192
        ###
193
        segment_ids = [0] * len(input_ids)
194

195
        '''
196
        if task_n==2:
197
            #"[SEP]"
198
            input_ids += input_ids + [102]
199
            #sentiment: word (Next sentence)
200
            #segment_ids += [1] * (len(tokens_b) + 1)
201
        '''
202

203
        # The “Attention Mask” is simply an array of 1s and 0s indicating which tokens are padding and which aren’t (including special tokens)
204

205
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
206
        # tokens are attended to.
207
        attention_mask = [1] * len(input_ids)
208

209
        # Zero-pad up to the sequence length.
210
        padding = [0] * (max_seq_length - len(input_ids))
211
        input_ids += padding
212
        attention_mask += padding
213
        segment_ids += padding
214

215
        if len(input_ids) != max_seq_length:
216
            print("input_ids more than max_seq_length")
217
            continue
218
        assert len(input_ids) == max_seq_length
219
        if len(attention_mask) != max_seq_length:
220
            print("attention_mask more than max_seq_length")
221
            continue
222
        assert len(attention_mask) == max_seq_length
223
        if len(segment_ids) != max_seq_length:
224
            print("segment_ids more than max_seq_length")
225
            continue
226
        assert len(segment_ids) == max_seq_length
227

228

229
        if task_n == 1:
230
            label_id = label_map[example.aspect]
231
        elif task_n == 2:
232
            label_id = label_map[example.sentiment]
233
        else:
234
            print("Wrong task")
235

236

237
        if task_n == 1:
238
            features.append(
239
                    InputFeatures(input_ids=input_ids,
240
                                  attention_mask=attention_mask,
241
                                  segment_ids=None,
242
                                  label_id=label_id))
243
        elif task_n == 2:
244
            features.append(
245
                    InputFeatures(input_ids=input_ids,
246
                                  attention_mask=attention_mask,
247
                                  segment_ids=segment_ids,
248
                                  label_id=label_id))
249
        else:
250
            print("Wrong in convert_examples")
251

252

253
    return features
254

255

256
def main():
257
    parser = argparse.ArgumentParser()
258
    ## Required parameters
259
    ###############
260
    parser.add_argument("--data_dir",
261
                        default=None,
262
                        type=str,
263
                        required=True,
264
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
265
    parser.add_argument("--output_dir",
266
                        default=None,
267
                        type=str,
268
                        required=True,
269
                        help="The output directory where the model predictions and checkpoints will be written.")
270
    parser.add_argument("--pretrain_model",
271
                        default='bert-case-uncased',
272
                        type=str,
273
                        required=True,
274
                        help="Pre-trained model")
275
    parser.add_argument("--num_labels_task",
276
                        default=None, type=int,
277
                        required=True,
278
                        help="num_labels_task")
279
    parser.add_argument("--max_seq_length",
280
                        default=128,
281
                        type=int,
282
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
283
                             "Sequences longer than this will be truncated, and sequences shorter \n"
284
                             "than this will be padded.")
285
    parser.add_argument("--do_train",
286
                        default=False,
287
                        action='store_true',
288
                        help="Whether to run training.")
289
    parser.add_argument("--do_eval",
290
                        default=False,
291
                        action='store_true',
292
                        help="Whether to run eval on the dev set.")
293
    parser.add_argument("--do_lower_case",
294
                        default=False,
295
                        action='store_true',
296
                        help="Set this flag if you are using an uncased model.")
297
    parser.add_argument("--eval_batch_size",
298
                        default=32,
299
                        type=int,
300
                        help="Total batch size for training.")
301
    parser.add_argument("--learning_rate",
302
                        default=5e-5,
303
                        type=float,
304
                        help="The initial learning rate for Adam.")
305
    parser.add_argument("--num_train_epochs",
306
                        default=3.0,
307
                        type=float,
308
                        help="Total number of training epochs to perform.")
309
    parser.add_argument("--warmup_proportion",
310
                        default=0.1,
311
                        type=float,
312
                        help="Proportion of training to perform linear learning rate warmup for. "
313
                             "E.g., 0.1 = 10%% of training.")
314
    parser.add_argument("--no_cuda",
315
                        default=False,
316
                        action='store_true',
317
                        help="Whether not to use CUDA when available")
318
    parser.add_argument("--local_rank",
319
                        type=int,
320
                        default=-1,
321
                        help="local_rank for distributed training on gpus")
322
    parser.add_argument('--seed',
323
                        type=int,
324
                        default=42,
325
                        help="random seed for initialization")
326
    parser.add_argument('--gradient_accumulation_steps',
327
                        type=int,
328
                        default=1,
329
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
330
    parser.add_argument('--fp16',
331
                        default=False,
332
                        action='store_true',
333
                        help="Whether to use 16-bit float precision instead of 32-bit")
334
    parser.add_argument('--loss_scale',
335
                        type=float, default=0,
336
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
337
                             "0 (default value): dynamic loss scaling.\n"
338
                             "Positive power of 2: static loss scaling value.\n")
339
    parser.add_argument("--weight_decay",
340
                        default=0.0,
341
                        type=float,
342
                        help="Weight decay if we apply some.")
343
    parser.add_argument("--adam_epsilon",
344
                        default=1e-8,
345
                        type=float,
346
                        help="Epsilon for Adam optimizer.")
347
    parser.add_argument("--max_grad_norm",
348
                        default=1.0,
349
                        type=float,
350
                        help="Max gradient norm.")
351
    parser.add_argument('--fp16_opt_level',
352
                        type=str,
353
                        default='O1',
354
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
355
                             "See details at https://nvidia.github.io/apex/amp.html")
356
    parser.add_argument("--task",
357
                        default=None,
358
                        type=int,
359
                        required=True,
360
                        help="Choose Task")
361
    ###############
362

363
    args = parser.parse_args()
364
    #print(args.do_train, args.do_eval)
365
    #exit()
366

367
    processors = Processor_1
368

369
    num_labels = args.num_labels_task
370

371
    if args.local_rank == -1 or args.no_cuda:
372
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
373
        n_gpu = torch.cuda.device_count()
374
    else:
375
        torch.cuda.set_device(args.local_rank)
376
        device = torch.device("cuda", args.local_rank)
377
        n_gpu = 1
378
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
379
        torch.distributed.init_process_group(backend='nccl')
380
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
381
        device, n_gpu, bool(args.local_rank != -1), args.fp16))
382

383

384

385
    if args.gradient_accumulation_steps < 1:
386
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
387
                            args.gradient_accumulation_steps))
388

389
    #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
390

391
    random.seed(args.seed)
392
    np.random.seed(args.seed)
393
    torch.manual_seed(args.seed)
394
    if n_gpu > 0:
395
        torch.cuda.manual_seed_all(args.seed)
396

397
    if not args.do_eval:
398
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
399

400
    '''
401
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
402
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
403
    '''
404
    os.makedirs(args.output_dir, exist_ok=True)
405

406

407

408
    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)
409

410

411
    train_examples = None
412
    num_train_steps = None
413
    aspect_list = None
414
    sentiment_list = None
415
    processor = processors()
416
    num_labels = num_labels
417
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)
418

419
    filenames = os.listdir(args.output_dir)
420
    filenames = [x for x in filenames if "pytorch_model.bin_" in x]
421

422
    file_mark = []
423
    for x in filenames:
424
        file_mark.append([x, True])
425
        file_mark.append([x, False])
426

427
    ####
428
    ####
429
    test_examples, aspect_list, sentiment_list = processor.get_test_examples(args.data_dir)
430
    if args.task == 1:
431
        num_labels = len(aspect_list)
432
    elif args.task == 2:
433
        num_labels = len(sentiment_list)
434
    else:
435
        print("What's task?")
436
        exit()
437
    test = convert_examples_to_features(
438
        test_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
439
    eval_examples = test_examples
440
    ###
441

442

443
    for x, mark in file_mark:
444
        print(x, mark)
445
        output_model_file = os.path.join(args.output_dir, x)
446
        print(output_model_file)
447

448
        model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
449
        model.load_state_dict(torch.load(output_model_file), strict=False)
450
        #strict False: ignore non-matching keys
451

452

453
        #param_optimizer = [para[0] for para in model.named_parameters()]
454
        #param_optimizer = [para for para in model.named_parameters()][-2]
455
        #print(param_optimizer)
456

457
        model.to(device)
458
        if mark:
459
            eval_features = dev
460
        else:
461
            eval_features = test
462

463
        logger.info("***** Running evaluation *****")
464
        logger.info("  Num examples = %d", len(eval_examples))
465
        logger.info("  Batch size = %d", args.eval_batch_size)
466

467

468
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
469
        all_attention_mask = torch.tensor([f.attention_mask for f in eval_features], dtype=torch.long)
470
        if args.task == 1:
471
            print("Excuting the task 1")
472
        elif args.task == 2:
473
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
474
        else:
475
            print("Wrong here2")
476

477
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
478

479
        if args.task == 1:
480
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids)
481
        elif args.task == 2:
482
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
483
        else:
484
            print("Wrong here1")
485

486
        if args.local_rank == -1:
487
            eval_sampler = RandomSampler(eval_data)
488
        else:
489
            eval_sampler = DistributedSampler(eval_data)
490
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
491

492
        if mark:
493
            output_eval_file = os.path.join(args.output_dir, "eval_results_{}.txt".format(x.split("_")[-1]))
494
            output_file_pred = os.path.join(args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1]))
495
            output_file_glod = os.path.join(args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1]))
496
        else:
497
            output_eval_file = os.path.join(args.output_dir, "test_results_{}.txt".format(x.split("_")[-1]))
498
            output_file_pred = os.path.join(args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1]))
499
            output_file_glod = os.path.join(args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1]))
500

501
        fpred = open(output_file_pred, "w")
502
        fgold = open(output_file_glod, "w")
503

504
        model.eval()
505
        eval_loss, eval_accuracy = 0, 0
506
        nb_eval_steps, nb_eval_examples = 0, 0
507

508

509
        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
510
            #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
511
            batch = tuple(t.to(device) for i, t in enumerate(batch))
512

513
            if args.task == 1:
514
                input_ids, attention_mask, label_ids = batch
515
            elif args.task == 2:
516
                input_ids, attention_mask, segment_ids, label_ids = batch
517
            else:
518
                print("Wrong here3")
519

520

521
            if args.task == 1:
522
                #loss, logits, hidden_states, attentions
523
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
524
                logits = output.logits
525
                tmp_eval_loss = output.loss
526
            elif args.task == 2:
527
                #loss, logits, hidden_states, attentions
528
                output = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, labels=label_ids)
529
                logits = output.logits
530
                tmp_eval_loss = output.loss
531
            else:
532
                print("Wrong!!")
533

534

535
            logits = logits.detach().cpu().numpy()
536
            label_ids = label_ids.to('cpu').numpy()
537
            tmp_eval_accuracy, pred = accuracy(logits, label_ids)
538
            for a, b in zip(pred, label_ids):
539
                fgold.write("{}\n".format(b))
540
                fpred.write("{}\n".format(a))
541

542
            eval_loss += tmp_eval_loss.mean().item()
543
            eval_accuracy += tmp_eval_accuracy
544

545
            nb_eval_examples += input_ids.size(0)
546
            nb_eval_steps += 1
547

548
        eval_loss = eval_loss / nb_eval_steps
549
        eval_accuracy = eval_accuracy / nb_eval_examples
550

551
        result = {'eval_loss': eval_loss,
552
                  'eval_accuracy': eval_accuracy
553
                  }
554

555
        with open(output_eval_file, "w") as writer:
556
            logger.info("***** Eval results *****")
557
            for key in sorted(result.keys()):
558
                logger.info("  %s = %s", key, str(result[key]))
559
                writer.write("%s = %s\n" % (key, str(result[key])))
560

561

562

563
if __name__ == "__main__":
564
    main()
565

566

567
CSS-LM

Использование cookies