CSS-LM / finetune_roberta.py (568 lines · 22.8 KB)

import argparse
import logging
import random
import numpy as np
import os
import json
import sys

import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

#outdomain = torch.load("/data1/private/suyusheng/task_selecte/data/opendomain_CLS.pt")
#print((sys.getsizeof(outdomain)/1024*1024), "MB") #72MB
#torch.topk(input, k, dim=None, largest=True, sorted=True, out=None)
#NlogN
#torch.max

#exit()


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids=None, attention_mask=None, segment_ids=None, label_id=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, sentence, aspect, sentiment=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            sentence: string. The untokenized review sentence.
            aspect: string. The aspect label of the sentence.
            sentiment: (Optional) string. The sentiment label. Should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.sentence = sentence
        self.aspect = aspect
        self.sentiment = sentiment

class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_json(cls, input_file):
        with open(input_file, "r", encoding='utf-8') as f:
            return json.loads(f.read())

class Processor_1(DataProcessor):
    """Processor for the aspect/sentiment classification data set (JSON format)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "train.json")), "train")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_dev_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "dev.json")), "dev")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "test.json")), "test")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_labels(self):
        """Unused."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)

            sentence = line["sentence"]
            aspect = line["aspect"]
            sentiment = line["sentiment"]

            examples.append(
                InputExample(guid=guid, sentence=sentence, aspect=aspect, sentiment=sentiment))
        return examples
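
# Illustrative input format (an assumption based on the keys read in `_create_examples`):
# each of train.json / dev.json / test.json is expected to hold a JSON list of objects like
#   {"sentence": "The battery life is great.", "aspect": "battery", "sentiment": "positive"}
# The concrete aspect and sentiment label strings depend on the data set being used.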


def convert_examples_to_features(examples, aspect_list, sentiment_list, max_seq_length, tokenizer, task_n):
    """Loads a data file into a list of `InputFeatures`."""

    #Task_1: sentence --> aspect
    #Task_2: aspect+sentence --> sentiment
    if task_n == 1:
        label_list = sorted(aspect_list)
    elif task_n == 2:
        label_list = sorted(sentiment_list)
    else:
        raise ValueError("Unknown task: {}".format(task_n))
    '''
    for w in label_list:
        print(w, tokenizer.encode(w))
    exit()
    '''
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):

        #Add new special tokens
        '''
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')
        special_tokens_dict = {'cls_token': '<CLS>'}
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))
        '''

        '''
        print(tokenizer.all_special_tokens)
        print(tokenizer.encode(tokenizer.all_special_tokens))
        #['[PAD]', '[SEP]', '[CLS]', '[MASK]', '[UNK]']
        #[ 0, 102, 101, 103, 100]
        '''

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.

        # encode() already adds the special tokens (<s> ... </s> for RoBERTa, ids 0 and 2).
        input_ids = tokenizer.encode(example.sentence, add_special_tokens=True,
                                     max_length=max_seq_length, truncation=True)
        segment_ids = [0] * len(input_ids)

        if task_n == 2:
            # Append the aspect as a second segment so the input is "<s> sentence </s> </s> aspect </s>".
            aspect_ids = tokenizer.encode(example.aspect, add_special_tokens=False)
            input_ids += [tokenizer.sep_token_id] + aspect_ids + [tokenizer.sep_token_id]
            segment_ids += [1] * (len(aspect_ids) + 2)
            if len(input_ids) > max_seq_length:
                input_ids = input_ids[:max_seq_length]
                segment_ids = segment_ids[:max_seq_length]

        # The attention mask is simply an array of 1s and 0s indicating which tokens
        # are padding and which aren't (including special tokens).
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        attention_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(attention_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if task_n == 1:
            label_id = label_map[example.aspect]
        elif task_n == 2:
            label_id = label_map[example.sentiment]
        else:
            raise ValueError("Unknown task: {}".format(task_n))

        if task_n == 1:
            features.append(
                    InputFeatures(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  segment_ids=None,
                                  label_id=label_id))
        elif task_n == 2:
            features.append(
                    InputFeatures(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  segment_ids=segment_ids,
                                  label_id=label_id))
        else:
            raise ValueError("Unknown task: {}".format(task_n))

    return features
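
# A minimal usage sketch (assumes a `Processor_1` instance, a RobertaTokenizer, and a data
# directory as set up in main() below; values are illustrative):
#   examples, aspect_list, sentiment_list = processor.get_train_examples("./data")
#   feats = convert_examples_to_features(examples, aspect_list, sentiment_list,
#                                        max_seq_length=128, tokenizer=tokenizer, task_n=1)
#   feats[0].input_ids       # list of 128 token ids, zero-padded
#   feats[0].attention_mask  # 1 for real tokens, 0 for padding
#   feats[0].label_id        # index into sorted(aspect_list)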


def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .json files for the task.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--pretrain_model",
                        default='roberta-base',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None,
                        type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level',
                        type=str,
                        default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None,
                        type=int,
                        required=True,
                        help="Which task to run: 1 (sentence -> aspect) or 2 (aspect+sentence -> sentiment)")
    ###############

    args = parser.parse_args()
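
    # Example invocation (illustrative; paths, label count, and model name are placeholders):
    #   python finetune_roberta.py \
    #       --data_dir ./data/restaurant \
    #       --output_dir ./output_task1 \
    #       --pretrain_model roberta-base \
    #       --num_labels_task 5 \
    #       --task 1 \
    #       --do_train \
    #       --max_seq_length 128 \
    #       --train_batch_size 32 \
    #       --learning_rate 5e-5 \
    #       --num_train_epochs 3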

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("`do_train` must be set: this script only implements training.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)

    # The number of labels is taken from the data itself, depending on the task.
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        raise ValueError("Unknown task: {}".format(args.task))

    num_train_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model,
                                                             num_labels=num_labels,
                                                             output_hidden_states=False,
                                                             output_attentions=False,
                                                             return_dict=True)

    # Prepare optimizer
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    #no_decay = ['bias', 'LayerNorm.weight']
    # Leftover from a BERT/ERNIE variant of this script; these names match no RoBERTa
    # parameters, so the filter below is effectively a no-op here.
    no_grad = ['bert.encoder.layer.11.output.dense_ent', 'bert.encoder.layer.11.output.LayerNorm_ent']
    param_optimizer = [(n, p) for n, p in param_optimizer if not any(nd in n for nd in no_grad)]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(t_total * args.warmup_proportion),
                                                num_training_steps=t_total)
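    # For example (illustrative numbers): with 3,200 training examples, train_batch_size=32,
    # gradient_accumulation_steps=1 and num_train_epochs=3, num_train_steps = 3200/32/1*3 = 300.
    # With the default warmup_proportion of 0.1, the learning rate warms up linearly over the
    # first 30 optimizer steps and then decays linearly to zero at step 300.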
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
        if args.task == 1:
            print("Executing task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        else:
            raise ValueError("Unknown task: {}".format(args.task))

        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        if args.task == 1:
            train_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids)
        elif args.task == 2:
            train_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
        else:
            raise ValueError("Unknown task: {}".format(args.task))

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
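        # Each batch drawn from train_dataloader is a tuple of tensors shaped
        # [train_batch_size, max_seq_length] (labels are [train_batch_size]):
        #   task 1 -> (input_ids, attention_mask, label_ids)
        #   task 2 -> (input_ids, attention_mask, segment_ids, label_ids)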

        output_loss_file = os.path.join(args.output_dir, "loss")
        loss_fout = open(output_loss_file, 'w')
        model.train()

        ##########Pre-process#########
        ##############################

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # Move every tensor in the batch (including the labels) to the training device.
                batch = tuple(t.to(device) for t in batch)

                if args.task == 1:
                    input_ids, attention_mask, label_ids = batch
                elif args.task == 2:
                    input_ids, attention_mask, segment_ids, label_ids = batch
                else:
                    raise ValueError("Unknown task: {}".format(args.task))

                if args.task == 1:
                    #loss, logits, hidden_states, attentions
                    output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                    loss = output.loss
                elif args.task == 2:
                    #loss, logits, hidden_states, attentions
                    # RoBERTa has no token_type_ids, so segment_ids are not passed to the model.
                    #output = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, labels=label_ids)
                    output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                    loss = output.loss
                else:
                    raise ValueError("Unknown task: {}".format(args.task))

                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    ###
                    #optimizer.backward(loss)
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    ###
                else:
                    loss.backward()

                loss_fout.write("{}\n".format(loss.item()))
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Clip gradients, take an optimizer step and advance the warmup/decay schedule.
                    ###
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1
                    ###
            # Save a checkpoint at the end of each epoch.
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_{}".format(global_step))
            torch.save(model_to_save.state_dict(), output_model_file)

        loss_fout.close()

        # Save a trained model
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)
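        # To reload the saved weights later for evaluation (illustrative sketch; uses the same
        # num_labels the model was trained with):
        #   model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels)
        #   model.load_state_dict(torch.load(os.path.join(args.output_dir, "pytorch_model.bin")))
        #   model.eval()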


if __name__ == "__main__":
    main()