import argparse
import logging
import random
import numpy as np
import os
import json
import math

import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
#from transformers.modeling_roberta import RobertaForMaskedLMDomainTask
from transformers.modeling_roberta_updateRep import RobertaForMaskedLMDomainTask
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels), outputs
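# A minimal usage sketch of accuracy() (values are illustrative only):
#   out = np.array([[0.1, 0.9], [0.8, 0.2]]); labels = np.array([1, 1])
#   accuracy(out, labels)  ->  (1, array([1, 0]))
# i.e. it returns the number of correct argmax predictions and the predictions themselves.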

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids=None, attention_mask=None, segment_ids=None, label_id=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, sentence, aspect, sentiment=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            sentence: string. The untokenized text of the sentence.
            aspect: string. The aspect (domain) label of the example.
            sentiment: (Optional) string. The sentiment label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.sentence = sentence
        self.aspect = aspect
        self.sentiment = sentiment

class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_json(cls, input_file):
        with open(input_file, "r", encoding='utf-8') as f:
            return json.loads(f.read())

class Processor_1(DataProcessor):
    """Processor for the aspect/sentiment classification data set (JSON format)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "train.json")), "train")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_dev_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "dev.json")), "dev")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_test_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "test.json")), "test")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_labels(self):
        """Unused; the label lists are derived from the data itself."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)

            sentence = line["sentence"]
            aspect = line["aspect"]
            sentiment = line["sentiment"]

            examples.append(
                InputExample(guid=guid, sentence=sentence, aspect=aspect, sentiment=sentiment))
        return examples
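
# Expected input format, inferred from _create_examples: each of train.json / dev.json /
# test.json is a JSON list of objects with "sentence", "aspect" and "sentiment" keys.
# A hypothetical example (values are made up for illustration):
#   [{"sentence": "The battery lasts all day.", "aspect": "laptop", "sentiment": "positive"}]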

def convert_examples_to_features(examples, aspect_list, sentiment_list, max_seq_length, tokenizer, task_n):

    """Loads a data file into a list of `InputBatch`s."""

    #Task_1: sentence --> aspect
    #Task_2: aspect+sentence --> sentiment
    if task_n == 1:
        label_list = sorted(aspect_list)
    elif task_n == 2:
        label_list = sorted(sentiment_list)
    else:
        print("Wrong task")
    '''
    for w in label_list:
        print(w, tokenizer.encode(w))
    exit()
    '''
    label_map = {label: i for i, label in enumerate(label_list)}
    print("=======")
    print(label_map)
    print("=======")

    features = []
    for (ex_index, example) in enumerate(examples):

        #Add new special tokens
        '''
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')
        special_tokens_dict = {'cls_token': '<CLS>'}
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))
        '''

        '''
        print(tokenizer.all_special_tokens)
        print(tokenizer.encode(tokenizer.all_special_tokens))
        #['[PAD]', '[SEP]', '[CLS]', '[MASK]', '[UNK]']
        #[ 0, 102, 101, 103, 100]
        '''

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.

        # tokenizer.encode already adds the special tokens (<s>/</s> for RoBERTa).
        input_ids = tokenizer.encode(example.sentence, add_special_tokens=True)
        if len(input_ids) > max_seq_length:
            # Truncate and re-append the </s> token (id 2 in the RoBERTa vocabulary).
            input_ids = input_ids[:max_seq_length-1] + [2]
        segment_ids = [0] * len(input_ids)

        '''
        if task_n == 2:
            #"[SEP]"
            input_ids += input_ids + [102]
            #sentiment: word (Next sentence)
            #segment_ids += [1] * (len(tokens_b) + 1)
        '''

        # The attention mask has 1 for real tokens (including special tokens)
        # and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        # RoBERTa's <pad> token id is 1.
        padding_id = [1] * (max_seq_length - len(input_ids))
        input_ids += padding_id
        attention_mask += padding
        segment_ids += padding

        try:
            assert len(input_ids) == max_seq_length
            assert len(attention_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
        except AssertionError:
            # Skip malformed examples instead of aborting the whole conversion.
            continue

        if task_n == 1:
            label_id = label_map[example.aspect]
        elif task_n == 2:
            label_id = label_map[example.sentiment]
        else:
            print("Wrong task")

        if task_n == 1:
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              segment_ids=None,
                              label_id=label_id))
        elif task_n == 2:
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
        else:
            print("Wrong in convert_examples")

    return features
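
# Sketch of one resulting feature for max_seq_length=8 (token ids are hypothetical;
# RoBERTa uses 0 for <s>, 2 for </s> and 1 for <pad>):
#   input_ids      = [0, 713, 16, 372, 2, 1, 1, 1]
#   attention_mask = [1, 1,   1,  1,   1, 0, 0, 0]
#   segment_ids    = [0] * 8, label_id = label_map[aspect or sentiment]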

def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .json files (or other data files) for the task.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None, type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level',
                        type=str,
                        default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None,
                        type=int,
                        required=True,
                        help="Choose Task")
    ###############

    args = parser.parse_args()
    #print(args.do_train, args.do_eval)
    #exit()

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError("`do_eval` must be True: this script only runs evaluation.")

    '''
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    '''
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    num_labels = num_labels
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)

    filenames = os.listdir(args.output_dir)
    filenames = [x for x in filenames if "pytorch_model.bin_" in x]
    print(filenames)

    file_mark = []
    model_performance_dev = dict()
    model_performance_test = dict()
    for x in filenames:
        file_mark.append([x, True])
        file_mark.append([x, False])

    ####
    ####
    train_examples, aspect_list, sentiment_list = processor.get_test_examples(args.data_dir)
    test_examples, _, _ = processor.get_test_examples(args.data_dir)
    eval_examples, _, _ = processor.get_dev_examples(args.data_dir)
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        print("What's task?")
        exit()
    test = convert_examples_to_features(
        test_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)

    dev = convert_examples_to_features(
        eval_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
    ###

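    # Each saved checkpoint is evaluated twice below: once on the dev split (mark=True)
    # and once on the test split (mark=False), as set up in file_mark above.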
    for x, mark in file_mark:
        print(x, mark)
        output_model_file = os.path.join(args.output_dir, x)

        #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
        model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
        model.load_state_dict(torch.load(output_model_file), strict=False)
        #strict=False: ignore non-matching keys

        #param_optimizer = [para[0] for para in model.named_parameters()]
        #param_optimizer = [para for para in model.named_parameters()][-2]
        #print(param_optimizer)

        model.to(device)
        if mark:
            eval_features = dev
        else:
            eval_features = test

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in eval_features], dtype=torch.long)
        if args.task == 1:
            print("Executing task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        else:
            print("Wrong here2")

        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        if args.task == 1:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids)
        elif args.task == 2:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            eval_sampler = RandomSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        if mark:
            output_eval_file = os.path.join(args.output_dir, "eval_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1]))
            output_file_gold = os.path.join(args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1]))
        else:
            output_eval_file = os.path.join(args.output_dir, "test_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1]))
            output_file_gold = os.path.join(args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1]))

        fpred = open(output_file_pred, "w")
        fgold = open(output_file_gold, "w")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
            #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
            batch = tuple(t.to(device) for t in batch)

            if args.task == 1:
                input_ids, attention_mask, label_ids = batch
            elif args.task == 2:
                input_ids, attention_mask, segment_ids, label_ids = batch
            else:
                print("Wrong here3")

            if args.task == 1:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                #logits = output.logits
                #tmp_eval_loss = output.loss
            elif args.task == 2:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                #exit()
                #logits = output.logits
                #tmp_eval_loss = output.loss
            else:
                print("Wrong!!")

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy, pred = accuracy(logits, label_ids)
            for a, b in zip(pred, label_ids):
                fgold.write("{}\n".format(b))
                fpred.write("{}\n".format(a))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Loss is averaged per batch, accuracy per example.
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy
                  }

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        #if mark and step > int(math.ceil(len(eval_examples)/args.eval_batch_size)):
        if mark:
            model_performance_dev[x] = eval_accuracy
        else:
            model_performance_test[x] = eval_accuracy

    #################
    #################
    #####dev#########
    model_name_best = None
    score_best = 0
    for model_name, score in model_performance_dev.items():
        if score > score_best:
            score_best = score
            model_name_best = model_name

    model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
    model_name_best = os.path.join(args.output_dir, model_name_best)
    model.load_state_dict(torch.load(model_name_best), strict=False)
    # Save a trained model
    logger.info("***** Saving fine-tuned model *****")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_dev_best")
    torch.save(model_to_save.state_dict(), output_model_file)

    #################
    #################
    #####test########
    model_name_best = None
    score_best = 0
    for model_name, score in model_performance_test.items():
        if score > score_best:
            score_best = score
            model_name_best = model_name

    model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
    model_name_best = os.path.join(args.output_dir, model_name_best)
    model.load_state_dict(torch.load(model_name_best), strict=False)
    # Save a trained model
    logger.info("***** Saving fine-tuned model *****")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_test_best")
    torch.save(model_to_save.state_dict(), output_model_file)

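# A hypothetical invocation of this evaluation script (script name and paths are assumptions;
# the flag names come from the argparse setup above):
#   python eval_classification.py \
#       --data_dir data/restaurant \
#       --output_dir output/restaurant \
#       --pretrain_model roberta-base \
#       --num_labels_task 3 \
#       --max_seq_length 128 \
#       --eval_batch_size 32 \
#       --task 2 \
#       --do_eval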
if __name__ == "__main__":
    main()