# CSS-LM
import argparse
import logging
import random
import numpy as np
import os
import json

import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
#from transformers.modeling_roberta import RobertaForMaskedLMDomainTask
from transformers.modeling_roberta_updateRep import RobertaForMaskedLMDomainTask
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels), outputs
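
# Illustrative example (not part of the original script):
#   >>> out = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
#   >>> labels = np.array([1, 0, 0])
#   >>> accuracy(out, labels)
#   (2, array([1, 0, 1]))
# i.e. the count of correct argmax predictions plus the predicted class ids.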
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids=None, attention_mask=None, segment_ids=None, label_id=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

class InputExample(object):
    """A single training/test example for simple sequence classification."""
    def __init__(self, guid, sentence, aspect, sentiment=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            sentence: string. The untokenized text of the sentence.
            aspect: string. The aspect label of the example.
            sentiment: (Optional) string. The sentiment label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.sentence = sentence
        self.aspect = aspect
        self.sentiment = sentiment

class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_json(cls, input_file):
        with open(input_file, "r", encoding='utf-8') as f:
            return json.loads(f.read())

class Processor_1(DataProcessor):
    """Processor for aspect/sentiment classification data stored as JSON."""

    def get_train_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "train.json")), "train")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_dev_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "dev.json")), "dev")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_test_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "test.json")), "test")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_labels(self):
        """Not used; the label sets are derived from the data instead."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)

            sentence = line["sentence"]
            aspect = line["aspect"]
            sentiment = line["sentiment"]

            examples.append(
                InputExample(guid=guid, sentence=sentence, aspect=aspect, sentiment=sentiment))
        return examples

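# The processor above assumes each of train.json / dev.json / test.json holds a
# list of records with "sentence", "aspect" and "sentiment" fields, e.g.
# (hypothetical record, not taken from the repo's data):
#   [
#     {"sentence": "The pasta was cold.", "aspect": "food", "sentiment": "negative"},
#     ...
#   ]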
def convert_examples_to_features(examples, aspect_list, sentiment_list, max_seq_length, tokenizer, task_n):

    """Loads a data file into a list of `InputBatch`s."""

    # Task_1: sentence --> aspect
    # Task_2: aspect+sentence --> sentiment
    if task_n == 1:
        label_list = sorted(aspect_list)
    elif task_n == 2:
        label_list = sorted(sentiment_list)
    else:
        print("Wrong task")
    '''
    for w in label_list:
        print(w, tokenizer.encode(w))
    exit()
    '''
    label_map = {label : i for i, label in enumerate(label_list)}
    print("=======")
    print(label_map)
    print("=======")


    features = []
    for (ex_index, example) in enumerate(examples):
        # Add new special tokens
        '''
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')
        special_tokens_dict = {'cls_token': '<CLS>'}
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))
        '''

        '''
        print(tokenizer.all_special_tokens)
        print(tokenizer.encode(tokenizer.all_special_tokens))
        #['[PAD]', '[SEP]', '[CLS]', '[MASK]', '[UNK]']
        #[ 0, 102, 101, 103, 100]
        '''

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0        0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.

        ###
        # encode() already adds the special tokens. For the RoBERTa tokenizer used
        # here these are <s> (id 0) and </s> (id 2) rather than BERT's [CLS]/[SEP]
        # (ids 101, 102). Note that no truncation is applied here, so sentences
        # longer than max_seq_length will trip the length asserts below.
        input_ids = tokenizer.encode(example.sentence, add_special_tokens=True)
        segment_ids = [0] * len(input_ids)

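        # Illustrative check (assumes the roberta-base vocabulary; not part of
        # the original script):
        #   >>> ids = tokenizer.encode("the food was great", add_special_tokens=True)
        #   >>> ids[0], ids[-1]        # <s> and </s>
        #   (0, 2)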
        '''
        if task_n==2:
            #"[SEP]"
            input_ids += input_ids + [102]
            #sentiment: word (Next sentence)
            #segment_ids += [1] * (len(tokens_b) + 1)
        '''

        # The attention mask is simply an array of 1s and 0s indicating which
        # tokens are padding and which aren't (including special tokens).
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        attention_mask += padding
        segment_ids += padding
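
        # Illustrative example (not part of the original script): with
        # max_seq_length = 8 and an encoded sentence [0, a, b, c, 2]
        # (<s> ... </s>), the padded features become
        #   input_ids      = [0, a, b, c, 2, 0, 0, 0]
        #   attention_mask = [1, 1, 1, 1, 1, 0, 0, 0]
        #   segment_ids    = [0, 0, 0, 0, 0, 0, 0, 0]
        # RoBERTa's actual <pad> id is 1, not 0, but the attention mask keeps the
        # padded positions from being attended to either way.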

        assert len(input_ids) == max_seq_length
        assert len(attention_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if task_n == 1:
            label_id = label_map[example.aspect]
        elif task_n == 2:
            label_id = label_map[example.sentiment]
        else:
            print("Wrong task")

        if task_n == 1:
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              segment_ids=None,
                              label_id=label_id))
        elif task_n == 2:
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
        else:
            print("Wrong in convert_examples")

    return features

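# Hypothetical usage sketch (mirrors what main() below does; the data path and
# parameter values are placeholders, not from the repo):
#   tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   processor = Processor_1()
#   examples, aspects, sentiments = processor.get_test_examples("./data")
#   features = convert_examples_to_features(examples, aspects, sentiments,
#                                            max_seq_length=128,
#                                            tokenizer=tokenizer, task_n=1)
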
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None, type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level',
                        type=str,
                        default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None,
                        type=int,
                        required=True,
                        help="Choose Task")
    ###############
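
    # Hypothetical invocation (script name, paths and label counts are
    # placeholders, not from the repo):
    #   python finetune_roberta_eval.py \
    #       --data_dir ./data --output_dir ./out \
    #       --pretrain_model roberta-base \
    #       --num_labels_task 3 --task 1 --do_eval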
    args = parser.parse_args()
    #print(args.do_train, args.do_eval)
    #exit()

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))



    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_eval:
        raise ValueError("This script only runs evaluation, so `do_eval` must be True.")

    '''
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    '''
    os.makedirs(args.output_dir, exist_ok=True)



    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    num_labels = num_labels
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)

    filenames = os.listdir(args.output_dir)
    filenames = [x for x in filenames if "pytorch_model.bin_" in x]
    print(filenames)

    file_mark = []
    model_performace = dict()
    for x in filenames:
        file_mark.append([x, True])
        #file_mark.append([x, False])

    ####
    ####
    test_examples, aspect_list, sentiment_list = processor.get_test_examples(args.data_dir)
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        print("What's task?")
        exit()
    dev = convert_examples_to_features(
        test_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
    eval_examples = test_examples
    ###

    for x, mark in file_mark:
        print(x, mark)
        output_model_file = os.path.join(args.output_dir, x)

        #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
        model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
        model.load_state_dict(torch.load(output_model_file), strict=False)
        #strict False: ignore non-matching keys

        #param_optimizer = [para[0] for para in model.named_parameters()]
        #param_optimizer = [para for para in model.named_parameters()][-2]
        #print(param_optimizer)

        model.to(device)
        if mark:
            eval_features = dev
        else:
            # Note: `test` is never defined in this script; this branch is only
            # reachable if the commented-out `file_mark.append([x, False])` above
            # is re-enabled.
            eval_features = test
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in eval_features], dtype=torch.long)
        if args.task == 1:
            print("Executing task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        else:
            print("Wrong here2")

        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        if args.task == 1:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids)
        elif args.task == 2:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            eval_sampler = RandomSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
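
        # For reference: with task 1 each batch yielded below is a 3-tuple of
        # tensors shaped (batch_size, max_seq_length) for input_ids and
        # attention_mask and (batch_size,) for label_ids; task 2 additionally
        # carries segment_ids with shape (batch_size, max_seq_length).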
        if mark:
            output_eval_file = os.path.join(args.output_dir, "eval_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1]))
            output_file_glod = os.path.join(args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1]))
        else:
            output_eval_file = os.path.join(args.output_dir, "test_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1]))
            output_file_glod = os.path.join(args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1]))

        fpred = open(output_file_pred, "w")
        fgold = open(output_file_glod, "w")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
            #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
            batch = tuple(t.to(device) for i, t in enumerate(batch))

            if args.task == 1:
                input_ids, attention_mask, label_ids = batch
            elif args.task == 2:
                input_ids, attention_mask, segment_ids, label_ids = batch
            else:
                print("Wrong here3")

            if args.task == 1:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                #logits = output.logits
                #tmp_eval_loss = output.loss
            elif args.task == 2:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                #exit()
                #logits = output.logits
                #tmp_eval_loss = output.loss
            else:
                print("Wrong!!")

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy, pred = accuracy(logits, label_ids)
            for a, b in zip(pred, label_ids):
                fgold.write("{}\n".format(b))
                fpred.write("{}\n".format(a))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy
                  }

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # Close the per-checkpoint prediction/gold files before moving on.
        fpred.close()
        fgold.close()

        model_performace[x] = eval_accuracy
    #################
    #################
    model_name_best = 0
    score_best = 0
    for model_name, score in model_performace.items():
        if score > score_best:
            score_best = score
            model_name_best = model_name

    model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
    model_name_best = os.path.join(args.output_dir, model_name_best)
    model.load_state_dict(torch.load(model_name_best), strict=False)
    # Save a trained model
    logger.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_best")
    torch.save(model_to_save.state_dict(), output_model_file)

if __name__ == "__main__":
    main()