# CSS-LM
import argparse
import logging
import random
import numpy as np
import os
import json
import math

import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
#from transformers.modeling_roberta import RobertaForMaskedLMDomainTask
from transformers.modeling_roberta_updateRep_self import RobertaForMaskedLMDomainTask
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels), outputs
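
# Quick illustration of accuracy() (hypothetical values, not from the datasets
# this script reads): for out = [[0.2, 0.8], [0.9, 0.1]] and labels = [1, 0],
# np.argmax(out, axis=1) is [1, 0], so the call returns (2, array([1, 0])),
# i.e. the number of correct predictions plus the argmax predictions.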

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids=None, attention_mask=None, segment_ids=None, label_id=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, sentence, aspect, sentiment=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            sentence: string. The untokenized text of the sequence.
            aspect: string. The aspect label of the example.
            sentiment: (Optional) string. The sentiment label. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.sentence = sentence
        self.aspect = aspect
        self.sentiment = sentiment


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_json(cls, input_file):
        with open(input_file, "r", encoding='utf-8') as f:
            return json.loads(f.read())

class Processor_1(DataProcessor):
    """Processor for the aspect/sentiment classification data set (JSON files)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "train.json")), "train")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_dev_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "dev.json")), "dev")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_test_examples(self, data_dir):
        """See base class."""
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, "test.json")), "test")
        aspect = set([x.aspect for x in examples])
        sentiment = set([x.sentiment for x in examples])
        return examples, list(aspect), list(sentiment)

    def get_labels(self):
        """Unused; kept for the `DataProcessor` interface."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            sentence = line["sentence"]
            aspect = line["aspect"]
            sentiment = line["sentiment"]
            examples.append(
                InputExample(guid=guid, sentence=sentence, aspect=aspect, sentiment=sentiment))
        return examples
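
# Expected data layout (an illustrative sketch; the field values here are made
# up, only the keys are what _create_examples actually reads): train.json,
# dev.json, and test.json each hold a JSON list such as
#   [{"sentence": "The pasta was great.", "aspect": "food",
#     "sentiment": "positive"}, ...]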

def convert_examples_to_features(examples, aspect_list, sentiment_list, max_seq_length, tokenizer, task_n):
    """Loads a data file into a list of `InputFeatures`s."""

    # Task 1: sentence --> aspect
    # Task 2: aspect+sentence --> sentiment
    if task_n == 1:
        label_list = sorted(aspect_list)
    elif task_n == 2:
        label_list = sorted(sentiment_list)
    else:
        raise ValueError("Wrong task: {}".format(task_n))

    label_map = {label: i for i, label in enumerate(label_list)}
    print("=======")
    print(label_map)
    print("=======")

    features = []
    for (ex_index, example) in enumerate(examples):
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
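        # RoBERTa analogue (an illustrative sketch; the token ids below assume
        # the standard roberta-base vocabulary, where <s>=0, </s>=2, <pad>=1):
        #   tokenizer.encode("the dog is hairy.", add_special_tokens=True)
        # returns ids shaped like [0, ..., 2], i.e. <s> tokens </s>. RoBERTa
        # does not use token_type_ids the way BERT does, which is why the
        # segment_ids built below can safely be all zeros.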
        # tokenizer.encode with add_special_tokens=True already adds the
        # special tokens: <s> (id 0) at the start and </s> (id 2) at the end.
        input_ids = tokenizer.encode(example.sentence, add_special_tokens=True)
        if len(input_ids) > max_seq_length:
            # Truncate and re-append the </s> token (id 2).
            input_ids = input_ids[:max_seq_length-1] + [2]
        segment_ids = [0] * len(input_ids)

        # The attention mask has 1 for real tokens (including special tokens)
        # and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Pad up to the sequence length. RoBERTa's <pad> token id is 1, while
        # the attention mask and segment ids are padded with 0.
        padding = [0] * (max_seq_length - len(input_ids))
        padding_id = [1] * (max_seq_length - len(input_ids))
        input_ids += padding_id
        attention_mask += padding
        segment_ids += padding
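        # Worked example (hypothetical ids, assuming max_seq_length=8 and an
        # encoded 5-token sentence [0, 31, 32, 33, 2]):
        #   input_ids      -> [0, 31, 32, 33, 2, 1, 1, 1]
        #   attention_mask -> [1,  1,  1,  1, 1, 0, 0, 0]
        #   segment_ids    -> [0,  0,  0,  0, 0, 0, 0, 0]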
        # Skip the (rare) example whose lengths do not line up.
        try:
            assert len(input_ids) == max_seq_length
            assert len(attention_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
        except AssertionError:
            continue

        if task_n == 1:
            label_id = label_map[example.aspect]
        elif task_n == 2:
            label_id = label_map[example.sentiment]

        if task_n == 1:
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              segment_ids=None,
                              label_id=label_id))
        elif task_n == 2:
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))

    return features
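
# Minimal usage sketch (illustrative; "data/" is a hypothetical directory laid
# out as described above, and "roberta-base" stands in for --pretrain_model):
#
#   tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   processor = Processor_1()
#   examples, aspects, sentiments = processor.get_train_examples("data/")
#   feats = convert_examples_to_features(examples, aspects, sentiments,
#                                        max_seq_length=128,
#                                        tokenizer=tokenizer, task_n=2)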


def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .json files for the task.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None,
                        type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level',
                        type=str,
                        default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=2,
                        type=int,
                        required=True,
                        help="Choose task: 1 = aspect, 2 = sentiment")
    parser.add_argument("--choose_eval_test_both",
                        default=2,
                        type=int,
                        help="0 = dev only, 1 = test only, 2 = both")
    ###############
    args = parser.parse_args()

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        print(n_gpu)
        print(device)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError("`do_eval` must be True: this script only runs evaluation.")

    os.makedirs(args.output_dir, exist_ok=True)
    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()

    filenames = os.listdir(args.output_dir)
    filenames = [x for x in filenames if "pytorch_model.bin_" in x]
    print(filenames)

    file_mark = []
    model_performance_dev = dict()
    model_performance_test = dict()
    for x in filenames:
        # mark=True evaluates on dev, mark=False on test
        # (see the eval_features selection in the loop below).
        if args.choose_eval_test_both == 0:
            # dev only
            file_mark.append([x, True])
        elif args.choose_eval_test_both == 1:
            # test only
            file_mark.append([x, False])
        else:
            # both
            file_mark.append([x, True])
            file_mark.append([x, False])
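    # Illustration (hypothetical filename): with --choose_eval_test_both 2 and
    # a single checkpoint "pytorch_model.bin_100" in output_dir, file_mark
    # becomes [["pytorch_model.bin_100", True], ["pytorch_model.bin_100", False]],
    # i.e. that checkpoint is scored once on dev and once on test.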
    # Note: the aspect/sentiment label lists are read from the test split here.
    train_examples, aspect_list, sentiment_list = processor.get_test_examples(args.data_dir)
    test_examples, _, _ = processor.get_test_examples(args.data_dir)
    eval_examples, _, _ = processor.get_dev_examples(args.data_dir)
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        raise ValueError("Unknown --task: {}".format(args.task))
    test = convert_examples_to_features(
        test_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
    dev = convert_examples_to_features(
        eval_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)

    for x, mark in file_mark:
        # mark: True = dev split, False = test split
        print(x, mark)
        output_model_file = os.path.join(args.output_dir, x)

        #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
        model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
        # strict=False: ignore checkpoint keys that do not match the model.
        model.load_state_dict(torch.load(output_model_file), strict=False)
        model.to(device)

        #######################################
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        no_grad = ['bert.encoder.layer.11.output.dense_ent', 'bert.encoder.layer.11.output.LayerNorm_ent']
        param_optimizer = [(n, p) for n, p in param_optimizer if not any(nd in n for nd in no_grad)]
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        #scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(t_total*0.1), num_training_steps=t_total)
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Distributed training (should be after apex fp16 initialization)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                              output_device=args.local_rank,
                                                              find_unused_parameters=True)
        #######################################
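        # How the grouping above plays out (illustrative parameter names; the
        # actual names depend on the checkpoint): a parameter such as
        # 'roberta.encoder.layer.0.attention.output.LayerNorm.weight' matches
        # 'LayerNorm.weight' in no_decay and is optimized with weight_decay=0.0,
        # while one like '...attention.self.query.weight' matches nothing in
        # no_decay and falls in the group decayed by args.weight_decay.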

        if mark:
            eval_features = dev
        else:
            eval_features = test

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in eval_features], dtype=torch.long)
        if args.task == 1:
            # Task 1 uses no segment ids.
            print("Executing task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        if args.task == 1:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids)
        elif args.task == 2:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)

        # Note: RandomSampler shuffles the evaluation order, so the pred/gold
        # files below are written in a shuffled (but mutually consistent) order.
        if args.local_rank == -1:
            eval_sampler = RandomSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        suffix = x.split("_")[-1]
        if mark:
            output_eval_file = os.path.join(args.output_dir, "eval_results_{}.txt".format(suffix))
            output_file_pred = os.path.join(args.output_dir, "eval_pred_{}.txt".format(suffix))
            output_file_gold = os.path.join(args.output_dir, "eval_gold_{}.txt".format(suffix))
        else:
            output_eval_file = os.path.join(args.output_dir, "test_results_{}.txt".format(suffix))
            output_file_pred = os.path.join(args.output_dir, "test_pred_{}.txt".format(suffix))
            output_file_gold = os.path.join(args.output_dir, "test_gold_{}.txt".format(suffix))

        fpred = open(output_file_pred, "w")
        fgold = open(output_file_gold, "w")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)

            if args.task == 1:
                input_ids, attention_mask, label_ids = batch
            elif args.task == 2:
                input_ids, attention_mask, segment_ids, label_ids = batch

            with torch.no_grad():
                # RobertaForMaskedLMDomainTask returns (loss, logits) when
                # called with func="task_class"; the call is identical for
                # both tasks since segment ids are not passed to the model.
                tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy, pred = accuracy(logits, label_ids)
            for a, b in zip(pred, label_ids):
                fgold.write("{}\n".format(b))
                fpred.write("{}\n".format(a))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        fpred.close()
        fgold.close()

        # Average the per-step mean losses; divide total correct predictions
        # by the number of examples.
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
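        # Illustrative arithmetic (hypothetical numbers): with 100 examples and
        # eval_batch_size=32 there are 4 steps; if 87 predictions are correct,
        # eval_accuracy = 87 / 100 = 0.87, and eval_loss is the mean of the
        # 4 per-step mean losses.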
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy}

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        if mark:
            model_performance_dev[x] = eval_accuracy
        else:
            model_performance_test[x] = eval_accuracy

    #################
    # Pick the checkpoint with the best dev accuracy and save it.
    if args.choose_eval_test_both != 1:
        model_name_best = None
        score_best = 0
        for model_name, score in model_performance_dev.items():
            if score >= score_best:
                score_best = score
                model_name_best = model_name

        model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
        model_name_best = os.path.join(args.output_dir, model_name_best)
        model.load_state_dict(torch.load(model_name_best), strict=False)
        # Save a trained model
        logger.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_dev_best")
        torch.save(model_to_save.state_dict(), output_model_file)

    # Same selection, but by test-set accuracy.
    if args.choose_eval_test_both != 0:
        model_name_best = None
        score_best = 0
        for model_name, score in model_performance_test.items():
            if score >= score_best:
                score_best = score
                model_name_best = model_name

        model = RobertaForMaskedLMDomainTask.from_pretrained(args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task)
        model_name_best = os.path.join(args.output_dir, model_name_best)
        model.load_state_dict(torch.load(model_name_best), strict=False)
        # Save a trained model
        logger.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_test_best")
        torch.save(model_to_save.state_dict(), output_model_file)
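
# Reusing the selected checkpoint later (a sketch; OUTPUT_DIR and NUM_LABELS
# are hypothetical placeholders, and the filename mirrors what this script
# writes above):
#   model = RobertaForMaskedLMDomainTask.from_pretrained(
#       "roberta-base", return_dict=True, num_labels=NUM_LABELS)
#   state = torch.load(os.path.join(OUTPUT_DIR, "pytorch_model.bin_dev_best"))
#   model.load_state_dict(state, strict=False)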

if __name__ == "__main__":
    main()