# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import logging
import os
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
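
# This script adapts the original BERT LM finetuning runner to RoBERTa: it trains
# RobertaForMaskedLM with the masked-LM objective only (no next-sentence prediction).
# The training corpus is expected to be plain text with one sentence per line and a
# blank line between documents, as assumed by Dataset_noNext below.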


class Dataset_noNext(Dataset):
    def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
        self.vocab_size = tokenizer.vocab_size
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.on_memory = on_memory
        self.corpus_lines = corpus_lines  # number of non-empty lines in input corpus
        self.corpus_path = corpus_path
        self.encoding = encoding
        self.current_doc = 0  # to avoid random sentence from same doc

        # for loading samples directly from file
        self.sample_counter = 0  # used to keep track of full epochs on file
        self.line_buffer = None  # keep second sentence of a pair in memory and use as first sentence in next pair

        # for loading samples in memory
        self.current_random_doc = 0
        self.num_docs = 0
        self.sample_to_doc = []  # map sample index to doc and line
        # load samples into memory
        if on_memory:
            self.all_docs = []
            doc = []
            self.corpus_lines = 0
            with open(corpus_path, "r", encoding=encoding) as f:
                for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
                    line = line.strip()
                    if line == "":
                        self.all_docs.append(doc)
                        doc = []
                        # remove last added sample because there won't be a subsequent line anymore in the doc
                        self.sample_to_doc.pop()
                    else:
                        # store as one sample
                        sample = {"doc_id": len(self.all_docs),
                                  "line": len(doc)}
                        self.sample_to_doc.append(sample)
                        doc.append(line)
                        self.corpus_lines = self.corpus_lines + 1

            # if last row in file is not empty
            if self.all_docs[-1] != doc:
                self.all_docs.append(doc)
                self.sample_to_doc.pop()

            self.num_docs = len(self.all_docs)

        # load samples later lazily from disk
        else:
            if self.corpus_lines is None:
                with open(corpus_path, "r", encoding=encoding) as f:
                    self.corpus_lines = 0
                    for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
                        if line.strip() == "":
                            self.num_docs += 1
                        else:
                            self.corpus_lines += 1

                    # if doc does not end with empty line
                    if line.strip() != "":
                        self.num_docs += 1

            self.file = open(corpus_path, "r", encoding=encoding)
            self.random_file = open(corpus_path, "r", encoding=encoding)

    def __len__(self):
        # last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0.
        return self.corpus_lines - self.num_docs - 1
    def __getitem__(self, item):
        cur_id = self.sample_counter
        self.sample_counter += 1
        if not self.on_memory:
            # after one epoch we start again from beginning of file
            if cur_id != 0 and (cur_id % len(self) == 0):
                self.file.close()
                self.file = open(self.corpus_path, "r", encoding=self.encoding)

        #t1, t2, is_next_label = self.random_sent(item)
        t1, is_next_label = self.random_sent(item)
        if is_next_label is None:
            # without next-sentence prediction there is no pair label; fall back to 0
            is_next_label = 0

        # tokenize
        tokens_a = self.tokenizer.tokenize(t1)
        #tokens_b = self.tokenizer.tokenize(t2)

        cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=None, is_next=is_next_label)

        # transform sample to features
        cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)

        cur_tensors = (torch.tensor(cur_features.input_ids),
                       torch.tensor(cur_features.input_mask),
                       torch.tensor(cur_features.segment_ids),
                       torch.tensor(cur_features.lm_label_ids),
                       torch.tensor(cur_features.is_next))

        return cur_tensors
    def random_sent(self, index):
        """
        Get one sample from the corpus. In the original BERT setup this consisted of two sentences
        (subsequent ones with prob. 50%, a sentence from a random other doc otherwise); in this
        no-next-sentence variant only the first sentence of the pair is used.
        :param index: int, index of sample.
        :return: (str, None), sentence 1 and a placeholder for the unused isNextSentence label
        """
        t1, t2 = self.get_corpus_line(index)
        return t1, None
    def get_corpus_line(self, item):
        """
        Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.
        :param item: int, index of sample.
        :return: (str, str), two subsequent sentences from corpus
        """
        t1 = ""
        t2 = ""
        assert item < self.corpus_lines
        if self.on_memory:
            sample = self.sample_to_doc[item]
            t1 = self.all_docs[sample["doc_id"]][sample["line"]]
            # used later to avoid random nextSentence from same doc
            self.current_doc = sample["doc_id"]
            return t1, t2
        else:
            if self.line_buffer is None:
                # read first non-empty line of file
                while t1 == "":
                    t1 = next(self.file).strip()
            else:
                # use t2 from previous iteration as new t1
                t1 = self.line_buffer
                # skip empty rows that are used for separating documents and keep track of current doc id
                while t1 == "":
                    t1 = next(self.file).strip()
                    self.current_doc = self.current_doc + 1
            self.line_buffer = next(self.file).strip()

        assert t1 != ""
        return t1, t2
    def get_random_line(self):
        """
        Get random line from another document for nextSentence task.
        :return: str, content of one line
        """
        # Similar to original tf repo: This outer loop should rarely go for more than one iteration for large
        # corpora. However, just to be careful, we try to make sure that
        # the random document is not the same as the document we're processing.
        for _ in range(10):
            if self.on_memory:
                rand_doc_idx = random.randint(0, len(self.all_docs) - 1)
                rand_doc = self.all_docs[rand_doc_idx]
                line = rand_doc[random.randrange(len(rand_doc))]
            else:
                rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)
                # pick random line
                for _ in range(rand_index):
                    line = self.get_next_line()
            # check if our picked random line is really from another doc like we want it to be
            if self.current_random_doc != self.current_doc:
                break
        return line
    def get_next_line(self):
        """ Gets next line of random_file and starts over when reaching end of file"""
        try:
            line = next(self.random_file).strip()
            # keep track of which document we are currently looking at to later avoid having the same doc as t1
            if line == "":
                self.current_random_doc = self.current_random_doc + 1
                line = next(self.random_file).strip()
        except StopIteration:
            self.random_file.close()
            self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
            line = next(self.random_file).strip()
        return line
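
# Illustrative sketch (not executed here): Dataset_noNext plugs directly into a PyTorch
# DataLoader, e.g.
#
#   tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   dataset = Dataset_noNext("corpus.txt", tokenizer, seq_len=128, on_memory=True)
#   loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=8)
#
# "corpus.txt", seq_len=128 and batch_size=8 are placeholder values; main() below builds
# the same pipeline from command-line arguments.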


class InputExample(object):
    """A single training/test example for the language model."""

    def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            tokens_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            tokens_b: (Optional) string. The untokenized text of the second sequence.
                Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.tokens_a = tokens_a
        self.tokens_b = tokens_b
        self.is_next = is_next  # nextSentence
        self.lm_labels = lm_labels  # masked words for language model


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.is_next = is_next
        self.lm_label_ids = lm_label_ids


def random_word(tokens, tokenizer):
    """
    Masking some random tokens for the Language Model task with probabilities as in the original BERT paper.
    :param tokens: list of str, tokenized sentence.
    :param tokenizer: Tokenizer, object used for tokenization (we need its vocab here)
    :return: (list of str, list of int), masked tokens and related labels for LM prediction
    """
    output_label = []

    for i, token in enumerate(tokens):
        # debug output:
        #print(tokenizer.decode(random.randint(0, tokenizer.vocab_size - 1)))
        #print(random.randint(0, tokenizer.vocab_size - 1))
        #print(type(tokenizer.decode(random.randint(0, tokenizer.vocab_size - 1))))

        prob = random.random()
        # mask token with 15% probability
        if prob < 0.15:
            prob /= 0.15
            #print(tokenizer.vocab)
            #print(tokenizer.convert_ids_to_tokens(candidate_id))
            #print(tokenizer.encode(tokens))
            #a = tokenizer.encode(tokens)
            #print(tokenizer.decode(a))

            # 80% randomly change token to mask token
            if prob < 0.8:
                tokens[i] = "<mask>"

            # 10% randomly change token to random token
            elif prob < 0.9:
                #tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]
                #tokens[i] = tokenizer.convert_ids_to_tokens(candidate_id)
                candidate_id = random.randint(0, tokenizer.vocab_size - 1)
                w = tokenizer.convert_ids_to_tokens(candidate_id)
                tokens[i] = w
                if tokens[i] is None:
                    # retry once if the sampled id maps to no token
                    candidate_id = random.randint(0, tokenizer.vocab_size - 1)
                    w = tokenizer.convert_ids_to_tokens(candidate_id)
                    tokens[i] = w

            # -> rest 10% randomly keep current token

            # append current token to output (we will predict these later)
            #output_label.append(tokenizer.vocab[token])
            w = tokenizer.convert_tokens_to_ids(token)
            if w is not None:
                output_label.append(w)
            else:
                print("No id found for this token in the vocabulary")
                # For unknown words (should not occur with BPE vocab)
                #output_label.append(tokenizer.vocab["<unk>"])
                w = tokenizer.convert_tokens_to_ids("<unk>")
                output_label.append(w)
                logger.warning("Cannot find token '{}' in vocab. Using <unk> instead".format(token))
        else:
            # no masking token (will be ignored by loss function later)
            output_label.append(-1)

    return tokens, output_label
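
# Rough arithmetic for the masking scheme above: with the 15% selection rate and the
# 80/10/10 split, a 100-token sequence gets labels on ~15 positions, of which ~12 are
# replaced by the mask token, ~1-2 by a random vocabulary token, and ~1-2 are kept
# unchanged; all remaining positions carry the label -1 and are ignored by the loss.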


def convert_example_to_features(example, max_seq_length, tokenizer):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, input_mask, CLS and SEP tokens etc.
    :param example: InputExample, containing sentence input as strings and is_next label
    :param max_seq_length: int, maximum length of sequence.
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """
    tokens_a = example.tokens_a
    tokens_b = example.tokens_b
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for <s> and </s> with "- 2" (the original pair setup used "- 3" for [CLS], [SEP], [SEP]).
    #_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 2)

    tokens_a, t1_label = random_word(tokens_a, tokenizer)
    #tokens_b, t2_label = random_word(tokens_b, tokenizer)
    # concatenate lm labels and account for CLS and SEP
    #lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])
    lm_label_ids = ([-1] + t1_label + [-1])
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0   0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
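
    # In this RoBERTa, MLM-only variant only case (b) applies: a single sequence wrapped in
    # <s> ... </s> (the RoBERTa counterparts of [CLS]/[SEP]) with every segment_id set to 0,
    # as assembled below; tokens_b is always None here, so the pair branch stays unused.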
    tokens = []
    segment_ids = []
    tokens.append("<s>")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("</s>")
    segment_ids.append(0)

    if tokens_b:
        assert len(tokens_b) > 0
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        #tokens.append("[SEP]")
        #segment_ids.append(1)

    #input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = tokenizer.encode(tokens, add_special_tokens=False)
    # guard against tokens the tokenizer could not map to an id
    input_ids = [w if w is not None else 0 for w in input_ids]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    pad_id = tokenizer.convert_tokens_to_ids("<pad>")
    while len(input_ids) < max_seq_length:
        input_ids.append(pad_id)
        input_mask.append(0)
        segment_ids.append(0)
        lm_label_ids.append(-1)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(lm_label_ids) == max_seq_length

    if example.guid < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("tokens: %s" % " ".join(
            [str(x) for x in tokens]))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info(
            "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logger.info("LM label: %s " % (lm_label_ids))
        logger.info("Is next sentence label: %s " % (example.is_next))

    features = InputFeatures(input_ids=input_ids,
                             input_mask=input_mask,
                             segment_ids=segment_ids,
                             lm_label_ids=lm_label_ids,
                             is_next=example.is_next)
    return features
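
# For example, with max_seq_length=16 and a 5-token sentence, every feature vector has
# length 16: input_ids covers <s>, the 5 (possibly masked) tokens, </s> and 9 pad ids;
# input_mask is [1]*7 + [0]*9; segment_ids is [0]*16; and lm_label_ids holds the original
# token id at each masked position and -1 everywhere else, including the padding.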


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument("--pretrain_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory",
                        action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--num_labels_task",
                        default=None, type=int,
                        help="num_labels_task")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level',
                        type=str,
                        default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None)

    args = parser.parse_args()
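
    # Example invocation (illustrative; the script name and hyperparameter values are placeholders):
    #   python finetune_roberta_mlm.py --data_dir corpus.txt --pretrain_model roberta-base \
    #       --output_dir out/ --do_train --on_memory --max_seq_length 128 \
    #       --train_batch_size 32 --num_train_epochs 3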

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # the requested batch size is split across accumulation steps, so the effective batch size stays the same
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model, do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    print("Loading Train Dataset", args.data_dir)
    train_dataset = Dataset_noNext(args.data_dir, tokenizer, seq_len=args.max_seq_length,
                                   corpus_lines=None, on_memory=args.on_memory)
    num_train_optimization_steps = int(
        len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = RobertaForMaskedLM.from_pretrained(args.pretrain_model)
    model.to(device)

    # Prepare optimizer: no weight decay for biases and LayerNorm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(num_train_optimization_steps * 0.1), num_training_steps=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)
logger.info("***** Running training *****")
643
logger.info(" Num examples = %d", len(train_dataset))
644
logger.info(" Batch size = %d", args.train_batch_size)
645
logger.info(" Num steps = %d", num_train_optimization_steps)
647
if args.local_rank == -1:
648
train_sampler = RandomSampler(train_dataset)
650
#TODO: check if this works with current data generator from disk that relies on next(file)
651
# (it doesn't return item back by index)
652
train_sampler = DistributedSampler(train_dataset)
653
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
655
output_loss_file = os.path.join(args.output_dir, "loss")
656
loss_fout = open(output_loss_file, 'w')
658
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
660
nb_tr_examples, nb_tr_steps = 0, 0
661
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
662
batch = tuple(t.to(device) for t in batch)
663
input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
667
for i in range(len(input_ids)):
669
print(len(input_ids[i]))
670
print(tokenizer.decode(input_ids[i]))
675
output = model(input_ids=input_ids, masked_lm_labels=lm_label_ids, return_dict=True)
679
loss = loss.mean() # mean() to average on multi-gpu.
680
if args.gradient_accumulation_steps > 1:
681
loss = loss / args.gradient_accumulation_steps
683
#optimizer.backward(loss)
684
with amp.scale_loss(loss, optimizer) as scaled_loss:
685
scaled_loss.backward()
690
loss_fout.write("{}\n".format(loss.item()))
693
tr_loss += loss.item()
694
nb_tr_examples += input_ids.size(0)
696
if (step + 1) % args.gradient_accumulation_steps == 0:
698
# modify learning rate with special warm up BERT uses
699
# if args.fp16 is False, BertAdam is used that handles this automatically
700
#lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
701
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
704
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
711
#optimizer.zero_grad()
715
#if global_step % 100000 == 0:
716
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
717
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_{}".format(global_step))
718
torch.save(model_to_save.state_dict(), output_model_file)
722
# Save a trained model
723
logger.info("** ** * Saving fine - tuned model ** ** * ")
724
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
725
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
727
torch.save(model_to_save.state_dict(), output_model_file)
730


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    # In this variant tokens_b is always None, so only tokens_a is truncated.
    while True:
        #total_length = len(tokens_a) + len(tokens_b)
        total_length = len(tokens_a)
        if total_length <= max_length:
            break
        tokens_a.pop()


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


if __name__ == "__main__":
    main()