import sys
import json

import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
# Number of sentences to embed, taken from the command line.
num_samples = int(sys.argv[4])

all_data_dict = dict()    # index -> {"sentence": ...}
tail_hidd_list = list()   # per-sentence hidden-state tensors
# `model` is expected to hold a model name or path here; it is replaced by the loaded model below.
pretrained_weights = model
tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)

fine_tuned_weight = model
model = RobertaForMaskedLM.from_pretrained(pretrained_weights, output_hidden_states=True, return_dict=True)
# Reference tensor compared against each embedding further down.
old = torch.FloatTensor(768)

with open(file_in) as f:
    for index, d in tqdm(enumerate(f)):
        # Stop once the requested number of sentences has been processed.
        if index == int(num_samples):
            break

        tokens = tokenizer.tokenize(d)
        if len(tokens) >= max_length - 2:
            # Truncate so the sentence still fits once <s> and </s> are added.
            tokens = tokens[:max_length - 2]
            tokens = ["<s>"] + tokens + ["</s>"]
            ids_tail = len(tokens) - 1
        else:
            ids_tail = len(tokens) - 1
            tokens = ["<s>"] + tokens + ["</s>"]

        # 1 for real tokens, 0 for padding.
        attention_mask = [1] * len(tokens)
        padding = ["<pad>"] * (max_length - len(tokens))
        tokens += padding
        attention_mask += [0] * len(padding)
        # Convert the padded token sequence to ids and run the model.
        ids = tokenizer.encode(tokens, add_special_tokens=False)
        torch_ids = torch.tensor([ids]).to(device)
        attention_mask = torch.tensor([attention_mask]).to(device)
        output = model(input_ids=torch_ids, attention_mask=attention_mask)
        # output.hidden_states holds one (batch, seq_len, hidden) tensor per layer
        # (including the embedding output); keep batch element 0, then the <s>
        # position, giving a (num_layers, hidden) tensor per sentence.
        tail_hidd = [x[0] for x in output.hidden_states[:]]
        tail_hidd = torch.stack(tail_hidd)
        tail_hidd = tail_hidd[:, 0, :]

        tail_hidd = tail_hidd.to("cpu")

        all_data_dict[index] = {"sentence": d}
        tail_hidd_list.append(tail_hidd)

        # Check whether this embedding is identical to the reference tensor `old`.
        if torch.equal(tail_hidd, old):
# Save the raw sentences alongside the embeddings.
with open(file_out + '.json', 'w') as outfile:
    json.dump(all_data_dict, outfile)
# Stack the per-sentence embeddings into a single tensor and save it.
tail_hidd_tensor = torch.stack(tail_hidd_list)
print(tail_hidd_tensor.shape)
torch.save(tail_hidd_tensor, file_out + '_CLS.pt')
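# Optional sanity check (a minimal sketch; the `reloaded_*` names are illustrative):
# reload the artifacts written above and confirm the counts line up.
reloaded_embeddings = torch.load(file_out + '_CLS.pt')
with open(file_out + '.json') as infile:
    reloaded_sentences = json.load(infile)
assert reloaded_embeddings.shape[0] == len(reloaded_sentences)
print(reloaded_embeddings.shape, len(reloaded_sentences))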