# openprompt — LM-BFF text-classification tutorial
# (250 lines · 11.1 KB)
# %% [markdown]
# ## Text Classification with LM-BFF.
# In this tutorial, we do sentiment analysis with automatic template and verbalizer generation. We use SST-2 as an example.

# %% [markdown]
# ### 1. load dataset

# %%
from openprompt.data_utils.text_classification_dataset import SST2Processor

# 16-shot SST-2 split (seed 13); train/dev/test all live in the same directory.
DATA_DIR = "../datasets/TextClassification/SST-2/16-shot/16-13"
dataset = {
    'train': SST2Processor().get_train_examples(DATA_DIR),
    'validation': SST2Processor().get_dev_examples(DATA_DIR),
    'test': SST2Processor().get_test_examples(DATA_DIR),
}
18
# %% [markdown]
# ### 2. build initial verbalizer and template
# - note that if you wish to do automatic label word generation, the verbalizer is not the final verbalizer, and is only used for template generation.
# - note that if you wish to do automatic template generation, the template text may desirably include `{"meta":"labelword"}` so that label word can be used and remember to use `LMBFFTemplateGenerationTemplate` class so that "labelword" can be handled properly. Else you can just use `ManualTemplate`
# - below is a template that expects plain text generation at each "mask" token position

# %%
print('load model...')
from openprompt.plms import load_plm

# MLM backbone used for the classification task itself.
plm, tokenizer, model_config, WrapperClass = load_plm("roberta", "roberta-large")

# Seq2seq model used only to generate candidate templates.
template_generate_model, template_generate_tokenizer, template_generate_model_config, template_tokenizer_wrapper = load_plm('t5', 't5-large')

from openprompt.prompts import ManualVerbalizer, ManualTemplate

# Initial hand-written verbalizer; only a seed for template generation when auto_v is on.
verbalizer = ManualVerbalizer(tokenizer=tokenizer, num_classes=2, label_words=[['terrible'], ['great']])

from openprompt.prompts.prompt_generator import LMBFFTemplateGenerationTemplate

template = LMBFFTemplateGenerationTemplate(tokenizer=template_generate_tokenizer, verbalizer=verbalizer, text='{"placeholder":"text_a"} {"mask"} {"meta":"labelword"} {"mask"}.')
# template = ManualTemplate(tokenizer=tokenizer, text='{"placeholder":"text_a"} It is {"mask"}.')

# Sanity check: inspect one wrapped training example.
wrapped_example = template.wrap_one_example(dataset['train'][0])
print(wrapped_example)
44
# %%
# parameter setting
cuda = True    # run models on GPU
auto_t = True  # whether to perform automatic template generation
auto_v = True  # whether to perform automatic label word generation
50
51
52# %%
53# train util function
54from openprompt.plms import load_plm
55from openprompt.prompts.prompt_generator import T5TemplateGenerator
56from openprompt.pipeline_base import PromptDataLoader, PromptForClassification
57from openprompt.prompts import ManualTemplate
58from openprompt.trainer import ClassificationRunner
59import copy
60import torch
61from transformers import AdamW, get_linear_schedule_with_warmup
62import numpy as np
63
def fit(model, train_dataloader, val_dataloader, loss_func, optimizer, epochs=5):
    """Train `model` and return the best validation score seen.

    Args:
        model: PromptForClassification (or any callable returning logits).
        train_dataloader / val_dataloader: iterables of batches.
        loss_func: loss taking (logits, labels).
        optimizer: torch optimizer over the model's parameters.
        epochs: number of training epochs (was hard-coded to 5).

    Returns:
        The best validation score across epochs.

    NOTE(review): only the best *score* is tracked; the weights of the best
    epoch are not restored, so the model is left in its last-epoch state.
    """
    best_score = 0.0
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_dataloader, loss_func, optimizer)
        score = evaluate(model, val_dataloader)
        if score > best_score:
            best_score = score
        print(f"Epoch {epoch+1}: Train loss={train_loss}, Eval score={score}")
    return best_score
73
74
def train_epoch(model, train_dataloader, loss_func, optimizer, use_cuda=None):
    """Run a single training epoch and return the mean batch loss.

    Args:
        model: callable returning logits for a batch.
        train_dataloader: iterable of batches; each batch supports
            ``batch['label']`` (and ``.cuda()`` when on GPU).
        loss_func: loss taking (logits, labels).
        optimizer: torch optimizer; stepped once per batch.
        use_cuda: move batches to GPU; defaults to the module-level
            ``cuda`` flag when None (backward compatible).

    Returns:
        float: mean of the per-batch losses (NaN for an empty dataloader).
    """
    if use_cuda is None:
        use_cuda = cuda
    model.train()
    loss_all = []
    for inputs in train_dataloader:
        if use_cuda:
            inputs = inputs.cuda()
        logits = model(inputs)
        loss = loss_func(logits, inputs['label'])
        loss.backward()
        loss_all.append(loss.item())
        optimizer.step()
        optimizer.zero_grad()
    return np.mean(loss_all)
89
def evaluate(model, val_dataloader, use_cuda=None):
    """Compute classification accuracy of `model` over `val_dataloader`.

    Args:
        model: callable returning logits for a batch.
        val_dataloader: iterable of batches with a ``'label'`` field.
        use_cuda: move batches to GPU; defaults to the module-level
            ``cuda`` flag when None (backward compatible).

    Returns:
        float: fraction of correct argmax predictions; 0.0 for an empty
        dataloader (instead of raising ZeroDivisionError).
    """
    if use_cuda is None:
        use_cuda = cuda
    model.eval()
    allpreds = []
    alllabels = []
    with torch.no_grad():
        for inputs in val_dataloader:
            if use_cuda:
                inputs = inputs.cuda()
            logits = model(inputs)
            alllabels.extend(inputs['label'].cpu().tolist())
            allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    if not allpreds:
        return 0.0
    return sum(int(p == t) for p, t in zip(allpreds, alllabels)) / len(allpreds)
104
105
106# %% [markdown]
107# ### 3. automatic template and verbalizer generation
108
109# %%
110from tqdm import tqdm
111
class ManualTemplateWithoutParse(ManualTemplate):
    """A ManualTemplate whose text is already parsed.

    TemplateGenerator emits template_text as a list of dicts (the parsed
    form), so the post-set parsing hook is disabled.
    """

    def on_text_set(self):
        # Text arrives pre-parsed from the generator; nothing to do.
        pass
116
# template generation
if auto_t:
    print('performing auto_t...')

    if cuda:
        template_generate_model = template_generate_model.cuda()
    # beam_width=5 keeps generation cheap; try a larger number to improve performance.
    template_generator = T5TemplateGenerator(template_generate_model, template_generate_tokenizer, template_tokenizer_wrapper, verbalizer, beam_width=5)

    # Register the entire (few-shot) training set with the generator in one batch.
    dataloader = PromptDataLoader(dataset['train'], template, tokenizer=template_generate_tokenizer, tokenizer_wrapper_class=template_tokenizer_wrapper, batch_size=len(dataset['train']), decoder_max_length=128, max_seq_length=128, shuffle=False, teacher_forcing=False)
    for data in dataloader:
        if cuda:
            data = data.cuda()
        template_generator._register_buffer(data)

    template_generate_model.eval()
    print('generating...')
    template_texts = template_generator._get_templates()

    # Convert each generated text back into the original template's format.
    original_template = template.text
    template_texts = [template_generator.convert_template(text, original_template) for text in template_texts]
    # template_generator._show_template()
    template_generator.release_memory()
    # candidate template texts
    print(template_texts)

    # Evaluate every candidate template and keep the best-scoring one.
    best_metrics = 0.0
    best_template_text = None
    for template_text in tqdm(template_texts):
        template = ManualTemplateWithoutParse(tokenizer, template_text)
        print(f"current template: {template_text}, wrapped example: {template.wrap_one_example(dataset['train'][0])}")

        train_dataloader = PromptDataLoader(dataset['train'], template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass, shuffle=True)
        valid_dataloader = PromptDataLoader(dataset['validation'], template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass)

        model = PromptForClassification(copy.deepcopy(plm), template, verbalizer)

        loss_func = torch.nn.CrossEntropyLoss()
        # exempt biases and LayerNorm weights from weight decay
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
        if cuda:
            model = model.cuda()
        score = fit(model, train_dataloader, valid_dataloader, loss_func, optimizer)

        if score > best_metrics:
            print('current best score:', score)
            best_metrics = score
            best_template_text = template_text
    # use the best template
    template = ManualTemplateWithoutParse(tokenizer, text=best_template_text)
    print("final best template:", best_template_text)
    print("wrapped example:", template.wrap_one_example(dataset["train"][0]))
175
# %%
# verbalizer generation
from openprompt.prompts.prompt_generator import RobertaVerbalizerGenerator

if auto_v:
    print('performing auto_v...')
    # the MLM backbone itself proposes candidate label words
    if cuda:
        plm = plm.cuda()
    # candidate_num / label_word_num_per_class are kept small for speed;
    # try larger numbers to improve performance.
    verbalizer_generator = RobertaVerbalizerGenerator(model=plm, tokenizer=tokenizer, candidate_num=20, label_word_num_per_class=20)

    dataloader = PromptDataLoader(dataset['train'], template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass, batch_size=32)
    for data in dataloader:
        if cuda:
            data = data.cuda()
        verbalizer_generator.register_buffer(data)
    label_words_list = verbalizer_generator.generate()
    verbalizer_generator.release_memory()

    # Evaluate every candidate label-word set and keep the best-scoring one.
    current_verbalizer = copy.deepcopy(verbalizer)
    best_metrics = 0.0
    best_label_words = None
    for label_words in tqdm(label_words_list):
        current_verbalizer.label_words = label_words
        train_dataloader = PromptDataLoader(dataset['train'], template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass, shuffle=True)
        valid_dataloader = PromptDataLoader(dataset['validation'], template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass)

        model = PromptForClassification(copy.deepcopy(plm), template, current_verbalizer)

        loss_func = torch.nn.CrossEntropyLoss()
        # exempt biases and LayerNorm weights from weight decay
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
        if cuda:
            model = model.cuda()
        score = fit(model, train_dataloader, valid_dataloader, loss_func, optimizer)

        if score > best_metrics:
            best_metrics = score
            best_label_words = label_words
    # use the best verbalizer
    print("final best label words:", best_label_words)
    verbalizer = ManualVerbalizer(tokenizer, num_classes=2, label_words=best_label_words)
225
# %% [markdown]
# ### 4. main training loop

# %%
# main training loop
train_dataloader = PromptDataLoader(dataset['train'], template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass, shuffle=True)
valid_dataloader = PromptDataLoader(dataset['validation'], template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass)
test_dataloader = PromptDataLoader(dataset['test'], template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass)

# Fresh classifier built from the (possibly auto-generated) template and verbalizer.
model = PromptForClassification(copy.deepcopy(plm), template, verbalizer)
loss_func = torch.nn.CrossEntropyLoss()
# exempt biases and LayerNorm weights from weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
if cuda:
    model = model.cuda()
score = fit(model, train_dataloader, valid_dataloader, loss_func, optimizer)
# NOTE(review): fit() does not restore best-epoch weights, so the test score
# reflects the last epoch's model — confirm this is intended.
test_score = evaluate(model, test_dataloader)
print("Final test score:", test_score)
251