2
batch_selfinstruct_generate.py
5
python -m generate_instruction generate_instruction_following_data \
7
--num_instructions_to_generate 10 \
8
--model_name="text-davinci-003" \
16
from functools import partial
17
from multiprocessing import Pool
21
from rouge_score import rouge_scorer
25
from gensim.summarization import bm25
26
from transformers import AutoTokenizer
27
# Hugging Face checkpoint whose tokenizer splits instructions into tokens
# for the similarity scoring done later in this module.
checkpoint = "bigscience/bloomz-7b1"
# NOTE(review): this runs at import time, so importing the module needs the
# tokenizer files to be available (presumably downloaded or cached) — confirm
# that is acceptable for every importer.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
32
def encode_prompt(prompt_instructions):
    """Encode multiple prompt instructions into a single string.

    The contents of ``./prompt_cn.txt`` form the preamble. Each task is then
    appended as a numbered 指令 / 输入 / 输出 triple, and the prompt ends with
    an open ``N. 指令:`` header for the model to continue from.

    Args:
        prompt_instructions: iterable of dicts holding string values under
            the keys ``"instruction"``, ``"input"`` and ``"output"``.

    Returns:
        The assembled prompt string.

    Raises:
        OSError: if ``./prompt_cn.txt`` cannot be opened.
        KeyError: if a task dict lacks one of the three keys.
    """
    # The template is Chinese text: read it with an explicit encoding instead
    # of the platform default, and close the handle deterministically.
    with open("./prompt_cn.txt", encoding="utf-8") as template_file:
        parts = [template_file.read() + "\n"]

    number = 0  # stays 0 for an empty task list, so the open header is "1."
    for number, task in enumerate(prompt_instructions, start=1):
        # Collapse whitespace runs and drop a trailing colon so the
        # instruction sits cleanly on its own numbered line.
        instruction = re.sub(r"\s+", " ", task["instruction"]).strip().rstrip(":")
        task_input = task["input"]
        if task_input == "":
            task_input = "<无输入>"  # explicit placeholder for "no input"
        parts.append(f"{number}. 指令: {instruction}\n")
        parts.append(f"{number}. 输入:\n{task_input}\n")
        parts.append(f"{number}. 输出:\n{task['output']}\n")

    # Open header for the next instruction, which the model is asked to write.
    parts.append(f"{number + 1}. 指令:")
    return "".join(parts)
49
def post_process_gpt3_response(num_prompt_instructions, response):
    """Parse one model completion into instruction/input/output records.

    The completion text is split on ``###``. Each segment is expected to
    contain a 指令 (instruction), 输入 (input) and 输出 (output) section.
    Segments that mention blacklisted keywords, that cannot be parsed, or
    that still mention "GPT" after phrase clean-up are dropped.

    Args:
        num_prompt_instructions: number of examples that were in the prompt;
            used to rebuild the ``N. 指令:`` header when the completion does
            not start with one.
        response: mapping describing one completion. The text is read from
            ``response["message"]["content"]`` (chat models) or, failing
            that, from ``response["text"]`` (completion models);
            ``response["finish_reason"]`` is read as well. ``None`` yields
            an empty list.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"input"`` and
        ``"output"``.
    """
    if response is None:
        return []

    # Chat models nest the text under "message"; completion models expose
    # "text" directly.
    try:
        raw_text = response["message"]["content"]
    except (KeyError, TypeError):
        raw_text = response["text"]

    # Each label is accepted with an ASCII or a full-width colon. Character
    # classes keep the look-behinds fixed-width, and raw strings avoid
    # invalid "\s" escapes.
    instruction_label = re.compile(r"指令[::]")
    input_label = re.compile(r"输入[::]")
    output_label = re.compile(r"输出[::]")
    instruction_pattern = re.compile(r"(?<=指令[::])[\s\S]*?(?=输入[::])")
    input_pattern = re.compile(r"(?<=输入[::])[\s\S]*?(?=输出[::])")
    output_pattern = re.compile(r"(?<=输出[::])[\s\S]*?(?=$)")
    before_next_instruction = re.compile(r"[\s\S]*?(?=指令[::])")
    # A section captured up to the next label ends with that label's "N."
    # index; this strips it.
    trailing_index = re.compile(r"\d+\.$")

    # The prompt ends with an open "N. 指令:" header, so the completion
    # normally starts mid-record; put the header back when it is missing.
    if not instruction_label.search(raw_text[0:10]):
        raw_text = f"{num_prompt_instructions+1}. 指令:" + raw_text
    segments = raw_text.split("###")

    blacklist = ["图像", "图片", "照片", "文件", "图表", "图层", "曲线图", "折线图", "直线图", "柱形图", "饼状图", "链接", "http", 'OpenAI', 'chatgpt', 'gpt-3', 'gpt-3.5', 'gpt-4']
    replace_empty_list = [
        '要求GPT模型能够', '要求GPT能够', '要求GPT模型', '让GPT模型', '使用GPT模型', '请向GPT模型', 'GPT模型应', 'GPT模型应该', '请求GPT模型', '需要GPT模型回答', '请GPT模型',
        '请让GPT模型', '训练GPT模型', 'GPT模型需要', '要求GPT', '让GPT', '使用GPT', '请向GPT', 'GPT应', 'GPT应该', '请求GPT', '需要GPT回答', '请GPT', '请让GPT',
        '训练GPT', 'GPT需要', '希望GPT模型能够', '希望GPT能够', '以便GPT模型能够', '以便GPT能够', '使得GPT模型能够', '使得GPT能够', '使GPT模型能够', '使GPT能够',
    ]
    # Remove longer phrases first; otherwise a shorter phrase that is a
    # prefix or suffix of a longer one (e.g. 'GPT模型应' vs 'GPT模型应该')
    # is stripped first and leaves stray characters behind.
    phrases_longest_first = sorted(replace_empty_list, key=len, reverse=True)

    truncated = response["finish_reason"] == "length"
    last_index = len(segments) - 1

    instructions = []
    for idx, segment in enumerate(segments):
        # if the decoding stops due to length, the last example is likely
        # truncated so we discard it
        if truncated and idx == last_index:
            continue
        # filter based on keywords that are not suitable for language models.
        if any(find_word_in_string(word, segment) for word in blacklist):
            continue

        instruction_match = instruction_pattern.search(segment)
        input_match = input_pattern.search(segment)
        output_match = output_pattern.search(segment)
        if not (instruction_match and input_match and output_match):
            continue  # the segment does not contain all three sections

        inst = trailing_index.sub('', instruction_match.group().strip()).strip('\n')
        task_input = trailing_index.sub('', input_match.group().strip()).strip('\n')
        task_input = "" if "无输入" in task_input else task_input
        output = output_match.group().strip().strip('\n')

        # When the reply was not separated by "###", the output section
        # swallows the following records; keep only the text before the next
        # instruction header, i.e. the first record.
        if instruction_label.search(output) and input_label.search(output) and output_label.search(output):
            first_record = before_next_instruction.search(output)
            if first_record:
                output = trailing_index.sub('', first_record.group().strip()).strip('\n')

        for phrase in phrases_longest_first:
            inst = inst.replace(phrase, "")
        # Anything that still addresses "GPT" after the clean-up is dropped.
        if "GPT" in inst or 'GPT' in task_input:
            continue

        # NOTE(review): the visible original branched on empty input and on
        # 示例/例子 appearing in the instruction, yet appended the identical
        # record in every branch; a single append is equivalent to that.
        instructions.append({"instruction": inst, "input": task_input, "output": output})

    return instructions
113
def find_word_in_string(w, s):
117
def generate_instruction_following_data(
119
seed_tasks_path="./zh_seed_tasks.json",
120
num_instructions_to_generate=1,
122
model_name="text-davinci-003",
123
num_prompt_instructions=3,
124
request_batch_size=1,
129
seed_tasks = [json.loads(l) for l in open(seed_tasks_path, "r")]
130
seed_instruction_data = [
131
{"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
134
print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")
137
os.makedirs(output_dir, exist_ok=True)
139
# load the LM-generated instructions
140
machine_instruction_data = []
141
if os.path.exists(os.path.join(output_dir, "Belle.train.json")):
142
machine_instruction_data = utils.jload(os.path.join(output_dir, "Belle.train.json"))
143
print(f"Loaded {len(machine_instruction_data)} machine-generated instructions")
146
# now let's generate new instructions!
147
progress_bar = tqdm.tqdm(total=num_instructions_to_generate)
148
if machine_instruction_data:
149
progress_bar.update(len(machine_instruction_data))
151
# first we tokenize all the seed instructions and generated machine instructions
152
all_instructions = [d["instruction"] for d in seed_instruction_data] + [
153
d["instruction"] for d in machine_instruction_data
155
all_instruction_tokens = [tokenizer.tokenize(inst) for inst in all_instructions]
156
bm25Model = bm25.BM25(all_instruction_tokens)
159
while len(machine_instruction_data) < num_instructions_to_generate:
163
for _ in range(request_batch_size):
164
# only sampling from the seed tasks
165
prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)# seed_instruction_data, num_prompt_instructions)
166
prompt = encode_prompt(prompt_instructions)
167
batch_inputs.append(prompt)
168
decoding_args = utils.OpenAIDecodingArguments(
169
temperature=temperature,
171
max_tokens=1024, # hard-code to maximize the length. the requests will be automatically adjusted
173
stop=["\n20", "20.", "20."],
175
request_start = time.time()
176
results = utils.openai_completion(
177
prompts=batch_inputs,
179
model_name=model_name,
180
batch_size=request_batch_size,
181
decoding_args=decoding_args,
182
logit_bias={"50256": -100}, # prevent the <|endoftext|> token from being generated
185
request_duration = time.time() - request_start
187
process_start = time.time()
188
instruction_data = []
189
for result in results:
190
new_instructions = post_process_gpt3_response(num_prompt_instructions, result)
191
instruction_data += new_instructions
193
total = len(instruction_data)
195
for instruction_data_entry in instruction_data:
196
# compute similarity against the pre-tokenized instructions
197
new_instruction_tokens = tokenizer.tokenize(instruction_data_entry["instruction"])
198
rouge_scores = bm25Model.get_scores(new_instruction_tokens)
200
most_similar_instructions = {
201
all_instructions[i]: rouge_scores[i] for i in np.argsort(rouge_scores)[-10:][::-1]
203
if max(rouge_scores) >18:
207
instruction_data_entry["most_similar_instructions"] = most_similar_instructions
208
instruction_data_entry["avg_similarity_score"] = float(np.mean(rouge_scores))
209
machine_instruction_data.append(instruction_data_entry)
210
all_instructions.append(instruction_data_entry["instruction"])
211
all_instruction_tokens.append(new_instruction_tokens)
212
progress_bar.update(1)
213
process_duration = time.time() - process_start
214
print(f"Request {request_idx} took {request_duration:.2f}s, processing took {process_duration:.2f}s")
215
print(f"Generated {total} instructions, kept {keep} instructions")
216
utils.jdump(machine_instruction_data, os.path.join(output_dir, "regen.json"))
219
def main(task, **kwargs):
    """Run the module-level callable named ``task`` with ``kwargs``.

    Args:
        task: name of a callable defined in this module's global namespace.
        **kwargs: keyword arguments forwarded unchanged to that callable.

    Raises:
        KeyError: if this module has no global named ``task``.
    """
    try:
        target = globals()[task]
    except KeyError:
        # Same exception type as a bare lookup, but it names the bad task.
        raise KeyError(f"unknown task: {task!r}") from None
    target(**kwargs)
223
if __name__ == "__main__":