stanford_alpaca / generate_instruction.py
217 lines · 8.2 KB
1"""
2batch_selfinstruct_generate.py
3
4run:
5python -m generate_instruction generate_instruction_following_data \
6--output_dir ./ \
7--num_instructions_to_generate 10 \
8--model_name="text-davinci-003" \
9"""
10import time11import json12import os13import random14import re15import string16from functools import partial17from multiprocessing import Pool18
19import numpy as np20import tqdm21from rouge_score import rouge_scorer22import utils23
24import fire25
26
def encode_prompt(prompt_instructions):
    """Encode multiple prompt instructions into a single string.

    Reads the fixed prompt header from ./prompt.txt, then appends each seed
    task as a numbered "Instruction / Input / Output" section, and finally a
    dangling "N+1. Instruction:" cue for the model to continue from.

    Args:
        prompt_instructions: non-empty list of dicts with "instruction",
            "input" and "output" keys. (An empty list would raise NameError
            at the trailing `idx` use; callers always sample >= 1 task.)

    Returns:
        The assembled prompt string.
    """
    # Use a context manager so the prompt file is closed deterministically
    # (the original relied on garbage collection to close the handle).
    with open("./prompt.txt") as f:
        prompt = f.read() + "\n"

    for idx, task_dict in enumerate(prompt_instructions):
        instruction = task_dict["instruction"]
        task_input = task_dict["input"]
        task_output = task_dict["output"]
        # Collapse internal whitespace and drop a trailing colon so all
        # seed tasks render uniformly.
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
        # "<noinput>" is the sentinel the model is taught for empty inputs.
        task_input = "<noinput>" if task_input.lower() == "" else task_input
        prompt += f"###\n"
        prompt += f"{idx + 1}. Instruction: {instruction}\n"
        prompt += f"{idx + 1}. Input:\n{task_input}\n"
        prompt += f"{idx + 1}. Output:\n{task_output}\n"
    prompt += f"###\n"
    prompt += f"{idx + 2}. Instruction:"
    return prompt
43
44def post_process_gpt3_response(num_prompt_instructions, response):45if response is None:46return []47raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]48raw_instructions = re.split("###", raw_instructions)49instructions = []50for idx, inst in enumerate(raw_instructions):51# if the decoding stops due to length, the last example is likely truncated so we discard it52if idx == len(raw_instructions) - 1 and response["finish_reason"] == "length":53continue54idx += num_prompt_instructions + 155splitted_data = re.split(f"{idx}\.\s+(Instruction|Input|Output):", inst)56if len(splitted_data) != 7:57continue58else:59inst = splitted_data[2].strip()60input = splitted_data[4].strip()61input = "" if input.lower() == "<noinput>" else input62output = splitted_data[6].strip()63# filter out too short or too long instructions64if len(inst.split()) <= 3 or len(inst.split()) > 150:65continue66# filter based on keywords that are not suitable for language models.67blacklist = [68"image",69"images",70"graph",71"graphs",72"picture",73"pictures",74"file",75"files",76"map",77"maps",78"draw",79"plot",80"go to",81"video",82"audio",83"music",84"flowchart",85"diagram",86]87blacklist += []88if any(find_word_in_string(word, inst) for word in blacklist):89continue90# We found that the model tends to add "write a program" to some existing instructions, which lead to a lot of such instructions.91# And it's a bit comfusing whether the model need to write a program or directly output the result.92# Here we filter them out.93# Note this is not a comprehensive filtering for all programming instructions.94if inst.startswith("Write a program"):95continue96# filter those starting with punctuation97if inst[0] in string.punctuation:98continue99# filter those starting with non-english character100if not inst[0].isascii():101continue102instructions.append({"instruction": inst, "input": input, "output": output})103return instructions104
105
106def find_word_in_string(w, s):107return re.compile(r"\b({0})\b".format(w), flags=re.IGNORECASE).search(s)108
109
def generate_instruction_following_data(
    output_dir="./",
    seed_tasks_path="./seed_tasks.jsonl",
    num_instructions_to_generate=100,
    model_name="text-davinci-003",
    num_prompt_instructions=3,
    request_batch_size=5,
    temperature=1.0,
    top_p=1.0,
    num_cpus=16,
):
    """Self-Instruct data generation loop.

    Repeatedly samples `num_prompt_instructions` seed tasks, asks the model
    for new instruction/input/output triples, rejects near-duplicates by
    ROUGE-L similarity against everything kept so far, and checkpoints the
    accepted tasks to `<output_dir>/regen.json` after every batch. If the
    checkpoint already exists, generation resumes from it.

    Args:
        output_dir: directory for the `regen.json` checkpoint/output file.
        seed_tasks_path: JSONL file of human-written seed tasks.
        num_instructions_to_generate: stop once this many machine-generated
            tasks have been kept.
        model_name: OpenAI completion model to query.
        num_prompt_instructions: seed tasks included in every prompt.
        request_batch_size: prompts sent per OpenAI request.
        temperature: sampling temperature for decoding.
        top_p: nucleus-sampling cutoff for decoding.
        num_cpus: worker processes for the parallel ROUGE scoring.
    """
    # Close the seed file deterministically (the original left the handle
    # to the garbage collector).
    with open(seed_tasks_path, "r") as seed_file:
        seed_tasks = [json.loads(l) for l in seed_file]
    # Only the first instance of each seed task is used.
    seed_instruction_data = [
        {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
        for t in seed_tasks
    ]
    print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")

    os.makedirs(output_dir, exist_ok=True)
    request_idx = 0
    # Load previously LM-generated instructions to resume an interrupted run.
    machine_instruction_data = []
    if os.path.exists(os.path.join(output_dir, "regen.json")):
        machine_instruction_data = utils.jload(os.path.join(output_dir, "regen.json"))
        print(f"Loaded {len(machine_instruction_data)} machine-generated instructions")

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

    # now let's generate new instructions!
    progress_bar = tqdm.tqdm(total=num_instructions_to_generate)
    if machine_instruction_data:
        progress_bar.update(len(machine_instruction_data))

    # Pre-tokenize all seed + machine instructions once; new acceptances are
    # appended incrementally below instead of re-tokenizing everything.
    all_instructions = [d["instruction"] for d in seed_instruction_data] + [
        d["instruction"] for d in machine_instruction_data
    ]
    all_instruction_tokens = [scorer._tokenizer.tokenize(inst) for inst in all_instructions]

    while len(machine_instruction_data) < num_instructions_to_generate:
        request_idx += 1

        batch_inputs = []
        for _ in range(request_batch_size):
            # Only sampling from the seed tasks (never from machine tasks).
            prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)
            prompt = encode_prompt(prompt_instructions)
            batch_inputs.append(prompt)
        decoding_args = utils.OpenAIDecodingArguments(
            temperature=temperature,
            n=1,
            max_tokens=3072,  # hard-code to maximize the length. the requests will be automatically adjusted
            # Stop before a hypothetical 20th task; the original listed
            # "20." twice, the duplicate entry was redundant.
            stop=["\n20", "20."],
        )
        request_start = time.time()
        results = utils.openai_completion(
            prompts=batch_inputs,
            model_name=model_name,
            batch_size=request_batch_size,
            decoding_args=decoding_args,
            logit_bias={"50256": -100},  # prevent the <|endoftext|> token from being generated
        )
        request_duration = time.time() - request_start

        process_start = time.time()
        instruction_data = []
        for result in results:
            new_instructions = post_process_gpt3_response(num_prompt_instructions, result)
            instruction_data += new_instructions

        total = len(instruction_data)
        keep = 0
        for instruction_data_entry in instruction_data:
            # Computing similarity with the pre-tokenized instructions.
            new_instruction_tokens = scorer._tokenizer.tokenize(instruction_data_entry["instruction"])
            with Pool(num_cpus) as p:
                rouge_scores = p.map(
                    partial(rouge_scorer._score_lcs, new_instruction_tokens),
                    all_instruction_tokens,
                )
            rouge_scores = [score.fmeasure for score in rouge_scores]
            # Record the 10 nearest neighbours for inspection in the output.
            most_similar_instructions = {
                all_instructions[i]: rouge_scores[i] for i in np.argsort(rouge_scores)[-10:][::-1]
            }
            if max(rouge_scores) > 0.7:
                # Too close to an existing instruction: near-duplicate.
                continue
            keep += 1
            instruction_data_entry["most_similar_instructions"] = most_similar_instructions
            instruction_data_entry["avg_similarity_score"] = float(np.mean(rouge_scores))
            machine_instruction_data.append(instruction_data_entry)
            all_instructions.append(instruction_data_entry["instruction"])
            all_instruction_tokens.append(new_instruction_tokens)
            progress_bar.update(1)
        process_duration = time.time() - process_start
        print(f"Request {request_idx} took {request_duration:.2f}s, processing took {process_duration:.2f}s")
        print(f"Generated {total} instructions, kept {keep} instructions")
        # Checkpoint after every batch so an interrupted run loses little.
        utils.jdump(machine_instruction_data, os.path.join(output_dir, "regen.json"))
211
212def main(task, **kwargs):213globals()[task](**kwargs)214
215
216if __name__ == "__main__":217fire.Fire(main)218