LLM-FineTuning-Large-Language-Models
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GPTQConfig,
    HfArgumentParser,
    TrainingArguments,
)
from trl import SFTTrainer

# Fine-tunes a Llama 2 model on the Guanaco dataset using GPTQ and PEFT (LoRA).
@dataclass
class ScriptArguments:
    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-GPU training."})
    per_device_train_batch_size: Optional[int] = field(default=1, metadata={"help": "The training batch size per GPU. Increase for better speed."})
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=16, metadata={"help": "How many gradients to accumulate before performing an optimizer step."})
    learning_rate: Optional[float] = field(default=0.0002, metadata={"help": "The learning rate."})
    max_grad_norm: Optional[float] = field(default=0.3, metadata={"help": "Gradient clipping max norm. This is tuned and works well for all models tested."})
    # Use LoRA dropout instead for regularization if needed.
    weight_decay: Optional[float] = field(default=0.0, metadata={"help": "The L2 weight decay rate of AdamW."})
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64, metadata={"help": "LoRA attention dimension (rank)."})
    max_seq_length: Optional[int] = field(default=512)

    model_name: Optional[str] = field(
        default="TheBloke/Llama-2-7B-GPTQ",
        metadata={"help": "The model to train from the Hugging Face hub, e.g. gpt2, gpt2-xl, bert, etc."},
    )
    dataset_name: Optional[str] = field(
        default="timdettmers/openassistant-guanaco",
        metadata={"help": "The preference dataset to use."},
    )
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs."},
    )
    fp16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables bf16 training."},
    )
    packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Use packing when creating the dataset."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="adamw_hf",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant is a bit better than cosine, and has an advantage for analysis."},
    )
    max_steps: int = field(default=10000, metadata={"help": "How many optimizer update steps to take."})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for."})
    group_by_length: bool = field(
        default=True,
        metadata={"help": "Group sequences into batches with the same length. Saves memory and speeds up training considerably."},
    )
    save_steps: int = field(default=10, metadata={"help": "Save a checkpoint every X update steps."})
    logging_steps: int = field(default=10, metadata={"help": "Log every X update steps."})
    merge_and_push: Optional[bool] = field(
        default=False,
        metadata={"help": "Merge and push weights after training."},
    )
    output_dir: str = field(
        default="./results",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
###########################################################

parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses()[0]
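# Example invocation (hypothetical script name; each flag maps to a
# ScriptArguments field above, and booleans are passed as True/False):
#
#   python finetune_llama2_gptq.py \
#       --model_name TheBloke/Llama-2-7B-GPTQ \
#       --dataset_name timdettmers/openassistant-guanaco \
#       --max_steps 1000 \
#       --merge_and_push True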
def prepare_lora_model(args):
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
        print("=" * 80)

    # Load the entire model on GPU 0.
    device_map = {"": 0}
    # Switch to `device_map = "auto"` for multi-GPU training.
    # device_map = "auto"

    # The exllama kernel must be disabled: it is not stable for training.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        device_map=device_map,
        quantization_config=GPTQConfig(bits=4, use_exllama=False),
    )

    # See https://github.com/huggingface/transformers/pull/24906: for
    # fine-tuning Llama 2 models that have config.pretraining_tp > 1,
    # set pretraining_tp back to 1.
    model.config.pretraining_tp = 1

    lora_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    return model, lora_config, tokenizer
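# Note (addition, not in the original script): without an explicit
# `target_modules` argument, PEFT infers the LoRA target layers from the model
# architecture (for Llama this resolves to the query/value projections). A
# hypothetical config that adapts all attention projections would look like:
#
# lora_config = LoraConfig(
#     lora_alpha=script_args.lora_alpha,
#     lora_dropout=script_args.lora_dropout,
#     r=script_args.lora_r,
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
# )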
training_arguments = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
    max_grad_norm=script_args.max_grad_norm,
    max_steps=script_args.max_steps,
    warmup_ratio=script_args.warmup_ratio,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
)
####################################################
model, lora_config, tokenizer = prepare_lora_model(script_args)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Disable the KV cache during training; it is only useful for generation.
model.config.use_cache = False
dataset = load_dataset(script_args.dataset_name, split="train")
# Fix weird overflow issue with fp16 training.
tokenizer.padding_side = "right"

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=script_args.max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=script_args.packing,
)
trainer.train()

if script_args.merge_and_push:
    output_dir = os.path.join(script_args.output_dir, "final_checkpoints")
    trainer.model.save_pretrained(output_dir)

    # Free memory before merging the weights.
    del model
    torch.cuda.empty_cache()
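# Sketch (addition, not part of the original script): reload the saved adapter
# for inference. AutoPeftModelForCausalLM reads the GPTQ base model from the
# adapter config and re-attaches the LoRA weights; PEFT typically cannot merge
# LoRA weights into GPTQ-quantized layers, so the adapter is kept separate
# rather than merged.
#
# from peft import AutoPeftModelForCausalLM
#
# inference_model = AutoPeftModelForCausalLM.from_pretrained(
#     output_dir, device_map={"": 0}
# )
# inference_model.eval()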