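# Axolotl-style QLoRA config: fine-tune mistralai/Mistral-7B-v0.1 on the
# mhenrichsen/alpaca_2k_test dataset; adapter weights are written to ./qlora-out.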
base_model: mistralai/Mistral-7B-v0.1
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
is_mistral_derived_model: true

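# QLoRA: the frozen base model loads in 4-bit via bitsandbytes; only the LoRA
# adapter weights configured below are trained.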
load_in_8bit: false
load_in_4bit: true
strict: false

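# ~2k-row Alpaca-format instruction dataset; val_set_size holds out 5% for eval.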
datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./qlora-out

# hub_model_id: manishiitg/mhenrichsen-alpaca_2k_test # TODO: Replace with hub model id
# hf_use_auth_token: false # TODO: push as private or public model

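# Train a fresh QLoRA adapter; point lora_model_dir at an existing adapter to
# continue from it instead.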
adapter: qlora
lora_model_dir:

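# Full 8k context; sample packing concatenates multiple short examples into
# each 8192-token sequence to cut padding waste.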
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

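# LoRA rank 32, alpha 16 (scaling alpha/r = 0.5); adapters attach to all of
# Mistral's attention and MLP linear projections.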
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

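# Weights & Biases logging; left blank here, so runs are not reported to W&B.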
wandb_project:
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:

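# Effective batch size per device = micro_batch_size * gradient_accumulation_steps = 8;
# bitsandbytes 8-bit AdamW trims optimizer-state memory.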
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

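# train_on_inputs: false masks prompt tokens out of the loss; bf16 training
# assumes an Ampere-or-newer GPU.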
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false

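# Gradient checkpointing trades compute for memory; FlashAttention speeds up
# attention over the long packed sequences.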
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

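# A fractional eval_steps is a ratio of total training steps (0.05 = eval every 5%).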
warmup_steps: 10
eval_steps: 0.05
eval_table_size:
eval_table_max_new_tokens: 128
save_steps:
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
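# Standard Llama-tokenizer special tokens.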
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"