{
    "trainer": {
        "evaluation_strategy": "steps",
        "per_device_train_batch_size": 1,
        "per_device_eval_batch_size": 1,
        "gradient_accumulation_steps": 16,
        "eval_steps": 150,
        "save_steps": 150,
        "logging_steps": 5,
        "learning_rate": 0.00003,
        "num_train_epochs": 3,
        "lr_scheduler_type": "cosine",
        "warmup_steps": 100,
        "fp16": false,
        "bf16": true,
        "gradient_checkpointing": true,
        "torch_compile": false,
        "optim": "adamw_torch",
        "half_precision_backend": "auto",
        "fp16_opt_level": "O2"
    },
    "deepspeed": {
        "bf16": {
            "enabled": true
        },
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
                "weight_decay": "auto"
            }
        },
        "zero_optimization": {
            "stage": 3,
            "offload_param": {
                "device": "cpu",
                "pin_memory": true
            },
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": true
            },
            "overlap_comm": true,
            "round_robin_gradients": true,
            "gather_16bit_weights_on_model_save": true,
            "allgather_bucket_size": 2e8,
            "sub_group_size": 1e9
        },
        "train_batch_size": "auto",
        "gradient_accumulation_steps": "auto"
    },
    "model_name": "bigscience/mt0-xxl-mt",
    "model_type": "seq2seq",
    "max_source_tokens_count": 256,
    "max_target_tokens_count": 256
}
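
A minimal sketch of how a config in this shape is typically consumed, assuming the standard Hugging Face transformers API: the "trainer" block maps onto TrainingArguments keyword arguments, and the "deepspeed" block is passed as a dict so that its "auto" fields are resolved from the surrounding arguments. The file name "mt0_xxl.json" and the output directory are illustrative, not part of the original config.

import json

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments

with open("mt0_xxl.json") as f:
    config = json.load(f)

# "model_name" selects the checkpoint; "model_type": "seq2seq" implies
# an encoder-decoder class such as AutoModelForSeq2SeqLM.
tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
model = AutoModelForSeq2SeqLM.from_pretrained(config["model_name"])

# Unpack the "trainer" section directly into TrainingArguments and hand
# the DeepSpeed section over as a dict (transformers accepts either a
# path or a dict for the `deepspeed` argument).
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    deepspeed=config["deepspeed"],
    **config["trainer"],
)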