{
    "trainer": {
        "evaluation_strategy": "steps",
        "per_device_train_batch_size": 1,
        "per_device_eval_batch_size": 1,
        "gradient_accumulation_steps": 16,
        "eval_steps": 150,
        "save_steps": 150,
        "logging_steps": 5,
        "learning_rate": 0.00003,
        "num_train_epochs": 3,
        "lr_scheduler_type": "cosine",
        "warmup_steps": 100,
        "fp16": false,
        "bf16": true,
        "gradient_checkpointing": true,
        "torch_compile": false,
        "optim": "adamw_torch",
        "half_precision_backend": "auto",
        "fp16_opt_level": "O2"
    },
    "deepspeed": {
        "bf16": {
            "enabled": true
        },
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
                "weight_decay": "auto"
            }
        },
        "zero_optimization": {
            "stage": 3,
            "offload_param": {
                "device": "cpu",
                "pin_memory": true
            },
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": true
            },
            "overlap_comm": true,
            "round_robin_gradients": true,
            "gather_16bit_weights_on_model_save": true,
            "allgather_bucket_size": 2e8,
            "sub_group_size": 1e9
        },
        "train_batch_size": "auto",
        "gradient_accumulation_steps": "auto"
    },
    "model_name": "bigscience/mt0-xxl-mt",
    "model_type": "seq2seq",
    "max_source_tokens_count": 256,
    "max_target_tokens_count": 256
}
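
A minimal sketch of how a config in this shape is typically consumed, assuming the standard Hugging Face transformers API: the "trainer" block maps onto TrainingArguments keyword arguments, and the "deepspeed" block is passed as a dict so that its "auto" fields are resolved from the surrounding arguments. The file name "mt0_xxl.json" and the output directory are illustrative, not part of the original config.

import json

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments

with open("mt0_xxl.json") as f:
    config = json.load(f)

# "model_name" selects the checkpoint; "model_type": "seq2seq" implies
# an encoder-decoder class such as AutoModelForSeq2SeqLM.
tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
model = AutoModelForSeq2SeqLM.from_pretrained(config["model_name"])

# Unpack the "trainer" section directly into TrainingArguments and hand
# the DeepSpeed section over as a dict (transformers accepts either a
# path or a dict for the `deepspeed` argument).
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    deepspeed=config["deepspeed"],
    **config["trainer"],
)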