gpt-neox
# GPT-2 pretraining setup
{
  # parallelism settings (you will want to change these based on your cluster setup, ideally scheduling pipeline stages
  # across the node boundaries)
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,
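  # with both sizes set to 1, neither pipeline nor tensor parallelism is used, and any additional GPUs are used for data parallelism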

  # model settings
  "num_layers": 12,
  "hidden_size": 768,
  "num_attention_heads": 12,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "norm": "layernorm",
  "pos_emb": "rotary",
  "no_weight_tying": true,
  "use_bnb_optimizer": true,
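  # "no_weight_tying" keeps the input and output embedding matrices separate; "use_bnb_optimizer" enables the bitsandbytes optimizer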

  # these should provide some speedup, but they take a while to build; set to true if desired
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0006,
      "betas": [0.9, 0.999],
      "eps": 1.0e-8,
    }
  },

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 0,
    "allgather_partitions": true,
    "allgather_bucket_size": 500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": true,
  },
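  # ZeRO stage 0 leaves optimizer-state partitioning disabled; the allgather/reduce options above only take effect at stage 1 and above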

  # batch / data settings
  "train_micro_batch_size_per_gpu": 4,
  "data_impl": "mmap",
  "split": "949,50,1",
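  # "split" weights the dataset into train/validation/test portions, here roughly 94.9% / 5% / 0.1%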

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,
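  # "checkpoint_num_layers": 1 recomputes activations one transformer layer at a time; "partition_activations" splits the saved activations across model-parallel ranks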

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.0,
  "hidden_dropout": 0.0,
  "attention_dropout": 0.0,

  # precision settings
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
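  # "loss_scale": 0 selects dynamic loss scaling; the window, hysteresis, and minimum control how the dynamic scale is raised and lowered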

  # misc. training settings
  "train_iters": 320000,
  "lr_decay_iters": 320000,
  "distributed_backend": "nccl",
  "lr_decay_style": "cosine",
  "warmup": 0.01,
  "checkpoint_factor": 10000,
  "eval_interval": 1000,
  "eval_iters": 10,
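  # "warmup": 0.01 warms the learning rate up over the first 1% of iterations before the cosine decay; "checkpoint_factor" saves a checkpoint every 10,000 iterations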

  # logging
  "log_interval": 100,
  "steps_per_print": 10,
  "keep_last_n_checkpoints": 4,
  "wall_clock_breakdown": true,
}