gpt-neox

175B.yml
92 строки · 2.3 Кб
Перенос по словам
1
# GPT-style pretraining setup (175B-parameter scale)
2
{
3
   # parallelism settings (you will want to change these based on your cluster setup, ideally placing pipeline stage
4
   # boundaries at node boundaries)
5
   "pipe_parallel_size": 1,
6
   "model_parallel_size": 1,
7

8
   # model settings
9
   "num_layers": 96,
10
   "hidden_size": 12288,
11
   "num_attention_heads": 96,
12
   "seq_length": 2048,
13
   "max_position_embeddings": 2048,
14
   "norm": "layernorm",
15
   "pos_emb": "rotary",
16
   "no_weight_tying": true,
17
   "gpt_j_residual": false,
18
   "output_layer_parallelism": "column",
19

20
   # these should provide some speedup but take a while to build; set to true if desired
21
   "scaled_upper_triang_masked_softmax_fusion": false,
22
   "bias_gelu_fusion": false,
23
   "rope_fusion": false,
24
   "layernorm_fusion": false,
25

26
   # init methods
27
   "init_method": "small_init",
28
   "output_layer_init_method": "wang_init",
29

30
   # optimizer settings
31
   "optimizer": {
32
     "type": "Adam",
33
     "params": {
34
       "lr": 0.00006,
35
       "betas": [0.9, 0.95],
36
       "eps": 1.0e-8,
37
     }
38
   },
39
   "min_lr": 0.000006,
40
   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
41
   "zero_optimization": {
42
    "stage": 1,
43
    "allgather_partitions": True,
44
    "allgather_bucket_size": 500000000,
45
    "overlap_comm": True,
46
    "reduce_scatter": True,
47
    "reduce_bucket_size": 500000000,
48
    "contiguous_gradients": True,
49
  },
50

51
   # batch / data settings
52
   "train_micro_batch_size_per_gpu": 4,
53
   "data_impl": "mmap",
54

55
   # activation checkpointing
56
   "checkpoint_activations": true,
57
   "checkpoint_num_layers": 1,
58
   "partition_activations": true,
59
   "synchronize_each_layer": true,
60

61
   # regularization
62
   "gradient_clipping": 1.0,
63
   "weight_decay": 0.1,
64
   "hidden_dropout": 0,
65
   "attention_dropout": 0,
66

67
   # precision settings
68
   "fp16": {
69
     "fp16": true,
70
     "enabled": true,
71
     "loss_scale": 0,
72
     "loss_scale_window": 1000,
73
     "hysteresis": 2,
74
     "min_loss_scale": 1
75
   },
76

77
   # misc. training settings
78
   "train_iters": 320000,
79
   "lr_decay_iters": 320000,
80
   "distributed_backend": "nccl",
81
   "lr_decay_style": "cosine",
82
   "warmup": 0.01,
83
   "checkpoint_factor": 10000,
84
   "eval_interval": 1000,
85
   "eval_iters": 10,
86

87
   # logging
88
   "log_interval": 100,
89
   "steps_per_print": 10,
90
   "keep_last_n_checkpoints": 4,
91
   "wall_clock_breakdown": true,
92
}
93
gpt-neox

Использование cookies