gpt-neox

Форк
0
/
760M.yml 
93 строки · 2.3 Кб
# GPT-2 pretraining setup (GPT-NeoX 760M config)
# Parsed by gpt-neox via YAML, so `#` comments are allowed; values are kept
# JSON-compatible (lowercase true/false, no trailing commas) for portability.
{
   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
   # across the node boundaries )
   "pipe_parallel_size": 1,
   "model_parallel_size": 1,

   # model settings
   "num_layers": 24,
   "hidden_size": 1536,
   "num_attention_heads": 16,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
   "norm": "layernorm",
   "pos_emb": "rotary",
   "no_weight_tying": true,
   "gpt_j_residual": false,
   "output_layer_parallelism": "column",

   # these should provide some speedup but takes a while to build, set to true if desired
   "scaled_upper_triang_masked_softmax_fusion": false,
   "bias_gelu_fusion": false,
   "rope_fusion": false,
   "layernorm_fusion": false,

   # init methods
   "init_method": "small_init",
   "output_layer_init_method": "wang_init",

   # optimizer settings
   "optimizer": {
     "type": "Adam",
     "params": {
       "lr": 0.00025,
       "betas": [0.9, 0.999],
       "eps": 1.0e-8
     }
   },
   "min_lr": 0.000025,

   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
   "zero_optimization": {
     "stage": 1,
     "allgather_partitions": true,
     "allgather_bucket_size": 500000000,
     "overlap_comm": true,
     "reduce_scatter": true,
     "reduce_bucket_size": 500000000,
     "contiguous_gradients": true
   },

   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data_impl": "mmap",

   # activation checkpointing
   "checkpoint_activations": true,
   "checkpoint_num_layers": 1,
   "partition_activations": true,
   "synchronize_each_layer": true,

   # regularization
   "gradient_clipping": 1.0,
   "weight_decay": 0.1,
   "hidden_dropout": 0,
   "attention_dropout": 0,

   # precision settings
   "fp16": {
     "fp16": true,
     "enabled": true,
     "loss_scale": 0,
     "loss_scale_window": 1000,
     "hysteresis": 2,
     "min_loss_scale": 1
   },

   # misc. training settings
   "train_iters": 320000,
   "lr_decay_iters": 320000,
   "distributed_backend": "nccl",
   "lr_decay_style": "cosine",
   "warmup": 0.01,
   "checkpoint_factor": 10000,
   "eval_interval": 1000,
   "eval_iters": 10,

   # logging
   "log_interval": 100,
   "steps_per_print": 10,
   "keep_last_n_checkpoints": 4,
   "wall_clock_breakdown": true
}

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.