gpt-neox
# GPT-2 pretraining setup
{
  # parallelism settings (you will want to change these based on your cluster setup, ideally scheduling pipeline stages
  # across the node boundaries)
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,
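  # with both sizes set to 1, neither pipeline nor tensor parallelism is used, and any additional GPUs are used for data parallelism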

  # model settings
  "num_layers": 12,
  "hidden_size": 768,
  "num_attention_heads": 12,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "norm": "layernorm",
  "pos_emb": "rotary",
  "no_weight_tying": true,
  "use_bnb_optimizer": true,
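  # "no_weight_tying" keeps the input and output embedding matrices separate; "use_bnb_optimizer" enables the bitsandbytes optimizer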

  # these should provide some speedup, but they take a while to build; set to true if desired
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0006,
      "betas": [0.9, 0.999],
      "eps": 1.0e-8,
    }
  },

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 0,
    "allgather_partitions": true,
    "allgather_bucket_size": 500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": true,
  },
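  # ZeRO stage 0 leaves optimizer-state partitioning disabled; the allgather/reduce options above only take effect at stage 1 and above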

  # batch / data settings
  "train_micro_batch_size_per_gpu": 4,
  "data_impl": "mmap",
  "split": "949,50,1",
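  # "split" weights the dataset into train/validation/test portions, here roughly 94.9% / 5% / 0.1%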

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,
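  # "checkpoint_num_layers": 1 recomputes activations one transformer layer at a time; "partition_activations" splits the saved activations across model-parallel ranks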

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.0,
  "hidden_dropout": 0.0,
  "attention_dropout": 0.0,

  # precision settings
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
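  # "loss_scale": 0 selects dynamic loss scaling; the window, hysteresis, and minimum control how the dynamic scale is raised and lowered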

  # misc. training settings
  "train_iters": 320000,
  "lr_decay_iters": 320000,
  "distributed_backend": "nccl",
  "lr_decay_style": "cosine",
  "warmup": 0.01,
  "checkpoint_factor": 10000,
  "eval_interval": 1000,
  "eval_iters": 10,
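  # "warmup": 0.01 warms the learning rate up over the first 1% of iterations before the cosine decay; "checkpoint_factor" saves a checkpoint every 10,000 iterations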

  # logging
  "log_interval": 100,
  "steps_per_print": 10,
  "keep_last_n_checkpoints": 4,
  "wall_clock_breakdown": true,
}