gpt-neox

800M.yml
86 строк · 1.9 Кб
Перенос по словам
# GPT-NeoX model configuration (800M.yml).
# This is YAML written in JSON-like flow style: `#` comments and trailing
# commas are valid here, so load it with a YAML parser, not a strict JSON one.
{
  # parallelism settings
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,

  # model settings
  "num_layers": 16,
  "hidden_size": 2048,
  "num_attention_heads": 8,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "pos_emb": "rotary",
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",

  # fusion options (all disabled in this config)
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.00025,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  # minimum learning rate: one tenth of the optimizer "lr" above
  "min_lr": 0.000025,

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": true,
  },

  # batch size and data loading
  "train_micro_batch_size_per_gpu": 16,
  "gradient_accumulation_steps": 1,
  "data_impl": "mmap",
  "num_workers": 1,

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  "hidden_dropout": 0,
  "attention_dropout": 0,

  # precision settings
  "fp16": {
    "fp16": true,
    "enabled": true,
    # NOTE(review): per the DeepSpeed fp16 config docs, 0 selects dynamic loss
    # scaling and initial_scale_power is the exponent of the starting scale
    # (2^12 here) -- confirm against the DeepSpeed version in use.
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1,
  },

  # training schedule, checkpointing and evaluation
  # (lr_decay_iters equals train_iters, so the decay spans the whole run)
  "train_iters": 143000,
  "lr_decay_iters": 143000,
  "distributed_backend": "nccl",
  "lr_decay_style": "cosine",
  "warmup": 0.01,
  "checkpoint_factor": 1000,
  "eval_interval": 40000,
  "eval_iters": 10,

  # logging
  "log_interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,
}
gpt-neox

Использование cookies