gpt-neox

20B.yml
113 строк · 3.0 Кб
Перенос по словам
1
# DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100
2
# GPUs. Depending on your system configuration, you may need to change some parameters in order to fit
3
# the model in memory.
4

5
{
6
  # Tokenizer /  checkpoint settings - you will need to change these to the location you have them saved in
7
  "vocab_file": "./20B_checkpoints/20B_tokenizer.json",
8
  "save": "./20B_checkpoints",
9
  "load": "./20B_checkpoints",
10

11
  # If finetuning, edit the following to the location of your finetuning dataset:
12
  "data_path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document",
13

14
  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
15
  # across the node boundaries )
16
  "pipe_parallel_size": 4,
17
  "model_parallel_size": 2,
18

19
  # model settings
20
  "num_layers": 44,
21
  "hidden_size": 6144,
22
  "num_attention_heads": 64,
23
  "seq_length": 2048,
24
  "max_position_embeddings": 2048,
25
  "norm": "layernorm",
26
  "pos_emb": "rotary",
27
  "rotary_pct": 0.25,
28
  "no_weight_tying": true,
29
  "gpt_j_residual": true,
30
  "output_layer_parallelism": "column",
31
  "scaled_upper_triang_masked_softmax_fusion": true,
32
  "bias_gelu_fusion": true,
33
  "rope_fusion": false,
34
  "layernorm_fusion": false,
35

36
  # init methods
37
  "init_method": "small_init",
38
  "output_layer_init_method": "wang_init",
39

40
  # optimizer settings
41
  "optimizer": {
42
    "type": "Adam",
43
    "params": {
44
      "lr": 0.97e-4,
45
      "betas": [0.9, 0.95],
46
      "eps": 1.0e-8,
47
      }
48
      },
49

50
  "min_lr": 0.97e-5,
51

52
  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
53
  "zero_optimization": {
54
  "stage": 1,
55
  "allgather_partitions": True,
56
  "allgather_bucket_size": 1260000000,
57
  "overlap_comm": True,
58
  "reduce_scatter": True,
59
  "reduce_bucket_size": 1260000000,
60
  "contiguous_gradients": True,
61
  },
62

63
  # batch / data settings (assuming 96 GPUs)
64
  "train_micro_batch_size_per_gpu": 4,
65
  "gradient_accumulation_steps": 32,
66
  "data_impl": "mmap",
67
  "split": "995,4,1",
68

69
  # activation checkpointing
70
  "checkpoint_activations": true,
71
  "checkpoint_num_layers": 1,
72
  "partition_activations": false,
73
  "synchronize_each_layer": true,
74

75
  # regularization
76
  "gradient_clipping": 1.0,
77
  "weight_decay": 0.01,
78
  "hidden_dropout": 0,
79
  "attention_dropout": 0,
80

81
  # precision settings
82
  "fp16": {
83
    "fp16": true,
84
    "enabled": true,
85
    "loss_scale": 0,
86
    "loss_scale_window": 1000,
87
    "initial_scale_power": 12,
88
    "hysteresis": 2,
89
    "min_loss_scale": 1
90
    },
91

92
  # misc. training settings
93
  "train_iters": 150000,
94
  "lr_decay_iters": 150000,
95

96
  "distributed_backend": "nccl",
97
  "lr_decay_style": "cosine",
98
  "warmup": 0.01,
99
  "checkpoint_factor": 500, # this variable previously called `save-interval`
100
  "eval_interval": 1000,
101
  "eval_iters": 10,
102

103
  # logging
104
  "log_interval": 2,
105
  "steps_per_print": 2,
106
  "wall_clock_breakdown": false,
107

108
  ### NEW DATA: ####
109
  "tokenizer_type": "HFTokenizer",
110
  "tensorboard-dir": "./tensorboard",
111
  "log_dir": "./logs",
112

113
}
114
gpt-neox

Использование cookies