# paddlenlp
Global:
  device: gpu
  seed: 1024

  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1


Engine:
  max_steps: 500000
  num_train_epochs: 1
  accumulate_steps:
  logging_freq: 1
  eval_freq: 500
  eval_iters: 10
  test_iters:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "O2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
  save_load:
    save_steps: 1000
    save_epoch: 1
    output_dir: ./output
    ckpt_dir:

Model:
  module: "GPTModule"
  name: "GPT"
  vocab_size_divisible_unit: 128
  fused_linear: False
  fuse_attn_qkv: True
  scale_qk_by_layer_num: True
  sequence_parallel: False
  use_flash_attn: False
  fused_softmax_with_triangular: True

Data:
  Train:
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [969, 30, 1]
      max_seq_len: 1024
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 1
      return_list: False
      collate_fn: gpt_collate_fn

  Eval:
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [969, 30, 1]
      max_seq_len: 1024
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 1
      return_list: False
      collate_fn: gpt_collate_fn

Optimizer:
  name: FusedAdamW
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.999
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 360000
    warmup_rate: 0.01
    max_lr: 5.0e-5
    min_lr: 1.0e-5
    use_increments: True
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0
  tensor_fusion: False

Profiler:
  enable: False
  scheduler: [1, 5]
  profiler_log: profiler_log
  detailed: False

Distributed:
  fuse_sequence_parallel_allreduce: False
104