"pipe_parallel_size": 1,
"model_parallel_size": 1,
"make_vocab_size_divisible_by": 1,
"intermediate_size": 28672,
"num_attention_heads": 64,
"max_position_embeddings": 4096,
"rotary_emb_base": 1000000,
"no_weight_tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",
"rms_norm_epsilon": 1.0e-5,
"attention_config": [[["flash"], 48]],
"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": false,
"use_bias_in_norms": false,
"use_bias_in_attn_linear": false,