5
"attention_probs_dropout_prob": 0.1,
6
"gradient_checkpointing": false,
8
"hidden_dropout_prob": 0.1,
10
"initializer_range": 0.02,
11
"intermediate_size": 3072,
12
"layer_norm_eps": 1e-12,
13
"max_position_embeddings": 4096,
15
"num_attention_heads": 12,
16
"num_hidden_layers": 12,
18
"pre_layer_norm": true,
19
"last_layer_norm": false,
20
"position_embedding_type": "rotary",
23
"transformers_version": "4.6.0.dev0",
27
"sparse_config_cls": "deepspeed.ops.sparse_attention:BigBirdSparsityConfig",
31
"different_layout_per_head": true,
32
"num_sliding_window_blocks": 3,
33
"num_global_blocks": 2,
34
"num_random_blocks": 3