3
"BigBirdForPreTraining"
5
"attention_probs_dropout_prob": 0.1,
6
"attention_type": "block_sparse",
8
"num_random_blocks": 3,
9
"gradient_checkpointing": false,
10
"hidden_act": "gelu_new",
11
"hidden_dropout_prob": 0.1,
13
"initializer_range": 0.02,
14
"intermediate_size": 3072,
15
"layer_norm_eps": 1e-12,
16
"max_position_embeddings": 4096,
17
"model_type": "big_bird",
18
"num_attention_heads": 12,
19
"num_hidden_layers": 12,
21
"position_embedding_type": "absolute",
22
"rescale_embeddings": false,
23
"transformers_version": "4.4.0.dev0",