# coding=utf-8
# Copyright 2020 The Trax Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Reformer model configuration """


import logging

from .configuration_utils import PretrainedConfig


logger = logging.getLogger(__name__)

REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/config.json",
    "google/reformer-enwik8": "https://cdn.huggingface.co/google/reformer-enwik8/config.json",
}
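
# Example (illustrative): a configuration for one of the checkpoints above can be
# loaded through the inherited ``PretrainedConfig.from_pretrained`` method, e.g.
#
#     config = ReformerConfig.from_pretrained("google/reformer-crime-and-punishment")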


class ReformerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
    It is used to instantiate a Reformer model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        attention_head_size (:obj:`int`, optional, defaults to 64):
            Dimensionality of the projected key, query and value vectors.
        attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
            List of attention layer types in ascending order. It can be chosen between an
            LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
            For more information on the LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
            For more information on the LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
        axial_pos_embds (:obj:`bool`, optional, defaults to `True`):
            If `True`, use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__ .
        axial_norm_std (:obj:`float`, optional, defaults to 1.0):
            The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
        axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
            The position dims of the axial position encodings.
            During training the product of the position dims has to equal the sequence length (see the second example below).
            For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__ .
        axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
            The embedding dims of the axial position encodings.
            The sum of the embedding dims has to equal the hidden size.
            For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__ .
        chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
            The chunk size of the final language model feed forward head layer.
            A chunk size of 0 means that the feed forward layer is not chunked.
            A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
            For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
        chunk_size_feed_forward (:obj:`int`, optional, defaults to 0):
            The chunk size of all feed forward layers in the residual attention blocks.
            A chunk size of 0 means that the feed forward layer is not chunked.
            A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
            For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
        eos_token_id (:obj:`int`, optional, defaults to 2):
            The token id for the <EOS> token.
        feed_forward_size (:obj:`int`, optional, defaults to 512):
            Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
        hash_seed (:obj:`int`, optional, defaults to `None`):
            Seed that can be used to make locality sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposes. For evaluation and training purposes `hash_seed` should be left as `None` to ensure fully random rotations in the locality sensitive hashing scheme.
        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
            The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
            If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_size (:obj:`int`, optional, defaults to 256):
            Dimensionality of the output hidden states of the residual attention blocks.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        is_decoder (:obj:`bool`, optional, defaults to `False`):
            If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
            When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        local_attn_chunk_length (:obj:`int`, optional, defaults to 64):
            Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
        local_num_chunks_before (:obj:`int`, optional, defaults to 1):
            Number of previous neighbouring chunks to attend to in the LocalSelfAttention layer in addition to itself.
        local_num_chunks_after (:obj:`int`, optional, defaults to 0):
            Number of following neighbouring chunks to attend to in the LocalSelfAttention layer in addition to itself.
        local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.05):
            The dropout ratio for the attention probabilities in LocalSelfAttention.
        lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
            Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
        lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
            Number of previous neighbouring chunks to attend to in the LSHSelfAttention layer in addition to itself.
        lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
            Number of following neighbouring chunks to attend to in the LSHSelfAttention layer in addition to itself.
        lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.0):
            The dropout ratio for the attention probabilities in LSHSelfAttention.
        max_position_embeddings (:obj:`int`, optional, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        num_attention_heads (:obj:`int`, optional, defaults to 2):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`):
            Number of buckets the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
            The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
            The number of buckets (or the product of the factors) should approximately equal sequence length / lsh_attn_chunk_length (see the second example below). If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly.
        num_hashes (:obj:`int`, optional, defaults to 1):
            Number of hashing rounds (e.g. number of random rotations) in the locality sensitive hashing scheme.
            The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
        pad_token_id (:obj:`int`, optional, defaults to 0):
            The token id for the <PAD> token.
        vocab_size (:obj:`int`, optional, defaults to 320):
            Vocabulary size of the Reformer model. Defines the different tokens that
            can be represented by the `input_ids` passed to the forward method of :class:`~transformers.ReformerModel`.

    Example::

        >>> from transformers import ReformerModel, ReformerConfig

        >>> # Initializing a Reformer configuration
        >>> configuration = ReformerConfig()

        >>> # Initializing a Reformer model
        >>> model = ReformerModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
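
        >>> # An illustrative hand-written configuration (example values, not a released checkpoint).
        >>> # The product of `axial_pos_shape` has to equal the training sequence length, the entries of
        >>> # `axial_pos_embds_dim` have to sum to `hidden_size`, and the product of a factorized
        >>> # `num_buckets` should roughly equal sequence length / lsh_attn_chunk_length.
        >>> custom_configuration = ReformerConfig(
        ...     attn_layers=["local", "lsh", "local", "lsh"],
        ...     hidden_size=256,
        ...     max_position_embeddings=4096,
        ...     axial_pos_shape=[64, 64],        # 64 * 64 = 4096
        ...     axial_pos_embds_dim=[64, 192],   # 64 + 192 = 256
        ...     num_buckets=[8, 8],              # 8 * 8 = 64, roughly 4096 / 64
        ... )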
    """
    model_type = "reformer"

    def __init__(
        self,
        attention_head_size=64,
        attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
        axial_norm_std=1.0,
        axial_pos_embds=True,
        axial_pos_shape=[64, 64],
        axial_pos_embds_dim=[64, 192],
        chunk_size_lm_head=0,
        chunk_size_feed_forward=0,
        eos_token_id=2,
        feed_forward_size=512,
        hash_seed=None,
        hidden_act="relu",
        hidden_dropout_prob=0.05,
        hidden_size=256,
        initializer_range=0.02,
        is_decoder=False,
        layer_norm_eps=1e-12,
        local_num_chunks_before=1,
        local_num_chunks_after=0,
        local_attention_probs_dropout_prob=0.05,
        local_attn_chunk_length=64,
        lsh_attn_chunk_length=64,
        lsh_attention_probs_dropout_prob=0.0,
        lsh_num_chunks_before=1,
        lsh_num_chunks_after=0,
        max_position_embeddings=4096,
        num_attention_heads=2,
        num_buckets=None,
        num_hashes=1,
        pad_token_id=0,
        vocab_size=320,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_decoder=is_decoder, **kwargs)

        self.hash_seed = hash_seed
        self.vocab_size = vocab_size
        self.attention_head_size = attention_head_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hashes = num_hashes
        # The number of hidden layers is given by the number of attention layer types.
        self.num_hidden_layers = len(attn_layers)
        # A factorized `num_buckets` passed as a list is stored as a tuple.
        self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets
        self.lsh_attn_chunk_length = lsh_attn_chunk_length
        self.local_attn_chunk_length = local_attn_chunk_length
        self.lsh_num_chunks_after = lsh_num_chunks_after
        self.lsh_num_chunks_before = lsh_num_chunks_before
        self.local_num_chunks_after = local_num_chunks_after
        self.local_num_chunks_before = local_num_chunks_before
        self.hidden_act = hidden_act
        self.feed_forward_size = feed_forward_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob
        self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.axial_pos_embds = axial_pos_embds
        # Axial position shape and embedding dims are stored as tuples.
        self.axial_pos_shape = tuple(axial_pos_shape)
        self.axial_pos_embds_dim = tuple(axial_pos_embds_dim)
        self.axial_norm_std = axial_norm_std
        self.chunk_size_lm_head = chunk_size_lm_head
        self.chunk_size_feed_forward = chunk_size_feed_forward
        self.attn_layers = attn_layers