CSS-LM

configuration_xlnet.py
237 строк · 11.3 Кб
Перенос по словам
1
# coding=utf-8
2
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
3
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
4
#
5
# Licensed under the Apache License, Version 2.0 (the "License");
6
# you may not use this file except in compliance with the License.
7
# You may obtain a copy of the License at
8
#
9
#     http://www.apache.org/licenses/LICENSE-2.0
10
#
11
# Unless required by applicable law or agreed to in writing, software
12
# distributed under the License is distributed on an "AS IS" BASIS,
13
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
# See the License for the specific language governing permissions and
15
# limitations under the License.
16
""" XLNet configuration """
17

18
import logging
19
import warnings
20

21
from .configuration_utils import PretrainedConfig
22

23

24
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
25

26
# Maps each pretrained XLNet shortcut name to the URL of its configuration JSON.
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
    "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
}
30

31

32
class XLNetConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
    It is used to instantiate an XLNet model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 32000):
            Vocabulary size of the XLNet model. Defines the different tokens that
            can be represented by the `input_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
        d_model (:obj:`int`, optional, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer. Must be divisible by ``n_head``.
        n_layer (:obj:`int`, optional, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        n_head (:obj:`int`, optional, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        d_inner (:obj:`int`, optional, defaults to 4096):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        ff_activation (:obj:`string`, optional, defaults to "gelu"):
            The non-linear activation function (function or string) in the
            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
        untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
            Untie relative position biases.
        attn_type (:obj:`string`, optional, defaults to "bi"):
            The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
            The number of tokens to cache. The key/value pairs that have already been pre-computed
            in a previous forward pass won't be re-computed. See the
            `quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
            for more information. A :obj:`FutureWarning` is emitted when this is :obj:`None` or 0.
        reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
            The number of tokens in the current batch to be cached and reused in the future.
        bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to use bidirectional input pipeline. Usually set to `True` during
            pretraining and `False` during finetuning.
        clamp_len (:obj:`int`, optional, defaults to -1):
            Clamp all relative distances larger than clamp_len.
            Setting this attribute to -1 means no clamping.
        same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to use the same attention length for each token.
        summary_type (:obj:`string`, optional, defaults to "last"):
            Argument used when doing sequence summary. Used in
            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
            Is one of the following options:

            - 'last' => take the last token hidden state (like XLNet)
            - 'first' => take the first token hidden state (like Bert)
            - 'mean' => take the mean of all tokens hidden states
            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
            - 'attn' => Not implemented now, use multi-head attention
        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used in
            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
            Add a projection after the vector extraction.
        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to "tanh"):
            Argument used when doing sequence summary. Used in
            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
            'tanh' => add a tanh activation to the output, Other => no activation.
        summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
            Argument used when doing sequence summary. Used in
            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
            Add a dropout after the projection and activation.
        start_n_top (:obj:`int`, optional, defaults to 5):
            Used in the SQuAD evaluation script for XLM and XLNet.
        end_n_top (:obj:`int`, optional, defaults to 5):
            Used in the SQuAD evaluation script for XLM and XLNet.
        pad_token_id (:obj:`int`, optional, defaults to 5):
            Id of the padding token. Passed to :class:`~transformers.PretrainedConfig` and stored on the instance.
        bos_token_id (:obj:`int`, optional, defaults to 1):
            Id of the beginning-of-sequence token. Passed to :class:`~transformers.PretrainedConfig` and
            stored on the instance.
        eos_token_id (:obj:`int`, optional, defaults to 2):
            Id of the end-of-sequence token. Passed to :class:`~transformers.PretrainedConfig` and
            stored on the instance.
        kwargs:
            Remaining keyword arguments, forwarded unchanged to :class:`~transformers.PretrainedConfig`.
            The following keys are not named parameters of this constructor and are only handled
            through ``kwargs``:

            - ``d_head`` (:obj:`int`): if given, it must equal ``d_model // n_head``; the stored
              ``d_head`` attribute is always recomputed as ``d_model // n_head``.
            - ``summary_proj_to_labels`` (:obj:`boolean`): argument used when doing sequence summary.
              If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
              Its default is decided by the base class, not by this constructor.
            - ``use_cache`` (:obj:`bool`): whether or not the model should return the last pre-computed
              hidden states. Its default is decided by the base class, not by this constructor.

              .. note::
                  This flag behaves differently from other models: it just controls the inference
                  behavior, during training the model always uses ``use_cache=True``.

    Example::

        >>> from transformers import XLNetConfig, XLNetModel

        >>> # Initializing a XLNet configuration
        >>> configuration = XLNetConfig()

        >>> # Initializing a model from the configuration
        >>> model = XLNetModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "xlnet"

    def __init__(
        self,
        vocab_size=32000,
        d_model=1024,
        n_layer=24,
        n_head=16,
        d_inner=4096,
        ff_activation="gelu",
        untie_r=True,
        attn_type="bi",
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        dropout=0.1,
        mem_len=None,
        reuse_len=None,
        bi_data=False,
        clamp_len=-1,
        same_length=False,
        summary_type="last",
        summary_use_proj=True,
        summary_activation="tanh",
        summary_last_dropout=0.1,
        start_n_top=5,
        end_n_top=5,
        pad_token_id=5,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs
    ):
        """Constructs XLNetConfig.

        Stores every named argument as an attribute of the same name, derives ``d_head``
        from ``d_model // n_head``, and forwards the special token ids together with any
        extra ``kwargs`` to the base-class constructor.

        Raises:
            AssertionError: if ``d_model`` is not divisible by ``n_head``, or if a ``d_head``
                keyword argument is supplied that differs from ``d_model // n_head``.

        Warns:
            FutureWarning: if ``mem_len`` is ``None`` or ``0``.
        """
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layer = n_layer
        self.n_head = n_head
        # AssertionError is kept (rather than ValueError) so existing callers see the same exception type.
        assert d_model % n_head == 0, f"`d_model` ({d_model}) should be divisible by `n_head` ({n_head})"
        if "d_head" in kwargs:
            # `d_head` is accepted only as a consistency check; the stored value is always derived below.
            assert (
                kwargs["d_head"] == d_model // n_head
            ), f"`d_head` ({kwargs['d_head']}) should be equal to `d_model // n_head` ({d_model // n_head})"
        self.d_head = d_model // n_head
        self.ff_activation = ff_activation
        self.d_inner = d_inner
        self.untie_r = untie_r
        self.attn_type = attn_type

        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        self.dropout = dropout
        self.mem_len = mem_len
        self.reuse_len = reuse_len
        self.bi_data = bi_data
        self.clamp_len = clamp_len
        self.same_length = same_length

        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_last_dropout = summary_last_dropout
        self.start_n_top = start_n_top
        self.end_n_top = end_n_top

        # Set explicitly here in addition to being passed to the base-class constructor above.
        self.bos_token_id = bos_token_id
        self.pad_token_id = pad_token_id
        self.eos_token_id = eos_token_id

        if mem_len is None or mem_len == 0:
            warnings.warn(
                "This config doesn't use attention memories, a core feature of XLNet."
                " Consider setting `mem_len` to a non-zero value, for example "
                "`xlnet = XLNetLMHeadModel.from_pretrained('xlnet-base-cased', mem_len=1024)`,"
                " for accurate training performance as well as an order of magnitude faster inference."
                " Starting from version 3.5.0, the default parameter will be 1024, following"
                " the implementation in https://arxiv.org/abs/1906.08237",
                FutureWarning,
            )

    @property
    def max_position_embeddings(self):
        """Always ``-1`` for this configuration."""
        return -1

    @property
    def n_token(self):  # Backward compatibility
        """Alias for ``vocab_size``, kept for backward compatibility."""
        return self.vocab_size

    @n_token.setter
    def n_token(self, value):  # Backward compatibility
        self.vocab_size = value

    @property
    def hidden_size(self):
        """Alias for ``d_model``."""
        return self.d_model

    @property
    def num_attention_heads(self):
        """Alias for ``n_head``."""
        return self.n_head

    @property
    def num_hidden_layers(self):
        """Alias for ``n_layer``."""
        return self.n_layer
238
CSS-LM

Использование cookies