CSS-LM

tokenization_camembert.py · 287 lines · 11.8 KB
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for Camembert model."""


import logging
import os
from shutil import copyfile
from typing import List, Optional

import sentencepiece as spm

from .tokenization_utils import PreTrainedTokenizer
from .tokenization_xlnet import SPIECE_UNDERLINE


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "camembert-base": None,
}

SHARED_MODEL_IDENTIFIERS = [
    # Load with
    # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
    "Musixmatch/umberto-commoncrawl-cased-v1",
    "Musixmatch/umberto-wikipedia-uncased-v1",
]


class CamembertTokenizer(PreTrainedTokenizer):
    """
        Adapted from RobertaTokenizer and XLNetTokenizer
        SentencePiece based tokenizer. Peculiarities:

        - requires `SentencePiece <https://github.com/google/sentencepiece>`_

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning
                of sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        cls_token (:obj:`string`, `optional`, defaults to "<s>"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.

    Attributes:
        sp_model (:obj:`SentencePieceProcessor`):
            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
        **kwargs
    ):
        super().__init__(
            max_len=512,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
        # sentencepiece vocabulary (this is the case for <s> and </s>).
        self.fairseq_tokens_to_ids = {"<s>NOTUSED": 0, "<pad>": 1, "</s>NOTUSED": 2, "<unk>": 3}
        self.fairseq_offset = len(self.fairseq_tokens_to_ids)
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
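        # Illustrative note (not in the original file): with the standard camembert-base
        # SentencePiece model, the id layout built above is, roughly:
        #   {"<s>NOTUSED": 0, "<pad>": 1, "</s>NOTUSED": 2, "<unk>": 3}    fairseq specials
        #   SentencePiece piece i  ->  i + self.fairseq_offset (= i + 4)   see _convert_token_to_id
        #   "<mask>"               ->  len(self.sp_model) + 4              appended after the shifted vocab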

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        A CamemBERT sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """

        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
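        # Hedged sketch (not in the original file) of what the method above returns;
        # the integer ids 10, 11, 12 are placeholders, only the positions of
        # cls_token_id / sep_token_id mirror the formats documented in the docstring:
        #   build_inputs_with_special_tokens([10, 11])        -> [<s>, 10, 11, </s>]
        #   build_inputs_with_special_tokens([10, 11], [12])  -> [<s>, 10, 11, </s>, </s>, 12, </s>]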

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
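        # Hedged sketch (not in the original file): for the same placeholder inputs as above,
        # the mask marks special-token positions with 1 and sequence tokens with 0:
        #   get_special_tokens_mask([10, 11], [12])  -> [1, 0, 0, 1, 1, 0, 1]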

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.

        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
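        # Hedged sketch (not in the original file): the result is all zeros because CamemBERT,
        # like RoBERTa, does not use token type ids; its length matches the built pair sequence:
        #   create_token_type_ids_from_sequences([10, 11], [12])  -> [0, 0, 0, 0, 0, 0, 0]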

    @property
    def vocab_size(self):
        return len(self.fairseq_tokens_to_ids) + len(self.sp_model)

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        elif self.sp_model.PieceToId(token) == 0:
            # Convert sentence piece unk token to fairseq unk token index
            return self.unk_token_id
        return self.fairseq_offset + self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use CamembertTokenizer: https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
            raise
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) into a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory):
        """
        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
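

# Minimal usage sketch (not part of the original file). It assumes this module is importable
# from the surrounding transformers-style package and that the "camembert-base" vocabulary
# listed in PRETRAINED_VOCAB_FILES_MAP is still downloadable; adjust the import for your setup.
#
#   from transformers import CamembertTokenizer
#
#   tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
#   ids = tokenizer.encode("J'aime le camembert !")          # wraps the pieces in <s> ... </s>
#   text = tokenizer.decode(ids, skip_special_tokens=True)   # back to a plain string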