# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for Camembert model."""


import logging
import os
from shutil import copyfile
from typing import List, Optional

import sentencepiece as spm

from .tokenization_utils import PreTrainedTokenizer
from .tokenization_xlnet import SPIECE_UNDERLINE


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "camembert-base": None,
}

SHARED_MODEL_IDENTIFIERS = [
    # Load with
    # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
    "Musixmatch/umberto-commoncrawl-cased-v1",
    "Musixmatch/umberto-wikipedia-uncased-v1",
]

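# A minimal loading sketch (an assumption, not run at import time: it requires
# network access to the model hub; "camembert-base" is the identifier mapped in
# PRETRAINED_VOCAB_FILES_MAP above):
#
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained("camembert-base")
#     tokenizer.tokenize("J'aime le camembert !")
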
class CamembertTokenizer(PreTrainedTokenizer):
    """
    Adapted from RobertaTokenizer and XLNetTokenizer. SentencePiece based tokenizer. Peculiarities:

    - requires `SentencePiece <https://github.com/google/sentencepiece>`_

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier
            token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning
                of sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        cls_token (:obj:`string`, `optional`, defaults to "<s>"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.

    Attributes:
        sp_model (:obj:`SentencePieceProcessor`):
            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
        **kwargs
    ):
        super().__init__(
            max_len=512,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
        # sentencepiece vocabulary (this is the case for <s> and </s>).
        self.fairseq_tokens_to_ids = {"<s>NOTUSED": 0, "<pad>": 1, "</s>NOTUSED": 2, "<unk>": 3}
        self.fairseq_offset = len(self.fairseq_tokens_to_ids)
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

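    # Illustrative sketch of the resulting id layout (assumes a SentencePiece
    # model with N pieces; the concrete values depend on the loaded vocabulary):
    #
    #     ids 0..3   -> fairseq specials: <s>NOTUSED, <pad>, </s>NOTUSED, <unk>
    #     ids 4..N+3 -> sentencepiece piece i maps to id i + self.fairseq_offset
    #     id  N+4    -> <mask>, appended after the shifted vocabulary
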
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        A CamemBERT sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """

        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

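    # A minimal usage sketch (assumes a tokenizer instance; ids 10 and 11 are
    # placeholders, not real vocabulary entries):
    #
    #     tokenizer.build_inputs_with_special_tokens([10, 11])
    #     # -> [cls_token_id, 10, 11, sep_token_id]          i.e. <s> X </s>
    #     tokenizer.build_inputs_with_special_tokens([10], [11])
    #     # -> [cls, 10, sep, sep, 11, sep]                  i.e. <s> A </s></s> B </s>
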
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

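    # Sketch of the expected mask, mirroring the pair format built above
    # (placeholder ids):
    #
    #     tokenizer.get_special_tokens_mask([10], [11])
    #     # -> [1, 0, 1, 1, 0, 1]   for <s> A </s></s> B </s>
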
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

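    # Unlike BERT, the returned segment ids are all zeros, one per position of
    # the full special-token sequence (placeholder ids):
    #
    #     tokenizer.create_token_type_ids_from_sequences([10], [11])
    #     # -> [0, 0, 0, 0, 0, 0]
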
    @property
    def vocab_size(self):
        return len(self.fairseq_tokens_to_ids) + len(self.sp_model)

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        elif self.sp_model.PieceToId(token) == 0:
            # Convert sentence piece unk token to fairseq unk token index
            return self.unk_token_id
        return self.fairseq_offset + self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

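    # Round-trip sketch (assumes "▁Salut" is a piece in the loaded model; the
    # concrete id depends on the vocabulary):
    #
    #     i = tokenizer._convert_token_to_id("▁Salut")  # sp id + fairseq_offset
    #     tokenizer._convert_id_to_token(i)             # -> "▁Salut"
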
    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use CamembertTokenizer: https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
            raise
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

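    # Pickling sketch: __getstate__ drops the (unpicklable) C++
    # SentencePieceProcessor and __setstate__ reloads it from self.vocab_file,
    # so the original model file must still exist when unpickling:
    #
    #     import pickle
    #     restored = pickle.loads(pickle.dumps(tokenizer))
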
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) to a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory):
        """
        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

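
# Minimal smoke test: a sketch, not part of the library. It assumes a local
# SentencePiece model file at "sentencepiece.bpe.model"; adjust the path to a
# real CamemBERT vocabulary first. Because this module uses relative imports,
# run it as a module (e.g. `python -m <package>.tokenization_camembert`) rather
# than as a standalone script.
if __name__ == "__main__":
    tokenizer = CamembertTokenizer("sentencepiece.bpe.model")
    pieces = tokenizer._tokenize("J'aime le camembert !")
    ids = [tokenizer._convert_token_to_id(p) for p in pieces]
    print(pieces)
    print(ids)
    print(tokenizer.convert_tokens_to_string(pieces))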