# CSS-LM
# 145 lines · 5.5 KB
1# coding=utf-8
2# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Tokenization classes for Flaubert, based on XLM."""
16
17
18import logging
19import unicodedata
20
21import six
22
23from .tokenization_xlm import XLMTokenizer
24
25
26logger = logging.getLogger(__name__)
27
# Generic vocabulary-file keys mapped to the on-disk file names this
# tokenizer expects: a JSON token->id vocab and a BPE merge-rules file.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Download locations of the vocab/merges files for each pretrained
# Flaubert checkpoint, keyed by model shortcut name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json",
        "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json",
        "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json",
        "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json",
    },
    "merges_file": {
        "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt",
        "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt",
        "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt",
        "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt",
    },
}

# Maximum input length (in tokens) each checkpoint's positional
# embeddings can handle; all Flaubert models use 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "flaubert/flaubert_small_cased": 512,
    "flaubert/flaubert_base_uncased": 512,
    "flaubert/flaubert_base_cased": 512,
    "flaubert/flaubert_large_cased": 512,
}

# Tokenizer __init__ kwargs per checkpoint: only the uncased model
# lowercases its input.
PRETRAINED_INIT_CONFIGURATION = {
    "flaubert/flaubert_small_cased": {"do_lowercase": False},
    "flaubert/flaubert_base_uncased": {"do_lowercase": True},
    "flaubert/flaubert_base_cased": {"do_lowercase": False},
    "flaubert/flaubert_large_cased": {"do_lowercase": False},
}
61
62
def convert_to_unicode(text):
    """
    Converts `text` to Unicode (if it's not already), assuming UTF-8 input.

    Args:
        text: a `str` (returned unchanged) or UTF-8 encoded `bytes`
            (decoded with undecodable byte sequences silently dropped,
            matching the original ``errors="ignore"`` behavior).

    Returns:
        The text as a `str`.

    Raises:
        TypeError: if `text` is neither `str` nor `bytes`.
    """
    # This file is Python-3-only (see `super().__init__()` below), so the
    # former vendored `six.ensure_text` shim is replaced by plain
    # isinstance checks: six.binary_type is bytes, six.text_type is str.
    if isinstance(text, bytes):
        return text.decode("utf-8", errors="ignore")
    if isinstance(text, str):
        return text
    raise TypeError("not expecting type '%s'" % type(text))
77
78
class FlaubertTokenizer(XLMTokenizer):
    """
    BPE tokenizer for Flaubert

    - Moses preprocessing & tokenization
    - Normalize all inputs text
    - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
      (ex: "__classify__") to a vocabulary
    - `do_lowercase` controls lower casing (automatically set for pretrained vocabularies)

    This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
    and documentation regarding arguments.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, do_lowercase=False, **kwargs):
        """
        Args:
            do_lowercase (bool): whether to lowercase input text in
                `preprocess_text` (True only for the uncased checkpoint).
            **kwargs: forwarded to :class:`XLMTokenizer`.
        """
        super().__init__(**kwargs)
        self.do_lowercase = do_lowercase
        # Flaubert controls casing solely via `do_lowercase`; disable the
        # combined lowercase+accent-stripping behavior inherited from XLM.
        self.do_lowercase_and_remove_accent = False

    def preprocess_text(self, text):
        """
        Normalize `text` before tokenization: map LaTeX-style double quotes
        to '"', coerce to `str` (UTF-8), NFC-normalize, and optionally
        lowercase (per ``self.do_lowercase``).
        """
        text = text.replace("``", '"').replace("''", '"')
        text = convert_to_unicode(text)
        text = unicodedata.normalize("NFC", text)

        if self.do_lowercase:
            text = text.lower()

        return text

    def _tokenize(self, text, bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:
        - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
        - Install with `pip install sacremoses`

        Args:
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        # Flaubert is a French-only model; the language is fixed rather
        # than taken as an argument (unlike the multilingual XLM parent).
        lang = "fr"
        if lang and self.lang2id and lang not in self.lang2id:
            logger.error(
                "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model."
            )

        if bypass_tokenizer:
            # Caller already tokenized; only whitespace-split, then BPE.
            text = text.split()
        else:
            text = self.preprocess_text(text)
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                # `extend` accepts any iterable — the former wrapping list
                # comprehension was a needless intermediate allocation.
                split_tokens.extend(self.bpe(token).split(" "))

        return split_tokens
146