# CSS-LM
# 145 lines · 5.5 KB
1# coding=utf-8
2# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Tokenization classes for Flaubert, based on XLM."""
16
17
18import logging
19import unicodedata
20
21import six
22
23from .tokenization_xlm import XLMTokenizer
24
25
26logger = logging.getLogger(__name__)
27
# Generic vocabulary-file keys mapped to the on-disk file names this
# tokenizer expects: a JSON token->id vocab and a BPE merge-rules file.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Download locations of the vocab/merges files for each pretrained
# Flaubert checkpoint, keyed by model shortcut name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json",
        "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json",
        "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json",
        "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json",
    },
    "merges_file": {
        "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt",
        "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt",
        "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt",
        "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt",
    },
}

# Maximum input length (in tokens) each checkpoint's positional
# embeddings can handle; all Flaubert models use 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "flaubert/flaubert_small_cased": 512,
    "flaubert/flaubert_base_uncased": 512,
    "flaubert/flaubert_base_cased": 512,
    "flaubert/flaubert_large_cased": 512,
}

# Tokenizer __init__ kwargs per checkpoint: only the uncased model
# lowercases its input.
PRETRAINED_INIT_CONFIGURATION = {
    "flaubert/flaubert_small_cased": {"do_lowercase": False},
    "flaubert/flaubert_base_uncased": {"do_lowercase": True},
    "flaubert/flaubert_base_cased": {"do_lowercase": False},
    "flaubert/flaubert_large_cased": {"do_lowercase": False},
}
61
62
def convert_to_unicode(text):
    """
    Converts `text` to Unicode (if it's not already), assuming UTF-8 input.

    Args:
        text: a `str` (returned unchanged) or UTF-8 encoded `bytes`
            (decoded with undecodable byte sequences silently dropped,
            matching the original ``errors="ignore"`` behavior).

    Returns:
        The text as a `str`.

    Raises:
        TypeError: if `text` is neither `str` nor `bytes`.
    """
    # This file is Python-3-only (see `super().__init__()` below), so the
    # former vendored `six.ensure_text` shim is replaced by plain
    # isinstance checks: six.binary_type is bytes, six.text_type is str.
    if isinstance(text, bytes):
        return text.decode("utf-8", errors="ignore")
    if isinstance(text, str):
        return text
    raise TypeError("not expecting type '%s'" % type(text))
77
78
class FlaubertTokenizer(XLMTokenizer):
    """
    BPE tokenizer for Flaubert

    - Moses preprocessing & tokenization
    - Normalize all inputs text
    - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
      (ex: "__classify__") to a vocabulary
    - `do_lowercase` controls lower casing (automatically set for pretrained vocabularies)

    This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
    and documentation regarding arguments.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, do_lowercase=False, **kwargs):
        """
        Args:
            do_lowercase (bool): whether to lowercase input text in
                `preprocess_text` (True only for the uncased checkpoint).
            **kwargs: forwarded to :class:`XLMTokenizer`.
        """
        super().__init__(**kwargs)
        self.do_lowercase = do_lowercase
        # Flaubert controls casing solely via `do_lowercase`; disable the
        # combined lowercase+accent-stripping behavior inherited from XLM.
        self.do_lowercase_and_remove_accent = False

    def preprocess_text(self, text):
        """
        Normalize `text` before tokenization: map LaTeX-style double quotes
        to '"', coerce to `str` (UTF-8), NFC-normalize, and optionally
        lowercase (per ``self.do_lowercase``).
        """
        text = text.replace("``", '"').replace("''", '"')
        text = convert_to_unicode(text)
        text = unicodedata.normalize("NFC", text)

        if self.do_lowercase:
            text = text.lower()

        return text

    def _tokenize(self, text, bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:
        - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
        - Install with `pip install sacremoses`

        Args:
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        # Flaubert is a French-only model; the language is fixed rather
        # than taken as an argument (unlike the multilingual XLM parent).
        lang = "fr"
        if lang and self.lang2id and lang not in self.lang2id:
            logger.error(
                "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model."
            )

        if bypass_tokenizer:
            # Caller already tokenized; only whitespace-split, then BPE.
            text = text.split()
        else:
            text = self.preprocess_text(text)
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                # `extend` accepts any iterable — the former wrapping list
                # comprehension was a needless intermediate allocation.
                split_tokens.extend(self.bpe(token).split(" "))

        return split_tokens
146