# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""


import collections
import logging
import os
import unicodedata
from typing import Optional

from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "cl-tohoku/bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese/vocab.txt",
        "cl-tohoku/bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking/vocab.txt",
        "cl-tohoku/bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char/vocab.txt",
        "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/vocab.txt",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "cl-tohoku/bert-base-japanese": 512,
    "cl-tohoku/bert-base-japanese-whole-word-masking": 512,
    "cl-tohoku/bert-base-japanese-char": 512,
    "cl-tohoku/bert-base-japanese-char-whole-word-masking": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "cl-tohoku/bert-base-japanese": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "wordpiece",
    },
    "cl-tohoku/bert-base-japanese-whole-word-masking": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "wordpiece",
    },
    "cl-tohoku/bert-base-japanese-char": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "character",
    },
    "cl-tohoku/bert-base-japanese-char-whole-word-masking": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "character",
    },
}

class BertJapaneseTokenizer(BertTokenizer):
    """BERT tokenizer for Japanese text."""

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        do_word_tokenize=True,
        do_subword_tokenize=True,
        word_tokenizer_type="basic",
        subword_tokenizer_type="wordpiece",
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        mecab_kwargs=None,
        **kwargs
    ):
        """Constructs a BertJapaneseTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_word_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer ("basic" or "mecab").
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer ("wordpiece" or "character").
            **mecab_kwargs**: (`optional`) dict passed to the `MecabTokenizer` constructor (default None)
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )
        # ^^ We call the grandparent's init, not the parent's, because the vocab loading and
        # tokenizer construction done by BertTokenizer.__init__ are replaced by the logic below.

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
            )
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])

        self.do_word_tokenize = do_word_tokenize
        if do_word_tokenize:
            if word_tokenizer_type == "basic":
                self.word_tokenizer = BasicTokenizer(
                    do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
                )
            elif word_tokenizer_type == "mecab":
                self.word_tokenizer = MecabTokenizer(
                    do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
                )
            else:
                raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))

        self.do_subword_tokenize = do_subword_tokenize
        if do_subword_tokenize:
            if subword_tokenizer_type == "wordpiece":
                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
            elif subword_tokenizer_type == "character":
                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
            else:
                raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))

    def _tokenize(self, text):
        # Word-level tokenization (MeCab or BasicTokenizer) followed by subword tokenization.
        if self.do_word_tokenize:
            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_subword_tokenize:
            split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens

        return split_tokens


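# A minimal usage sketch (illustrative only, not part of the library code). It assumes the
# pretrained configurations declared at the top of this file and that the optional `fugashi`
# and `ipadic` dependencies are installed; the example tokens are indicative, not exact output:
#
#     tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
#     tokens = tokenizer.tokenize("吾輩は猫である。")
#     # MeCab word tokenization followed by WordPiece, e.g. ["吾輩", "は", "猫", "で", "ある", "。"]
#     input_ids = tokenizer.convert_tokens_to_ids(tokens)

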
class MecabTokenizer:
    """Runs basic tokenization with the MeCab morphological parser."""

    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                List of tokens not to split.
                Kept for backward compatibility; this is now handled at the base class level
                (see :func:`PreTrainedTokenizer.tokenize`).
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
            **mecab_option**: (`optional`) string of MeCab options passed to the
                `fugashi.GenericTagger` constructor (default "")
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        import fugashi
        import ipadic

        # Use the ipadic dictionary by default (options appended later can override it).
        mecab_option = mecab_option or ""
        mecab_option = ipadic.MECAB_ARGS + " " + mecab_option

        self.mecab = fugashi.GenericTagger(mecab_option)

    def tokenize(self, text, never_split=None, **kwargs):
        """Tokenizes a piece of text."""
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        never_split = self.never_split + (never_split if never_split is not None else [])
        tokens = []

        for word in self.mecab(text):
            token = word.surface

            if self.do_lower_case and token not in never_split:
                token = token.lower()

            tokens.append(token)

        return tokens


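# A minimal sketch of using MecabTokenizer on its own (assumes `fugashi` and `ipadic` are
# installed; the surface forms shown are the typical MeCab/ipadic segmentation and are given
# for illustration only):
#
#     word_tokenizer = MecabTokenizer(do_lower_case=False)
#     word_tokenizer.tokenize("すもももももももものうち")
#     # -> ["すもも", "も", "もも", "も", "もも", "の", "うち"]

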
class CharacterTokenizer(object):
    """Runs character tokenization."""

    def __init__(self, vocab, unk_token, normalize_text=True):
        """Constructs a CharacterTokenizer.

        Args:
            **vocab**:
                Vocabulary object.
            **unk_token**: str
                A special symbol for out-of-vocabulary tokens.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.normalize_text = normalize_text

    def tokenize(self, text):
        """Tokenizes a piece of text into characters.

        For example:
            input = "apple"
            output = ["a", "p", "p", "l", "e"]

        Args:
            text: A single token or whitespace separated tokens.
                This should have already been passed through `BasicTokenizer`.

        Returns:
            A list of characters.
        """
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        output_tokens = []
        for char in text:
            # Characters missing from the vocabulary are mapped to the unk token.
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue

            output_tokens.append(char)

        return output_tokens

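
# A minimal sketch of CharacterTokenizer on its own, with a hypothetical toy vocabulary
# (a real vocabulary would come from `load_vocab`); characters absent from the vocab are
# mapped to the unk token:
#
#     char_tokenizer = CharacterTokenizer(vocab={"日": 0, "本": 1}, unk_token="[UNK]")
#     char_tokenizer.tokenize("日本語")
#     # -> ["日", "本", "[UNK]"]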