# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pickle
import unittest

from transformers import AutoTokenizer
from transformers.models.bert.tokenization_bert import BertTokenizer
from transformers.models.bert_japanese.tokenization_bert_japanese import (
    VOCAB_FILES_NAMES,
    BertJapaneseTokenizer,
    CharacterTokenizer,
    JumanppTokenizer,
    MecabTokenizer,
    SudachiTokenizer,
    WordpieceTokenizer,
)
from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection

from ...test_tokenization_common import TokenizerTesterMixin
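

# The tests below build a small vocabulary on the fly and exercise each word
# tokenizer backend (MeCab, Sudachi, Juman++) together with the WordPiece and
# character-level subword tokenizers.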
@custom_tokenizers
class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = BertJapaneseTokenizer
    test_rust_tokenizer = False
    space_between_special_tokens = True

    def setUp(self):
        super().setUp()

        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "こんにちは",
            "こん",
            "にちは",
            "ばんは",
            "##こん",
            "##にちは",
            "##ばんは",
            "世界",
            "##世界",
            "、",
            "##、",
            "。",
            "##。",
            "アップルストア",
            "外国",
            "##人",
            "参政",
            "##権",
            "此れ",
            "は",
            "猫",
            "です",
        ]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_input_output_texts(self, tokenizer):
        input_text = "こんにちは、世界。 \nこんばんは、世界。"
        output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。"
        return input_text, output_text

    def get_clean_sequence(self, tokenizer):
        input_text, output_text = self.get_input_output_texts(tokenizer)
        ids = tokenizer.encode(output_text, add_special_tokens=False)
        text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
        return text, ids

    def test_pretokenized_inputs(self):
        pass  # TODO add if relevant

    def test_maximum_encoding_length_pair_input(self):
        pass  # TODO add if relevant

    def test_maximum_encoding_length_single_input(self):
        pass  # TODO add if relevant

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize("こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
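
    # The word tokenizers delegate to external morphological analyzers, so the
    # pickle tests check that a dumped and reloaded tokenizer produces
    # identical output.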
    def test_pickle_mecab_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
        self.assertIsNotNone(tokenizer)

        text = "こんにちは、世界。\nこんばんは、世界。"
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

        filename = os.path.join(self.tmpdirname, "tokenizer.bin")
        with open(filename, "wb") as handle:
            pickle.dump(tokenizer, handle)

        with open(filename, "rb") as handle:
            tokenizer_new = pickle.load(handle)

        tokens_loaded = tokenizer_new.tokenize(text)

        self.assertListEqual(tokens, tokens_loaded)

    def test_mecab_full_tokenizer_with_mecab_kwargs(self):
        tokenizer = self.tokenizer_class(
            self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"}
        )

        text = "アップルストア"
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, ["アップルストア"])
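
    # MeCab segmentation depends on the dictionary: ipadic keeps アップルストア
    # as a single token, while the unidic variants split it into アップル / ストア.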
    def test_mecab_tokenizer_ipadic(self):
        tokenizer = MecabTokenizer(mecab_dic="ipadic")

        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
        )

    def test_mecab_tokenizer_unidic_lite(self):
        try:
            tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
        except ModuleNotFoundError:
            return

        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
        )

    def test_mecab_tokenizer_unidic(self):
        try:
            import unidic

            self.assertTrue(
                os.path.isdir(unidic.DICDIR),
                "The content of unidic was not downloaded. Run `python -m unidic download` before running this test case. Note that this requires 2.1GB on disk.",
            )
            tokenizer = MecabTokenizer(mecab_dic="unidic")
        except ModuleNotFoundError:
            return

        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
        )

    def test_mecab_tokenizer_lower(self):
        tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")

        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            ["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"],
        )

    def test_mecab_tokenizer_with_option(self):
        try:
            tokenizer = MecabTokenizer(
                do_lower_case=True, normalize_text=False, mecab_option="-d /usr/local/lib/mecab/dic/jumandic"
            )
        except RuntimeError:
            # MeCab raises a RuntimeError if the requested dictionary is not installed on the system.
            return

        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "\u3000", "。"],
        )

    def test_mecab_tokenizer_no_normalize(self):
        tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")

        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"],
        )

    @require_sudachi_projection
    def test_pickle_sudachi_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
        self.assertIsNotNone(tokenizer)

        text = "こんにちは、世界。\nこんばんは、世界。"
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

        filename = os.path.join(self.tmpdirname, "tokenizer.bin")
        with open(filename, "wb") as handle:
            pickle.dump(tokenizer, handle)

        with open(filename, "rb") as handle:
            tokenizer_new = pickle.load(handle)

        tokens_loaded = tokenizer_new.tokenize(text)

        self.assertListEqual(tokens, tokens_loaded)

    @require_sudachi_projection
    def test_sudachi_tokenizer_core(self):
        tokenizer = SudachiTokenizer(sudachi_dict_type="core")

        # fmt: off
        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "],
        )
        # fmt: on
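
    # Sudachi's split mode controls segmentation granularity: A yields the
    # shortest units, C the longest, and B an intermediate split, as the three
    # tests below show on 外国人参政権.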
    @require_sudachi_projection
    def test_sudachi_tokenizer_split_mode_A(self):
        tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A")

        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "人", "参政", "権"])

    @require_sudachi_projection
    def test_sudachi_tokenizer_split_mode_B(self):
        tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B")

        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"])

    @require_sudachi_projection
    def test_sudachi_tokenizer_split_mode_C(self):
        tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C")

        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"])

    @require_sudachi_projection
    def test_sudachi_full_tokenizer_with_sudachi_kwargs_split_mode_B(self):
        tokenizer = self.tokenizer_class(
            self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_split_mode": "B"}
        )

        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "##人", "参政", "##権"])
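
    # With sudachi_projection="normalized_nouns" the tokenizer emits Sudachi's
    # normalized forms (これ -> 此れ, ねこ -> 猫) rather than the surface forms,
    # as the assertions below show.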
    @require_sudachi_projection
    def test_sudachi_tokenizer_projection(self):
        tokenizer = SudachiTokenizer(
            sudachi_dict_type="core", sudachi_split_mode="A", sudachi_projection="normalized_nouns"
        )

        self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "は", "猫", "です", "。"])

    @require_sudachi_projection
    def test_sudachi_full_tokenizer_with_sudachi_kwargs_sudachi_projection(self):
        tokenizer = self.tokenizer_class(
            self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_projection": "normalized_nouns"}
        )

        self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "は", "猫", "です", "。"])

    @require_sudachi_projection
    def test_sudachi_tokenizer_lower(self):
        tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")

        self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), [" ", "\t", "アップル", "ストア", "で", "iphone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "])  # fmt: skip

    @require_sudachi_projection
    def test_sudachi_tokenizer_no_normalize(self):
        tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")

        self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), [" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", "\u3000", "。", " ", " "])  # fmt: skip

    @require_sudachi_projection
    def test_sudachi_tokenizer_trim_whitespace(self):
        tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core")

        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
        )

    @require_jumanpp
    def test_pickle_jumanpp_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="jumanpp")
        self.assertIsNotNone(tokenizer)

        text = "こんにちは、世界。\nこんばんは、世界。"
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

        filename = os.path.join(self.tmpdirname, "tokenizer.bin")
        with open(filename, "wb") as handle:
            pickle.dump(tokenizer, handle)

        with open(filename, "rb") as handle:
            tokenizer_new = pickle.load(handle)

        tokens_loaded = tokenizer_new.tokenize(text)

        self.assertListEqual(tokens, tokens_loaded)
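
    # In the Juman++ expectations below, whitespace runs surface as "\u3000"
    # tokens unless trim_whitespace=True strips them.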
    @require_jumanpp
    def test_jumanpp_tokenizer(self):
        tokenizer = JumanppTokenizer()

        self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), ["アップル", "ストア", "で", "iPhone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"])  # fmt: skip

    @require_jumanpp
    def test_jumanpp_tokenizer_lower(self):
        tokenizer = JumanppTokenizer(do_lower_case=True)

        self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), ["アップル", "ストア", "で", "iphone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"])  # fmt: skip

    @require_jumanpp
    def test_jumanpp_tokenizer_no_normalize(self):
        tokenizer = JumanppTokenizer(normalize_text=False)

        self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), ["ア", "ッ", "フ", "゚", "ル", "ストア", "で", "iPhone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"])  # fmt: skip

    @require_jumanpp
    def test_jumanpp_tokenizer_trim_whitespace(self):
        tokenizer = JumanppTokenizer(trim_whitespace=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "。"],
        )

    @require_jumanpp
    def test_jumanpp_full_tokenizer_with_jumanpp_kwargs_trim_whitespace(self):
        tokenizer = self.tokenizer_class(
            self.vocab_file, word_tokenizer_type="jumanpp", jumanpp_kwargs={"trim_whitespace": True}
        )

        text = "こんにちは、世界。\nこんばんは、世界。"
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

    @require_jumanpp
    def test_jumanpp_tokenizer_ext(self):
        tokenizer = JumanppTokenizer()

        self.assertListEqual(
            tokenizer.tokenize("ありがとうございますm(_ _)m見つけるのが大変です。"),
            ["ありがとう", "ございます", "m(_ _)m", "見つける", "の", "が", "大変です", "。"],
        )
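
    # WordpieceTokenizer greedily matches the longest vocabulary entry, marks
    # word-internal continuations with the "##" prefix, and falls back to
    # [UNK] for anything it cannot cover.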
    def test_wordpiece_tokenizer(self):
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]  # fmt: skip

        vocab = {}
        for i, token in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"])

        self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"])

        self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"])  # fmt: skip
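
    # This checkpoint uses a SentencePiece subword tokenizer, which marks
    # word-initial pieces with "▁" rather than marking continuations with "##".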
    def test_sentencepiece_tokenizer(self):
        tokenizer = BertJapaneseTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese-with-auto-jumanpp")
        subword_tokenizer = tokenizer.subword_tokenizer

        tokens = subword_tokenizer.tokenize("国境 の 長い トンネル を 抜ける と 雪国 であった 。")
        self.assertListEqual(tokens, ["▁国境", "▁の", "▁長い", "▁トンネル", "▁を", "▁抜ける", "▁と", "▁雪", "国", "▁であった", "▁。"])  # fmt: skip

        tokens = subword_tokenizer.tokenize("こんばんは こんばん にち は こんにちは")
        self.assertListEqual(tokens, ["▁こん", "ばん", "は", "▁こん", "ばん", "▁に", "ち", "▁は", "▁こんにちは"])
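
    # build_inputs_with_special_tokens wraps a single sequence as
    # [CLS] A [SEP] and a pair as [CLS] A [SEP] B [SEP].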
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese")

        text = tokenizer.encode("ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]


@custom_tokenizers
class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = BertJapaneseTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()

        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "こんにちは、世界。 \nこんばんは、世界。"
        output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
        return input_text, output_text

    def test_pretokenized_inputs(self):
        pass  # TODO add if relevant

    def test_maximum_encoding_length_pair_input(self):
        pass  # TODO add if relevant

    def test_maximum_encoding_length_single_input(self):
        pass  # TODO add if relevant

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character")

        tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。")
        self.assertListEqual(tokens, ["こ", "ん", "に", "ち", "は", "、", "世", "界", "。", "こ", "ん", "ば", "ん", "は", "、", "世", "界", "。"])  # fmt: skip
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12]
        )

    def test_character_tokenizer(self):
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]

        vocab = {}
        for i, token in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こ", "ん", "に", "ち", "は"])

        self.assertListEqual(tokenizer.tokenize("こんにちほ"), ["こ", "ん", "に", "ち", "[UNK]"])

    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese-char")

        text = tokenizer.encode("ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]
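

# AutoTokenizer should resolve a BERT-japanese checkpoint to the
# Japanese-specific tokenizer class rather than the generic BertTokenizer.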
@custom_tokenizers
class AutoTokenizerCustomTest(unittest.TestCase):
    def test_tokenizer_bert_japanese(self):
        EXAMPLE_BERT_JAPANESE_ID = "cl-tohoku/bert-base-japanese"
        tokenizer = AutoTokenizer.from_pretrained(EXAMPLE_BERT_JAPANESE_ID)
        self.assertIsInstance(tokenizer, BertJapaneseTokenizer)
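

# Loading a checkpoint with a mismatched tokenizer class still works, but it
# should emit a warning about the class mismatch in both directions.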
class BertTokenizerMismatchTest(unittest.TestCase):
    def test_tokenizer_mismatch_warning(self):
        EXAMPLE_BERT_JAPANESE_ID = "cl-tohoku/bert-base-japanese"
        with self.assertLogs("transformers", level="WARNING") as cm:
            BertTokenizer.from_pretrained(EXAMPLE_BERT_JAPANESE_ID)
            self.assertTrue(
                cm.records[0].message.startswith(
                    "The tokenizer class you load from this checkpoint is not the same type as the class this function"
                    " is called from."
                )
            )
        EXAMPLE_BERT_ID = "google-bert/bert-base-cased"
        with self.assertLogs("transformers", level="WARNING") as cm:
            BertJapaneseTokenizer.from_pretrained(EXAMPLE_BERT_ID)
            self.assertTrue(
                cm.records[0].message.startswith(
                    "The tokenizer class you load from this checkpoint is not the same type as the class this function"
                    " is called from."
                )
            )