# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import unittest

from transformers import BertTokenizerFast
from transformers.models.bert.tokenization_bert import (
    VOCAB_FILES_NAMES,
    BasicTokenizer,
    BertTokenizer,
    WordpieceTokenizer,
    _is_control,
    _is_punctuation,
    _is_whitespace,
)
from transformers.testing_utils import require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english


@require_tokenizers
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = BertTokenizer
    rust_tokenizer_class = BertTokenizerFast
    test_rust_tokenizer = True
    space_between_special_tokens = True
    from_pretrained_filter = filter_non_english

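    # setUp writes a minimal WordPiece vocabulary to a temporary file so that a
    # BertTokenizer can be instantiated without downloading a pretrained checkpoint.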
    def setUp(self):
        super().setUp()

        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[PAD]",
            "[MASK]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
            ",",
            "low",
            "lowest",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_input_output_texts(self, tokenizer):
        input_text = "UNwant\u00E9d,running"
        output_text = "unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        # The expected ids are the positions of these tokens in the vocabulary written by setUp.
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "UNwant\u00E9d,running"

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

        # With lower casing
        tokenizer = self.get_tokenizer(do_lower_case=True)
        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)

        sequence = "UNwant\u00E9d,running"

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

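    # BasicTokenizer adds whitespace around CJK characters, so each Chinese character
    # below is expected to come out as its own token.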
    def test_chinese(self):
        tokenizer = BasicTokenizer()

        self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])

    def test_basic_tokenizer_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_lower_strip_accents_false(self):
        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"])

    def test_basic_tokenizer_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_lower_strip_accents_default(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_no_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_no_lower_strip_accents_false(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_no_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_respects_never_split_tokens(self):
        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
        )

    def test_basic_tokenizer_splits_on_punctuation(self):
        tokenizer = BasicTokenizer()
        text = "a\n'll !!to?'d of, can't."
        expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."]
        self.assertListEqual(tokenizer.tokenize(text), expected)

    def test_wordpiece_tokenizer(self):
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]

        vocab = {}
        for i, token in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])

        # A word that cannot be fully segmented with the vocab is replaced by a single [UNK] token.
        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])

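    # _is_whitespace, _is_control and _is_punctuation are the low-level character
    # predicates used by BasicTokenizer when cleaning and splitting text.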
    def test_is_whitespace(self):
        self.assertTrue(_is_whitespace(" "))
        self.assertTrue(_is_whitespace("\t"))
        self.assertTrue(_is_whitespace("\r"))
        self.assertTrue(_is_whitespace("\n"))
        self.assertTrue(_is_whitespace("\u00A0"))

        self.assertFalse(_is_whitespace("A"))
        self.assertFalse(_is_whitespace("-"))

    def test_is_control(self):
        self.assertTrue(_is_control("\u0005"))

        self.assertFalse(_is_control("A"))
        self.assertFalse(_is_control(" "))
        self.assertFalse(_is_control("\t"))
        self.assertFalse(_is_control("\r"))

    def test_is_punctuation(self):
        self.assertTrue(_is_punctuation("-"))
        self.assertTrue(_is_punctuation("$"))
        self.assertTrue(_is_punctuation("`"))
        self.assertTrue(_is_punctuation("."))

        self.assertFalse(_is_punctuation("A"))
        self.assertFalse(_is_punctuation(" "))

    def test_clean_text(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
        # "\xad" is a soft hyphen, which the cleanup step removes, so it tokenizes to an empty list.
        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])

        self.assertListEqual(
            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
        )

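    # The next test downloads the pretrained bert-base-uncased checkpoint, hence the @slow marker.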
    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 101 and 102 are the ids of [CLS] and [SEP] in the bert-base-uncased vocabulary.
        assert encoded_sentence == [101] + text + [102]
        assert encoded_pair == [101] + text + [102] + text_2 + [102]

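    # return_offsets_mapping is only supported by fast (Rust-backed) tokenizers,
    # so the test below only instantiates tokenizer_r.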
    def test_offsets_with_special_characters(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                tokens = tokenizer_r.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )

                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
                expected_results = (
                    [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "A"),
                        ((1, 2), ","),
                        ((3, 5), "na"),
                        ((5, 6), "##ï"),
                        ((6, 8), "##ve"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "Allen"),
                        ((21, 23), "##NL"),
                        ((23, 24), "##P"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                    if not do_lower_case
                    else [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "a"),
                        ((1, 2), ","),
                        ((3, 8), "naive"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "allen"),
                        ((21, 23), "##nl"),
                        ((23, 24), "##p"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                )

                self.assertEqual(
                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
                )
                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])

    def test_change_tokenize_chinese_chars(self):
        list_of_common_chinese_char = ["的", "人", "有"]
        text_with_chinese_char = "".join(list_of_common_chinese_char)
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                kwargs["tokenize_chinese_chars"] = True
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)

                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)

                # Each Chinese character is expected to be its own token, not prefixed with "##".
                self.assertListEqual(tokens_without_spe_char_p, list_of_common_chinese_char)
                self.assertListEqual(tokens_without_spe_char_r, list_of_common_chinese_char)

                kwargs["tokenize_chinese_chars"] = False
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)

                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)

                # Without Chinese-character splitting, only the first character escapes the "##" prefix.
                expected_tokens = [
                    f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_common_chinese_char)
                ]
                self.assertListEqual(tokens_without_spe_char_p, expected_tokens)
                self.assertListEqual(tokens_without_spe_char_r, expected_tokens)