# coding=utf-8
# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15""" Testing suite for the PyTorch ErnieM model. """

import unittest

from transformers import ErnieMTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin


SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")


@require_sentencepiece
@require_tokenizers
class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = ErnieMTokenizer
    test_seq2seq = False
    test_sentencepiece = True
    test_rust_tokenizer = False
    test_sentencepiece_ignore_case = False

    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
        tokenizer.save_pretrained(self.tmpdirname)
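        # The TokenizerTesterMixin helpers (e.g. ``get_tokenizer``) reload the tokenizer
        # from ``self.tmpdirname``, so the shared tests below exercise this fixture-based
        # vocabulary rather than a released checkpoint.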

    def get_input_output_texts(self, tokenizer):
        input_text = "this is a test"
        output_text = "this is a test"
        return input_text, output_text

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
        token = "<pad>"
        token_id = 0

        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)

    def test_get_vocab(self):
        vocab_keys = list(self.get_tokenizer().get_vocab().keys())

        self.assertEqual(vocab_keys[0], "<pad>")
        self.assertEqual(vocab_keys[1], "<unk>")
        self.assertEqual(vocab_keys[-1], "▁eloquent")
        self.assertEqual(len(vocab_keys), 30_000)

    def test_vocab_size(self):
        self.assertEqual(self.get_tokenizer().vocab_size, 30_000)

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return
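        # ErnieM ships no Rust ("fast") tokenizer (``test_rust_tokenizer = False`` above),
        # so this comparison is effectively skipped; the body is kept for parity with the
        # shared tokenizer test suite.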

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "I was born in 92000, and this is falsé."

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

    def test_full_tokenizer(self):
        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, do_lower_case=True, unk_token="<unk>", pad_token="<pad>")

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        # The paddlenlp ErnieMTokenizer implementation outputs '9' instead of '▁9', so the
        # expected tokens below use '9' rather than '▁9' to match that behaviour.
        self.assertListEqual(
            tokens, ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [31, 23, 386, 19, 518, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
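        # "é" is not in the fixture vocabulary, so it maps to the unknown token (id 1) and
        # round-trips back as "<unk>" below.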

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
        )

    def test_sequence_builders(self):
        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + [
            tokenizer.sep_token_id
        ] + text_2 + [tokenizer.sep_token_id]
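        # ErnieM formats a single sequence as ``[CLS] A [SEP]`` and a pair as
        # ``[CLS] A [SEP] [SEP] B [SEP]``, i.e. with two separator tokens between the sequences.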

    @slow
    def test_tokenizer_integration(self):
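        # Slow integration test: loads the ``susnato/ernie-m-base_pytorch`` checkpoint and
        # checks the encodings of a few reference sentences against pinned ids.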
        expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 9, 304, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 5, 5, 5, 16, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 6460, 1328, 4589, 42, 122009, 115774, 23, 3559, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip

        self.tokenizer_integration_test_util(
            expected_encoding=expected_encoding,
            model_name="susnato/ernie-m-base_pytorch",
            sequences=[
                "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
                "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
136"Language Understanding (NLU) and Natural Language Generation (NLG) with over32+ pretrained "
137"models in100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
138"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
139"conditioning on both left and right context in all layers.",
140"The quick brown fox jumps over the lazy dog.",
141],
142)
143