# coding=utf-8
# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15""" Testing suite for the PyTorch ErnieM model. """

import unittest

from transformers import ErnieMTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin


SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")


@require_sentencepiece
@require_tokenizers
class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = ErnieMTokenizer
    test_seq2seq = False
    test_sentencepiece = True
    test_rust_tokenizer = False
    test_sentencepiece_ignore_case = False

    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
        tokenizer.save_pretrained(self.tmpdirname)
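        # The TokenizerTesterMixin helpers (e.g. ``get_tokenizer``) reload the tokenizer
        # from ``self.tmpdirname``, so the shared tests below exercise this fixture-based
        # vocabulary rather than a released checkpoint.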

    def get_input_output_texts(self, tokenizer):
        input_text = "this is a test"
        output_text = "this is a test"
        return input_text, output_text

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
        token = "<pad>"
        token_id = 0

        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)

    def test_get_vocab(self):
        vocab_keys = list(self.get_tokenizer().get_vocab().keys())

        self.assertEqual(vocab_keys[0], "<pad>")
        self.assertEqual(vocab_keys[1], "<unk>")
        self.assertEqual(vocab_keys[-1], "▁eloquent")
        self.assertEqual(len(vocab_keys), 30_000)

    def test_vocab_size(self):
        self.assertEqual(self.get_tokenizer().vocab_size, 30_000)

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return
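        # ErnieM ships no Rust ("fast") tokenizer (``test_rust_tokenizer = False`` above),
        # so this comparison is effectively skipped; the body is kept for parity with the
        # shared tokenizer test suite.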

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "I was born in 92000, and this is falsé."

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

    def test_full_tokenizer(self):
        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, do_lower_case=True, unk_token="<unk>", pad_token="<pad>")

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        # The paddlenlp ErnieMTokenizer implementation outputs '9' instead of '▁9', so the
        # expected tokens below use '9' rather than '▁9' to match that behaviour.
        self.assertListEqual(
            tokens, ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [31, 23, 386, 19, 518, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
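        # "é" is not in the fixture vocabulary, so it maps to the unknown token (id 1) and
        # round-trips back as "<unk>" below.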

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
        )

    def test_sequence_builders(self):
        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + [
            tokenizer.sep_token_id
        ] + text_2 + [tokenizer.sep_token_id]
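        # ErnieM formats a single sequence as ``[CLS] A [SEP]`` and a pair as
        # ``[CLS] A [SEP] [SEP] B [SEP]``, i.e. with two separator tokens between the sequences.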

    @slow
    def test_tokenizer_integration(self):
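        # Slow integration test: loads the ``susnato/ernie-m-base_pytorch`` checkpoint and
        # checks the encodings of a few reference sentences against pinned ids.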
        expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 9, 304, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 5, 5, 5, 16, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 6460, 1328, 4589, 42, 122009, 115774, 23, 3559, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip

        self.tokenizer_integration_test_util(
            expected_encoding=expected_encoding,
            model_name="susnato/ernie-m-base_pytorch",
            sequences=[
                "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
                "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
136"Language Understanding (NLU) and Natural Language Generation (NLG) with over32+ pretrained "
137"models in100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
138"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
139"conditioning on both left and right context in all layers.",
140"The quick brown fox jumps over the lazy dog.",
141],
142)
143