# paddlenlp
1# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
import unittest

from paddlenlp.data import JiebaTokenizer, Vocab
from tests.common_test import CpuCommonTest
from tests.testing_utils import create_test_data
21
class TestJiebaTokenizer(CpuCommonTest):
    """Unit tests for JiebaTokenizer wrapping a Vocab loaded from test data."""

    def setUp(self):
        """Build a vocabulary from generated test data and wrap it in a JiebaTokenizer."""
        vocab_file = create_test_data(__file__)
        self.vocab = Vocab.load_vocabulary(vocab_file, unk_token="[UNK]")
        self.tokenizer = JiebaTokenizer(self.vocab)

    def test_jieba(self):
        """cut() tokens must map to the ids produced by encode(), and cut()
        must agree with the underlying jieba tokenizer's lcut output."""
        sample = "一万一"
        tokens = self.tokenizer.cut(sample)
        ids = self.tokenizer.encode(sample)
        # Each token looked up through the vocab should equal the encoded id
        # at the same position.
        for position, token in enumerate(tokens):
            self.check_output_equal(self.vocab(token), ids[position])

        # The wrapper's segmentation must match raw jieba segmentation
        # (cut_all=False, HMM=True — jieba's defaults, passed positionally).
        raw_tokenizer = self.tokenizer.get_tokenizer()
        expected_tokens = raw_tokenizer.lcut(sample, False, True)
        self.check_output_equal(tokens, expected_tokens)

    def test_unk(self):
        """Encoding text absent from the vocab must yield the unk token id."""
        sample = "中国"
        ids = self.tokenizer.encode(sample)
        self.check_output_equal(self.vocab[self.vocab.unk_token] in ids, True)
44
45if __name__ == "__main__":46unittest.main()47