# paddlenlp
1# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
import unittest

from paddlenlp.data import JiebaTokenizer, Vocab
from tests.common_test import CpuCommonTest
from tests.testing_utils import create_test_data
21
class TestJiebaTokenizer(CpuCommonTest):
    """Unit tests for JiebaTokenizer wrapping a Vocab loaded from test data."""

    def setUp(self):
        """Build a vocabulary from generated test data and wrap it in a JiebaTokenizer."""
        vocab_file = create_test_data(__file__)
        self.vocab = Vocab.load_vocabulary(vocab_file, unk_token="[UNK]")
        self.tokenizer = JiebaTokenizer(self.vocab)

    def test_jieba(self):
        """cut() tokens must map to the ids produced by encode(), and cut()
        must agree with the underlying jieba tokenizer's lcut output."""
        sample = "一万一"
        tokens = self.tokenizer.cut(sample)
        ids = self.tokenizer.encode(sample)
        # Each token looked up through the vocab should equal the encoded id
        # at the same position.
        for position, token in enumerate(tokens):
            self.check_output_equal(self.vocab(token), ids[position])

        # The wrapper's segmentation must match raw jieba segmentation
        # (cut_all=False, HMM=True — jieba's defaults, passed positionally).
        raw_tokenizer = self.tokenizer.get_tokenizer()
        expected_tokens = raw_tokenizer.lcut(sample, False, True)
        self.check_output_equal(tokens, expected_tokens)

    def test_unk(self):
        """Encoding text absent from the vocab must yield the unk token id."""
        sample = "中国"
        ids = self.tokenizer.encode(sample)
        self.check_output_equal(self.vocab[self.vocab.unk_token] in ids, True)
44
45if __name__ == "__main__":46unittest.main()47