# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import itertools
import json
import os
import unittest

from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast
from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin


@require_tokenizers
class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = RobertaTokenizer
    rust_tokenizer_class = RobertaTokenizerFast
    test_rust_tokenizer = True
    from_pretrained_kwargs = {"cls_token": "<s>"}

    def setUp(self):
        super().setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
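        # Note: "\u0120" is "Ġ", the marker that GPT-2-style byte-level BPE uses for a leading space;
        # the merge rules below are listed in priority order.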
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "\u0120",
            "\u0120l",
            "\u0120n",
            "\u0120lo",
            "\u0120low",
            "er",
            "\u0120lowest",
            "\u0120newer",
            "\u0120wider",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text)  # , add_prefix_space=True)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
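        # The expected ids follow the order of the toy vocab defined in setUp,
        # e.g. "er" -> 15, "\u0120" -> 10 and "<unk>" -> 19.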
        input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    def roberta_dict_integration_testing(self):
        tokenizer = self.get_tokenizer()

        self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
        self.assertListEqual(
            tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
        )

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("FacebookAI/roberta-base")

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_text_from_decode = tokenizer.encode(
            "sequence builders", add_special_tokens=True, add_prefix_space=False
        )
        encoded_pair_from_decode = tokenizer.encode(
            "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False
        )

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == encoded_text_from_decode
        assert encoded_pair == encoded_pair_from_decode

    def test_space_encoding(self):
        tokenizer = self.get_tokenizer()

        sequence = "Encode this sequence."
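        # tokenizer.byte_encoder maps the raw space byte (0x20) to "Ġ", so space_encoding below is the
        # character a token starts with whenever it was preceded by a space in the input.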
        space_encoding = tokenizer.byte_encoder[" ".encode("utf-8")[0]]

        # Testing encoder arguments
        encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=False)
        first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
        self.assertNotEqual(first_char, space_encoding)

        encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
        first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
        self.assertEqual(first_char, space_encoding)

        tokenizer.add_special_tokens({"bos_token": "<s>"})
        encoded = tokenizer.encode(sequence, add_special_tokens=True)
        first_char = tokenizer.convert_ids_to_tokens(encoded[1])[0]
        self.assertNotEqual(first_char, space_encoding)

        # Testing spaces after special tokens
        mask = "<mask>"
        tokenizer.add_special_tokens(
            {"mask_token": AddedToken(mask, lstrip=True, rstrip=False)}
        )  # mask token has a left space
        mask_ind = tokenizer.convert_tokens_to_ids(mask)

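        # With lstrip=True the mask token absorbs the space on its left, so the token that follows
        # <mask> should start with "Ġ" only when the original text has a space after the mask.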
        sequence = "Encode <mask> sequence"
        sequence_nospace = "Encode <mask>sequence"

        encoded = tokenizer.encode(sequence)
        mask_loc = encoded.index(mask_ind)
        first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
        self.assertEqual(first_char, space_encoding)

        encoded = tokenizer.encode(sequence_nospace)
        mask_loc = encoded.index(mask_ind)
        first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
        self.assertNotEqual(first_char, space_encoding)

    def test_pretokenized_inputs(self):
        pass

    def test_embeded_special_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                sentence = "A, <mask> AllenNLP sentence."
                tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)

                # token_type_ids should put 0 everywhere
                self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))

                # attention_mask should put 1 everywhere, so sum over length should be 1
                self.assertEqual(
                    sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
                    sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
                )

                tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
                tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])

                # Rust correctly handles the space before the mask while python doesn't
                self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
                self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])

                self.assertSequenceEqual(
                    tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                )
                self.assertSequenceEqual(
                    tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                )

    def test_change_add_prefix_space_and_trim_offsets_args(self):
        for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
            tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
            )

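            # The fast tokenizer exposes its pre-tokenizer and post-processor state as JSON;
            # check that the constructor arguments were propagated to the Rust backend components.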
            pre_tokenizer_state = json.loads(tokenizer_r.backend_tokenizer.pre_tokenizer.__getstate__())
            post_processor_state = json.loads(tokenizer_r.backend_tokenizer.post_processor.__getstate__())

            self.assertEqual(pre_tokenizer_state["add_prefix_space"], add_prefix_space)

            self.assertEqual(post_processor_state["add_prefix_space"], add_prefix_space)
            self.assertEqual(post_processor_state["trim_offsets"], trim_offsets)

    def test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments(self):
        # Verify that the returned offsets take the `add_prefix_space` and `trim_offsets`
        # arguments into account
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
                text = f"{text_of_1_token} {text_of_1_token}"

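                # With trim_offsets=True the second token's offsets should start after the space
                # (index len + 1); with trim_offsets=False the space stays inside the span (index len).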
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                text = f" {text}"

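                # Same checks with a leading space in the input: the first token's span should skip
                # the space when trim_offsets=True and include it when trim_offsets=False.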
                # tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                #     pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
                # )
                # encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                # self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
                # self.assertEqual(
                #     encoding.offset_mapping[1],
                #     (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                # )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, 1 + len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, 1 + len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                )