# Tests for the Whisper tokenizers (slow and fast) from the transformers library.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
from transformers.models.whisper.tokenization_whisper import _combine_tokens_into_words, _find_longest_common_sequence
from transformers.testing_utils import slow

from ...test_tokenization_common import TokenizerTesterMixin
22
23
# Special-token ids of the multilingual Whisper vocabulary, used to spell out
# expected sequences in the tests below (decoded forms are visible in the
# expected strings of `test_tokenizer_special` and friends).
ES_CODE = 50262  # <|es|> language token
EN_CODE = 50259  # <|en|> language token
END_OF_TRANSCRIPT = 50257  # <|endoftext|>
START_OF_TRANSCRIPT = 50258  # <|startoftranscript|>
TRANSLATE = 50358  # <|translate|> task token
TRANSCRIBE = 50359  # <|transcribe|> task token
NOTIMESTAMPS = 50363  # <|notimestamps|>
32
class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    """Runs the common tokenizer test-suite (``TokenizerTesterMixin``) against Whisper.

    The mixin reads the class attributes below to know which tokenizer classes
    to exercise and which generic tests to enable.
    """

    tokenizer_class = WhisperTokenizer
    rust_tokenizer_class = WhisperTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = False
    test_seq2seq = False

    def setUp(self):
        """Download the tiny checkpoint and save it to ``self.tmpdirname`` so the
        mixin can reload it locally; a pad token is set because the checkpoint
        does not define one that the common tests require."""
        super().setUp()
        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
        tokenizer.pad_token_id = 50256
        tokenizer.pad_token = "<|endoftext|>"
        tokenizer.save_pretrained(self.tmpdirname)
47def test_convert_token_and_id(self):
48"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
49token = "Where"
50token_id = 14436
51
52self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
53self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
54
55def test_get_vocab(self):
56vocab_keys = list(self.get_tokenizer().get_vocab().keys())
57
58self.assertEqual(vocab_keys[0], "!")
59self.assertEqual(vocab_keys[1], '"')
60self.assertEqual(vocab_keys[-1], "<|30.00|>")
61self.assertEqual(len(vocab_keys), 51865)
62
63def test_vocab_size(self):
64self.assertEqual(self.get_tokenizer().vocab_size, 50258)
65
    def test_full_tokenizer(self):
        """Tokenize, convert to ids, and convert back on the locally saved checkpoint.

        The "Ġ" prefix on tokens marks a leading space in the byte-level BPE
        vocabulary (visible in the expected token lists below).
        """
        tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["This", "Ġis", "Ġa", "Ġtest"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens),
            [5723, 307, 257, 1500],
        )

        # Mixed digits, punctuation and a non-ASCII character.
        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            ["I", "Ġwas", "Ġborn", "Ġin", "Ġ9", "2000", ",", "Ġand", "Ġthis", "Ġis", "Ġfals", "é", "."],  # fmt: skip
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [40, 390, 4232, 294, 1722, 25743, 11, 293, 341, 307, 16720, 526, 13])

        # ids -> tokens must be the exact inverse of tokens -> ids.
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            ["I", "Ġwas", "Ġborn", "Ġin", "Ġ9", "2000", ",", "Ġand", "Ġthis", "Ġis", "Ġfals", "é", "."],  # fmt: skip
        )
    def test_tokenizer_slow_store_full_signature(self):
        # Deliberately disabled for Whisper. No reason is recorded in the original
        # test — presumably the mixin's stored-signature check does not apply to
        # this tokenizer; TODO confirm before re-enabling.
        pass
93
    def test_tokenizer_fast_store_full_signature(self):
        # Deliberately disabled for Whisper (same situation as the slow variant
        # above: the generic stored-signature check is skipped; TODO confirm).
        pass
96
    def test_special_tokens_initialization(self):
        # Whisper relies on specific additional special tokens, so we skip this
        # general test. In particular, the general test loads a fast tokenizer
        # from a slow one, and that conversion uses prefix_tokens, which indexes
        # into the additional-special-tokens list at fixed positions — overriding
        # the list with fewer tokens therefore raises an out-of-index error.
        pass
104
    @slow
    def test_tokenizer_integration(self):
        """Compare encodings of the English-only checkpoint against frozen expected
        ids/attention masks via the mixin's integration helper (no padding)."""
        expected_encoding = {'input_ids': [[50257, 50362, 41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13, 50256], [50257, 50362, 13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13, 50256], [50257, 50362, 464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 50256]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip

        self.tokenizer_integration_test_util(
            expected_encoding=expected_encoding, model_name="openai/whisper-tiny.en", padding=False
        )
112
    def test_output_offsets(self):
        """``decode(..., output_offsets=True)`` returns the text plus per-segment
        ``(start, end)`` timestamps derived from the timestamp tokens."""
        tokenizer = self.get_tokenizer()
        previous_sequence = [51492, 406, 3163, 1953, 466, 13, 51612, 51612]
        self.assertEqual(
            tokenizer.decode(previous_sequence, output_offsets=True),
            {
                "text": " not worth thinking about.",
                "offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}],
            },
        )

        # Merge when the previous sequence is a suffix of the next sequence
        next_sequences_1 = [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257]  # fmt: skip
        self.assertEqual(
            tokenizer.decode(next_sequences_1, output_offsets=True),
            {
                "text": (
                    " of spectators, retrievality is not worth thinking about. His instant panic was followed by a"
                    " small, sharp blow high on his chest.<|endoftext|>"
                ),
                "offsets": [
                    {"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)},
                    {
                        "text": " His instant panic was followed by a small, sharp blow high on his chest.",
                        "timestamp": (5.0, 9.4),
                    },
                ],
            },
        )
142
143def test_find_longest_common_subsequence(self):
144previous_sequence = [1, 2, 3]
145next_sequence = [2, 3, 4, 5]
146merge = _find_longest_common_sequence([previous_sequence, next_sequence])
147self.assertEqual(merge, [1, 2, 3, 4, 5])
148
149# Now previous is larger than next.
150# We merge what we can and remove the extra right side of the left sequence
151previous_sequence = [1, 2, 3, 4, 5, 6, 7]
152next_sequence = [2, 3, 4, 5]
153merge = _find_longest_common_sequence([previous_sequence, next_sequence])
154self.assertEqual(merge, [1, 2, 3, 4, 5])
155
156# Nothing in common
157previous_sequence = [1, 2, 3]
158next_sequence = [4, 5, 6]
159merge = _find_longest_common_sequence([previous_sequence, next_sequence])
160self.assertEqual(merge, [1, 2, 3, 4, 5, 6])
161
162# Some errors in the overlap.
163# We take from previous on the left, from the next on the right of the overlap
164previous_sequence = [1, 2, 3, 4, 99]
165next_sequence = [2, 98, 4, 5, 6]
166merge = _find_longest_common_sequence([previous_sequence, next_sequence])
167self.assertEqual(merge, [1, 2, 3, 4, 5, 6])
168
169# We take from previous on the left, from the next on the right of the overlap
170previous_sequence = [1, 2, 99, 4, 5]
171next_sequence = [2, 3, 4, 98, 6]
172merge = _find_longest_common_sequence([previous_sequence, next_sequence])
173self.assertEqual(merge, [1, 2, 99, 4, 98, 6])
174
175# This works on 3 sequences
176seq1 = [1, 2, 3]
177seq2 = [2, 3, 4]
178seq3 = [3, 4, 5]
179merge = _find_longest_common_sequence([seq1, seq2, seq3])
180self.assertEqual(merge, [1, 2, 3, 4, 5])
181
182# This works on 3 sequences with errors
183seq1 = [1, 2, 3, 98, 5]
184seq2 = [2, 99, 4, 5, 6, 7]
185seq3 = [4, 97, 6, 7, 8]
186merge = _find_longest_common_sequence([seq1, seq2, seq3])
187self.assertEqual(merge, [1, 2, 3, 4, 5, 6, 7, 8])
188
    def test_skip_special_tokens_skips_prompt_ids(self):
        """``skip_special_tokens=True`` must also drop the prompt prefix
        (``<|startofprev|>`` + prompt text) for both slow and fast tokenizers."""
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()
        # fmt: off
        encoded_input = [
            50361, 2221, 13, 2326, 388, 391, 50258, 50259, 50359,
            50363, 1282, 264, 2674, 9156, 295, 1523, 11, 2221, 13,
            2326, 388, 391, 13657, 365, 2681, 21296, 17711, 13, 50257,
        ]
        # fmt: on
        expected_with_special_tokens = "<|startofprev|> Mr. Quilter<|startoftranscript|><|en|><|transcribe|><|notimestamps|> On the general principles of art, Mr. Quilter writes with equal lucidity.<|endoftext|>"
        expected_without_special_tokens = " On the general principles of art, Mr. Quilter writes with equal lucidity."
        # Both backends must agree, with and without special-token skipping.
        self.assertEqual(tokenizer.decode(encoded_input, skip_special_tokens=False), expected_with_special_tokens)
        self.assertEqual(tokenizer.decode(encoded_input, skip_special_tokens=True), expected_without_special_tokens)
        self.assertEqual(rust_tokenizer.decode(encoded_input, skip_special_tokens=False), expected_with_special_tokens)
        self.assertEqual(
            rust_tokenizer.decode(encoded_input, skip_special_tokens=True), expected_without_special_tokens
        )
207
    def test_skip_special_tokens_with_timestamps(self):
        """When decoding with ``decode_with_timestamps=True``, timestamp tokens
        (``<|0.00|>`` etc.) must survive ``skip_special_tokens=True`` while the
        other special tokens are removed — on both backends."""
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        # fmt: off
        encoded_input = [
            50258, 50363, 50364, 634, 575, 12525, 22618, 1968, 6144,
            35617, 20084, 1756, 311, 589, 307, 534, 10281, 934,
            439, 293, 50676, 50676, 393, 4411, 294, 309, 457,
            707, 295, 33301, 286, 392, 6628, 13, 50836, 50257,
        ]
        # fmt: on

        expected_with_special_tokens = "<|startoftranscript|><|notimestamps|><|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and<|6.24|><|6.24|> can discover in it but little of rocky Ithaca.<|9.44|><|endoftext|>"
        expected_without_special_tokens = "<|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and<|6.24|><|6.24|> can discover in it but little of rocky Ithaca.<|9.44|>"
        self.assertEqual(
            tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=False),
            expected_with_special_tokens,
        )
        self.assertEqual(
            tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=True),
            expected_without_special_tokens,
        )
        self.assertEqual(
            rust_tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=False),
            expected_with_special_tokens,
        )
        self.assertEqual(
            rust_tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=True),
            expected_without_special_tokens,
        )
239
240def test_fast_tokenizer_get_prompt_ids(self):
241tokenizer = self.get_tokenizer()
242rust_tokenizer = self.get_rust_tokenizer()
243
244prompt = "This is test prompt text."
245tokenizer_prompt_ids = tokenizer.get_prompt_ids(prompt)
246fast_tokenizer_prompt_ids = rust_tokenizer.get_prompt_ids(prompt)
247
248self.assertListEqual(tokenizer_prompt_ids.tolist(), fast_tokenizer_prompt_ids.tolist())
249
250def test_combine_tokens_into_words(self):
251tokenizer = self.get_tokenizer()
252rust_tokenizer = self.get_rust_tokenizer()
253
254# 'whatever "whatever" said someone, clever!?'
255encoded_input = [1363, 7969, 503, 1363, 7969, 1, 848, 1580, 11, 13494, 7323]
256expected_words = ["whatever", ' "whatever"', " said", " someone,", " clever!?"]
257expected_tokens = [[1363, 7969], [503, 1363, 7969, 1], [848], [1580, 11], [13494, 7323]]
258expected_indices = [[0, 1], [2, 3, 4, 5], [6], [7, 8], [9, 10]]
259output = _combine_tokens_into_words(tokenizer, encoded_input)
260self.assertEqual(expected_words, output[0])
261self.assertEqual(expected_tokens, output[1])
262self.assertEqual(expected_indices, output[2])
263output_rust = _combine_tokens_into_words(rust_tokenizer, encoded_input)
264self.assertEqual(expected_words, output_rust[0])
265self.assertEqual(expected_tokens, output_rust[1])
266self.assertEqual(expected_indices, output_rust[2])
267
268def test_basic_normalizer(self):
269tokenizer = self.get_tokenizer()
270rust_tokenizer = self.get_rust_tokenizer()
271
272input_str = "Hola güey!"
273expected_output_normalize = "hola güey "
274expected_output_diacritics = "hola guey "
275
276# tokenizer tests
277encoded_input = tokenizer(input_str).input_ids
278decoded_output = tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=False)
279self.assertEqual(decoded_output, input_str)
280
281decoded_output_normalize = tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=True)
282self.assertEqual(decoded_output_normalize, expected_output_normalize)
283
284decoded_output_diacritics = tokenizer.decode(
285encoded_input, skip_special_tokens=True, basic_normalize=True, remove_diacritics=True
286)
287self.assertEqual(decoded_output_diacritics, expected_output_diacritics)
288
289# fast tokenizer tests
290encoded_input = rust_tokenizer(input_str).input_ids
291decoded_output = rust_tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=False)
292self.assertEqual(decoded_output, input_str)
293
294decoded_output_normalize = rust_tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=True)
295self.assertEqual(decoded_output_normalize, expected_output_normalize)
296
297decoded_output_diacritics = rust_tokenizer.decode(
298encoded_input, skip_special_tokens=True, basic_normalize=True, remove_diacritics=True
299)
300self.assertEqual(decoded_output_diacritics, expected_output_diacritics)
301
302
class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
    """Integration tests that load real Whisper checkpoints (English-only and
    multilingual) and pin their tokenization behavior."""

    checkpoint_name = "openai/whisper-small.en"

    @classmethod
    def setUpClass(cls):
        # Load the tokenizer once for the whole class; individual tests read it
        # from ``cls.tokenizer``. Fix: the original ended with ``return cls`` —
        # unittest ignores ``setUpClass``'s return value, so that was dead code
        # and has been removed.
        cls.tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(cls.checkpoint_name)
310
    def test_tokenizer_equivalence(self):
        """Korean text must round-trip through both the multilingual and the
        English-only tokenizer, with the multilingual vocabulary producing a
        shorter (more compact) encoding."""
        text = "다람쥐 헌 쳇바퀴에 타고파"
        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="korean")
        monolingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")

        monolingual_tokens = monolingual_tokenizer.encode(text, add_special_tokens=False)
        multilingual_tokens = multilingual_tokenizer.encode(text, add_special_tokens=False)

        # Lossless round-trip on both tokenizers.
        assert monolingual_tokenizer.decode(monolingual_tokens) == text
        assert multilingual_tokenizer.decode(multilingual_tokens) == text
        # The multilingual vocabulary encodes Korean with fewer tokens.
        assert len(monolingual_tokens) > len(multilingual_tokens)

        # fmt: off
        EXPECTED_ENG = [
            46695, 97, 167, 252, 234, 168, 98, 238, 220, 169,
            245, 234, 23821, 111, 229, 167, 108, 242, 169, 222,
            112, 168, 245, 238, 220, 169, 225, 222, 166, 111,
            254, 169, 234, 234
        ]
        EXPECTED_MULTI = [
            9835, 22855, 168, 98, 238, 13431, 234, 43517, 229, 47053,
            169, 222, 19086, 19840, 1313, 17974
        ]
        # fmt: on

        self.assertListEqual(monolingual_tokens, EXPECTED_ENG)
        self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
338
    def test_tokenizer_special(self):
        """``encode`` adds the prefix tokens (start/lang/task/notimestamps) and the
        end token; ``decode(..., skip_special_tokens=True)`` strips them back out."""
        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
            "openai/whisper-tiny", language="english", task="transcribe"
        )
        text = "Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"

        multilingual_tokens = multilingual_tokenizer.encode(text)

        # fmt: off
        # format: <|startoftranscript|> <|lang-id|> <|task|> <|notimestamps|> ... transcription ids ... <|endoftext|>
        EXPECTED_MULTI = [
            START_OF_TRANSCRIPT, EN_CODE, TRANSCRIBE, NOTIMESTAMPS, 7057, 0, 1012, 366, 291,
            2633, 30, 508, 6, 1301, 287, 6, 36107, 631, 220, 11178,
            115, 15567, 871, 44393, END_OF_TRANSCRIPT
        ]
        EXPECTED_SPECIAL_TEXT = (
            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>Hey! How are you feeling? "
            "J'ai l'impression que 郷さん est prêt<|endoftext|>"
        )
        # fmt: on

        self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)

        # With special tokens kept, the decoded string shows the full prefix.
        special_transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=False)
        self.assertEqual(special_transcript, EXPECTED_SPECIAL_TEXT)

        # With special tokens skipped, the original text is recovered exactly.
        transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=True)
        self.assertEqual(transcript, text)
367
368def test_vocab_size(self):
369self.assertEqual(self.tokenizer.vocab_size, 50257)
370
    # NOTE(review): decoding with skip_special_tokens=True must drop the leading
    # language-code token, so the result equals decoding the sequence without it.
    # The function body below is kept byte-identical because of the copy marker.
    # Copied from tests.models.speech_to_text.test_tokenization_speech_to_text.SpeechToTextTokenizerMultilinguialTest.test_tokenizer_decode_ignores_language_codes
    def test_tokenizer_decode_ignores_language_codes(self):
        self.assertIn(ES_CODE, self.tokenizer.all_special_ids)
        generated_ids = [ES_CODE, 4, 1601, 47, 7647, 2]
        result = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True)
        self.assertEqual(result, expected_spanish)
        self.assertNotIn(self.tokenizer.eos_token, result)
379
    def test_batch_encoding(self):
        """Batch encoding prefixes language/task tokens and pads the shorter entry
        with repeated END_OF_TRANSCRIPT (``<|endoftext|>``) ids."""
        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
            "openai/whisper-tiny", language="spanish", task="translate"
        )
        batch = ["El gato ", "El gato se sentó"]
        batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids

        # fmt: off
        EXPECTED_MULTI = [
            [START_OF_TRANSCRIPT, ES_CODE, TRANSLATE, NOTIMESTAMPS, 17356, 290, 2513, 220,
             END_OF_TRANSCRIPT, END_OF_TRANSCRIPT, END_OF_TRANSCRIPT],
            [START_OF_TRANSCRIPT, ES_CODE, TRANSLATE, NOTIMESTAMPS, 17356, 290, 2513, 369,
             2279, 812, END_OF_TRANSCRIPT]
        ]
        # fmt: on

        self.assertListEqual(batch_output, EXPECTED_MULTI)
397
    def test_set_prefix_tokens(self):
        """``set_prefix_tokens`` changes the language token used in subsequent
        encodings (here Spanish -> English) without reloading the tokenizer."""
        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
            "openai/whisper-tiny", language="spanish", task="translate"
        )

        # change the language prefix token from Spanish to English
        multilingual_tokenizer.set_prefix_tokens(language="english")

        batch = ["the cat", "the cat sat"]
        batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids

        # fmt: off
        EXPECTED_MULTI = [
            [START_OF_TRANSCRIPT, EN_CODE, TRANSLATE, NOTIMESTAMPS, 3322, 3857,
             END_OF_TRANSCRIPT, END_OF_TRANSCRIPT],
            [START_OF_TRANSCRIPT, EN_CODE, TRANSLATE, NOTIMESTAMPS, 3322, 3857,
             3227, END_OF_TRANSCRIPT]
        ]
        # fmt: on

        self.assertListEqual(batch_output, EXPECTED_MULTI)
419
420def test_batch_encoding_decoding(self):
421multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
422batch = ["hola güey", "que onda"]
423batch_encoding = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
424transcription = multilingual_tokenizer.batch_decode(batch_encoding, skip_special_tokens=True)
425self.assertListEqual(batch, transcription)
426
    def test_offset_decoding(self):
        """Cover ``output_offsets`` and ``decode_with_timestamps`` on a multi-segment
        sequence, a single segment, and a sequence without a leading timestamp."""
        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
        # fmt: off
        INPUT_TOKENS = [
            50258, 50259, 50359, 50364, 441, 1857, 4174, 11, 5242, 366,
            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
            293, 25730, 311, 454, 34152, 4496, 904, 50724, 50724, 366,
            382, 4048, 382, 257, 361, 18459, 13065, 13, 2221, 13,
            7145, 74, 325, 38756, 311, 29822, 7563, 412, 472, 709,
            294, 264, 51122, 51122, 912, 636, 300, 2221, 13, 2741,
            5767, 1143, 281, 7319, 702, 7798, 13, 400, 2221, 13,
            2619, 4004, 811, 2709, 702, 51449, 51449, 50257
        ]
        # fmt: on
        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]

        # Three segments, each delimited by a pair of timestamp tokens.
        self.assertEqual(
            output,
            [
                {
                    "text": (
                        " Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles"
                    ),
                    "timestamp": (0.0, 7.2),
                },
                {
                    "text": (
                        " are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the"
                    ),
                    "timestamp": (7.2, 15.16),
                },
                {
                    "text": " same way that Mr. Carker used to flash his teeth. And Mr. John Colier gives his",
                    "timestamp": (15.16, 21.7),
                },
            ],
        )
        # test `decode_with_offsets`: timestamps rendered inline as <|t.tt|> tokens
        output = multilingual_tokenizer.decode(INPUT_TOKENS, decode_with_timestamps=True)
        self.assertEqual(
            output,
            "<|startoftranscript|><|en|><|transcribe|><|0.00|> Lennils, pictures are a sort of upguards and atom"
            " paintings, and Mason's exquisite idles<|7.20|><|7.20|> are as national as a jingo poem. Mr. Birkut"
            " Foster's landscapes smile at one much in the<|15.16|><|15.16|> same way that Mr. Carker used to flash"
            " his teeth. And Mr. John Colier gives his<|21.70|><|21.70|><|endoftext|>",
        )
        # test a single sequence with timestamps
        # fmt: off
        INPUT_TOKENS = [
            50364, 441, 1857, 4174, 11, 5242, 366,
            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
            293, 25730, 311, 454, 34152, 4496, 904, 50724
        ]
        # fmt: on

        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
        self.assertEqual(
            output[0],
            {
                "text": " Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles",
                "timestamp": (0.0, 7.2),
            },
        )

        # test a sequence without a leading timestamp token: no offsets are produced
        # fmt: off
        INPUT_TOKENS = [
            441, 1857, 4174, 11, 5242, 366,
            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
            293, 25730, 311, 454, 34152, 4496, 904, 50724
        ]
        # fmt: on

        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
        self.assertEqual(output, [])
502