# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
from transformers.models.whisper.tokenization_whisper import _combine_tokens_into_words, _find_longest_common_sequence
from transformers.testing_utils import slow

from ...test_tokenization_common import TokenizerTesterMixin


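# Special token ids of the multilingual Whisper tokenizer, reused in the expected outputs
# below (e.g. the decoder prefix <|startoftranscript|><|lang-id|><|task|><|notimestamps|>).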
ES_CODE = 50262
EN_CODE = 50259
END_OF_TRANSCRIPT = 50257
START_OF_TRANSCRIPT = 50258
TRANSLATE = 50358
TRANSCRIBE = 50359
NOTIMESTAMPS = 50363


class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = WhisperTokenizer
    rust_tokenizer_class = WhisperTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = False
    test_seq2seq = False

    def setUp(self):
        super().setUp()
        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
        tokenizer.pad_token_id = 50256
        tokenizer.pad_token = "<|endoftext|>"
        tokenizer.save_pretrained(self.tmpdirname)

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
        token = "Where"
        token_id = 14436

        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)

    def test_get_vocab(self):
        vocab_keys = list(self.get_tokenizer().get_vocab().keys())

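        # get_vocab also contains the added special tokens, ending with the timestamp
        # tokens (up to <|30.00|>), so it is larger than the `vocab_size` checked below.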
        self.assertEqual(vocab_keys[0], "!")
        self.assertEqual(vocab_keys[1], '"')
        self.assertEqual(vocab_keys[-1], "<|30.00|>")
        self.assertEqual(len(vocab_keys), 51865)

    def test_vocab_size(self):
        self.assertEqual(self.get_tokenizer().vocab_size, 50258)

    def test_full_tokenizer(self):
        tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["This", "Ġis", "Ġa", "Ġtest"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens),
            [5723, 307, 257, 1500],
        )

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            ["I", "Ġwas", "Ġborn", "Ġin", "Ġ9", "2000", ",", "Ġand", "Ġthis", "Ġis", "Ġfals", "é", "."],  # fmt: skip
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [40, 390, 4232, 294, 1722, 25743, 11, 293, 341, 307, 16720, 526, 13])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            ["I", "Ġwas", "Ġborn", "Ġin", "Ġ9", "2000", ",", "Ġand", "Ġthis", "Ġis", "Ġfals", "é", "."],  # fmt: skip
        )

    def test_tokenizer_slow_store_full_signature(self):
        pass

    def test_tokenizer_fast_store_full_signature(self):
        pass

    def test_special_tokens_initialization(self):
        # Whisper relies on specific additional special tokens, so we skip this
        # general test. In particular, the test loads a fast tokenizer from a slow
        # tokenizer, and the conversion uses prefix_tokens, which reference the
        # additional special tokens by fixed indices; overriding the list with
        # fewer tokens therefore leads to an index error.
        pass

    @slow
    def test_tokenizer_integration(self):
        expected_encoding = {'input_ids': [[50257, 50362, 41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13, 50256], [50257, 50362, 13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13, 50256], [50257, 50362, 464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 50256]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip

        self.tokenizer_integration_test_util(
            expected_encoding=expected_encoding, model_name="openai/whisper-tiny.en", padding=False
        )

    def test_output_offsets(self):
        tokenizer = self.get_tokenizer()
        previous_sequence = [51492, 406, 3163, 1953, 466, 13, 51612, 51612]
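        # Timestamp tokens count up from 50364 (<|0.00|>) in 0.02 s steps, so 51492 and 51612
        # above correspond to 22.56 s and 24.96 s, which is where the expected offsets come from.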
        self.assertEqual(
            tokenizer.decode(previous_sequence, output_offsets=True),
            {
                "text": " not worth thinking about.",
                "offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}],
            },
        )

        # Merge when the previous sequence is a suffix of the next sequence
        next_sequences_1 = [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257]  # fmt: skip
        self.assertEqual(
            tokenizer.decode(next_sequences_1, output_offsets=True),
            {
                "text": (
                    " of spectators, retrievality is not worth thinking about. His instant panic was followed by a"
                    " small, sharp blow high on his chest.<|endoftext|>"
                ),
                "offsets": [
                    {"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)},
                    {
                        "text": " His instant panic was followed by a small, sharp blow high on his chest.",
                        "timestamp": (5.0, 9.4),
                    },
                ],
            },
        )

    def test_find_longest_common_subsequence(self):
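        # `_find_longest_common_sequence` is used to merge token sequences coming from
        # overlapping chunks (e.g. long-form transcription) by aligning them on their common overlap.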
        previous_sequence = [1, 2, 3]
        next_sequence = [2, 3, 4, 5]
        merge = _find_longest_common_sequence([previous_sequence, next_sequence])
        self.assertEqual(merge, [1, 2, 3, 4, 5])

        # Now the previous sequence is longer than the next one.
        # We merge what we can and drop the extra tokens on the right side of the previous sequence
        previous_sequence = [1, 2, 3, 4, 5, 6, 7]
        next_sequence = [2, 3, 4, 5]
        merge = _find_longest_common_sequence([previous_sequence, next_sequence])
        self.assertEqual(merge, [1, 2, 3, 4, 5])

        # Nothing in common
        previous_sequence = [1, 2, 3]
        next_sequence = [4, 5, 6]
        merge = _find_longest_common_sequence([previous_sequence, next_sequence])
        self.assertEqual(merge, [1, 2, 3, 4, 5, 6])

        # Some errors in the overlap.
        # We take from the previous sequence on the left of the overlap and from the next one on the right
        previous_sequence = [1, 2, 3, 4, 99]
        next_sequence = [2, 98, 4, 5, 6]
        merge = _find_longest_common_sequence([previous_sequence, next_sequence])
        self.assertEqual(merge, [1, 2, 3, 4, 5, 6])

        # We take from the previous sequence on the left of the overlap and from the next one on the right
        previous_sequence = [1, 2, 99, 4, 5]
        next_sequence = [2, 3, 4, 98, 6]
        merge = _find_longest_common_sequence([previous_sequence, next_sequence])
        self.assertEqual(merge, [1, 2, 99, 4, 98, 6])

        # This works on 3 sequences
        seq1 = [1, 2, 3]
        seq2 = [2, 3, 4]
        seq3 = [3, 4, 5]
        merge = _find_longest_common_sequence([seq1, seq2, seq3])
        self.assertEqual(merge, [1, 2, 3, 4, 5])

        # This works on 3 sequences with errors
        seq1 = [1, 2, 3, 98, 5]
        seq2 = [2, 99, 4, 5, 6, 7]
        seq3 = [4, 97, 6, 7, 8]
        merge = _find_longest_common_sequence([seq1, seq2, seq3])
        self.assertEqual(merge, [1, 2, 3, 4, 5, 6, 7, 8])

    def test_skip_special_tokens_skips_prompt_ids(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()
        # fmt: off
        encoded_input = [
            50361, 2221, 13, 2326, 388, 391, 50258, 50259, 50359,
            50363, 1282, 264, 2674, 9156, 295, 1523, 11, 2221, 13,
            2326, 388, 391, 13657, 365, 2681, 21296, 17711, 13, 50257,
        ]
        # fmt: on
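        # 50361 is <|startofprev|>: the ids before <|startoftranscript|> are the prompt, which
        # skip_special_tokens=True drops along with the special tokens themselves.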
        expected_with_special_tokens = "<|startofprev|> Mr. Quilter<|startoftranscript|><|en|><|transcribe|><|notimestamps|> On the general principles of art, Mr. Quilter writes with equal lucidity.<|endoftext|>"
        expected_without_special_tokens = " On the general principles of art, Mr. Quilter writes with equal lucidity."
        self.assertEqual(tokenizer.decode(encoded_input, skip_special_tokens=False), expected_with_special_tokens)
        self.assertEqual(tokenizer.decode(encoded_input, skip_special_tokens=True), expected_without_special_tokens)
        self.assertEqual(rust_tokenizer.decode(encoded_input, skip_special_tokens=False), expected_with_special_tokens)
        self.assertEqual(
            rust_tokenizer.decode(encoded_input, skip_special_tokens=True), expected_without_special_tokens
        )

    def test_skip_special_tokens_with_timestamps(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        # fmt: off
        encoded_input = [
            50258, 50363, 50364, 634, 575, 12525, 22618, 1968, 6144,
            35617, 20084, 1756, 311, 589, 307, 534, 10281, 934,
            439, 293, 50676, 50676, 393, 4411, 294, 309, 457,
            707, 295, 33301, 286, 392, 6628, 13, 50836, 50257,
        ]
        # fmt: on

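        # decode_with_timestamps=True keeps the <|x.xx|> timestamp markers in the decoded text;
        # skip_special_tokens only strips tokens such as <|startoftranscript|> and <|endoftext|>.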
        expected_with_special_tokens = "<|startoftranscript|><|notimestamps|><|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and<|6.24|><|6.24|> can discover in it but little of rocky Ithaca.<|9.44|><|endoftext|>"
        expected_without_special_tokens = "<|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and<|6.24|><|6.24|> can discover in it but little of rocky Ithaca.<|9.44|>"
        self.assertEqual(
            tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=False),
            expected_with_special_tokens,
        )
        self.assertEqual(
            tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=True),
            expected_without_special_tokens,
        )
        self.assertEqual(
            rust_tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=False),
            expected_with_special_tokens,
        )
        self.assertEqual(
            rust_tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=True),
            expected_without_special_tokens,
        )

    def test_fast_tokenizer_get_prompt_ids(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        prompt = "This is test prompt text."
        tokenizer_prompt_ids = tokenizer.get_prompt_ids(prompt)
        fast_tokenizer_prompt_ids = rust_tokenizer.get_prompt_ids(prompt)

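        # get_prompt_ids returns the prompt token ids as an array (hence .tolist());
        # the slow and fast tokenizers are expected to agree exactly.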
        self.assertListEqual(tokenizer_prompt_ids.tolist(), fast_tokenizer_prompt_ids.tolist())

    def test_combine_tokens_into_words(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        # 'whatever "whatever" said someone, clever!?'
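        # `_combine_tokens_into_words` regroups BPE pieces into words (with punctuation attached)
        # and returns the words, their token ids, and the corresponding token indices.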
        encoded_input = [1363, 7969, 503, 1363, 7969, 1, 848, 1580, 11, 13494, 7323]
        expected_words = ["whatever", ' "whatever"', " said", " someone,", " clever!?"]
        expected_tokens = [[1363, 7969], [503, 1363, 7969, 1], [848], [1580, 11], [13494, 7323]]
        expected_indices = [[0, 1], [2, 3, 4, 5], [6], [7, 8], [9, 10]]
        output = _combine_tokens_into_words(tokenizer, encoded_input)
        self.assertEqual(expected_words, output[0])
        self.assertEqual(expected_tokens, output[1])
        self.assertEqual(expected_indices, output[2])
        output_rust = _combine_tokens_into_words(rust_tokenizer, encoded_input)
        self.assertEqual(expected_words, output_rust[0])
        self.assertEqual(expected_tokens, output_rust[1])
        self.assertEqual(expected_indices, output_rust[2])

    def test_basic_normalizer(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        input_str = "Hola güey!"
        expected_output_normalize = "hola güey "
        expected_output_diacritics = "hola guey "
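        # basic_normalize lower-cases the text and strips punctuation; remove_diacritics
        # additionally folds accented characters, e.g. "ü" -> "u".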

        # tokenizer tests
        encoded_input = tokenizer(input_str).input_ids
        decoded_output = tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=False)
        self.assertEqual(decoded_output, input_str)

        decoded_output_normalize = tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=True)
        self.assertEqual(decoded_output_normalize, expected_output_normalize)

        decoded_output_diacritics = tokenizer.decode(
            encoded_input, skip_special_tokens=True, basic_normalize=True, remove_diacritics=True
        )
        self.assertEqual(decoded_output_diacritics, expected_output_diacritics)

        # fast tokenizer tests
        encoded_input = rust_tokenizer(input_str).input_ids
        decoded_output = rust_tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=False)
        self.assertEqual(decoded_output, input_str)

        decoded_output_normalize = rust_tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=True)
        self.assertEqual(decoded_output_normalize, expected_output_normalize)

        decoded_output_diacritics = rust_tokenizer.decode(
            encoded_input, skip_special_tokens=True, basic_normalize=True, remove_diacritics=True
        )
        self.assertEqual(decoded_output_diacritics, expected_output_diacritics)


class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
    checkpoint_name = "openai/whisper-small.en"

    @classmethod
    def setUpClass(cls):
        cls.tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(cls.checkpoint_name)
        return cls

    def test_tokenizer_equivalence(self):
        text = "다람쥐 헌 쳇바퀴에 타고파"
        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="korean")
        monolingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")

        monolingual_tokens = monolingual_tokenizer.encode(text, add_special_tokens=False)
        multilingual_tokens = multilingual_tokenizer.encode(text, add_special_tokens=False)

        assert monolingual_tokenizer.decode(monolingual_tokens) == text
        assert multilingual_tokenizer.decode(multilingual_tokens) == text
        assert len(monolingual_tokens) > len(multilingual_tokens)

        # fmt: off
        EXPECTED_ENG = [
            46695, 97, 167, 252, 234, 168, 98, 238, 220, 169,
            245, 234, 23821, 111, 229, 167, 108, 242, 169, 222,
            112, 168, 245, 238, 220, 169, 225, 222, 166, 111,
            254, 169, 234, 234
        ]
        EXPECTED_MULTI = [
            9835, 22855, 168, 98, 238, 13431, 234, 43517, 229, 47053,
            169, 222, 19086, 19840, 1313, 17974
        ]
        # fmt: on

        self.assertListEqual(monolingual_tokens, EXPECTED_ENG)
        self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)

    def test_tokenizer_special(self):
        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
            "openai/whisper-tiny", language="english", task="transcribe"
        )
        text = "Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"

        multilingual_tokens = multilingual_tokenizer.encode(text)

        # fmt: off
        # format: <|startoftranscript|> <|lang-id|> <|task|> <|notimestamps|> ... transcription ids ... <|endoftext|>
        EXPECTED_MULTI = [
            START_OF_TRANSCRIPT, EN_CODE, TRANSCRIBE, NOTIMESTAMPS, 7057, 0, 1012, 366, 291,
            2633, 30, 508, 6, 1301, 287, 6, 36107, 631, 220, 11178,
            115, 15567, 871, 44393, END_OF_TRANSCRIPT
        ]
        EXPECTED_SPECIAL_TEXT = (
            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>Hey! How are you feeling? "
            "J'ai l'impression que 郷さん est prêt<|endoftext|>"
        )
        # fmt: on

        self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)

        special_transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=False)
        self.assertEqual(special_transcript, EXPECTED_SPECIAL_TEXT)

        transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=True)
        self.assertEqual(transcript, text)

    def test_vocab_size(self):
        self.assertEqual(self.tokenizer.vocab_size, 50257)

    # Copied from tests.models.speech_to_text.test_tokenization_speech_to_text.SpeechToTextTokenizerMultilinguialTest.test_tokenizer_decode_ignores_language_codes
    def test_tokenizer_decode_ignores_language_codes(self):
        self.assertIn(ES_CODE, self.tokenizer.all_special_ids)
        generated_ids = [ES_CODE, 4, 1601, 47, 7647, 2]
        result = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True)
        self.assertEqual(result, expected_spanish)
        self.assertNotIn(self.tokenizer.eos_token, result)

    def test_batch_encoding(self):
        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
            "openai/whisper-tiny", language="spanish", task="translate"
        )
        batch = ["El gato ", "El gato se sentó"]
        batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids

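        # padding=True right-pads the shorter entry with <|endoftext|> (id 50257), which is why
        # END_OF_TRANSCRIPT appears repeatedly at the end of the first row below.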
        # fmt: off
        EXPECTED_MULTI = [
            [START_OF_TRANSCRIPT, ES_CODE, TRANSLATE, NOTIMESTAMPS, 17356, 290, 2513, 220,
             END_OF_TRANSCRIPT, END_OF_TRANSCRIPT, END_OF_TRANSCRIPT],
            [START_OF_TRANSCRIPT, ES_CODE, TRANSLATE, NOTIMESTAMPS, 17356, 290, 2513, 369,
             2279, 812, END_OF_TRANSCRIPT]
        ]
        # fmt: on

        self.assertListEqual(batch_output, EXPECTED_MULTI)

    def test_set_prefix_tokens(self):
        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
            "openai/whisper-tiny", language="spanish", task="translate"
        )

        # change the language prefix token from Spanish to English
        multilingual_tokenizer.set_prefix_tokens(language="english")

        batch = ["the cat", "the cat sat"]
        batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids

        # fmt: off
        EXPECTED_MULTI = [
            [START_OF_TRANSCRIPT, EN_CODE, TRANSLATE, NOTIMESTAMPS, 3322, 3857,
             END_OF_TRANSCRIPT, END_OF_TRANSCRIPT],
            [START_OF_TRANSCRIPT, EN_CODE, TRANSLATE, NOTIMESTAMPS, 3322, 3857,
             3227, END_OF_TRANSCRIPT]
        ]
        # fmt: on

        self.assertListEqual(batch_output, EXPECTED_MULTI)

    def test_batch_encoding_decoding(self):
        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
        batch = ["hola güey", "que onda"]
        batch_encoding = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
        transcription = multilingual_tokenizer.batch_decode(batch_encoding, skip_special_tokens=True)
        self.assertListEqual(batch, transcription)

    def test_offset_decoding(self):
        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
        # fmt: off
        INPUT_TOKENS = [
            50258, 50259, 50359, 50364, 441, 1857, 4174, 11, 5242, 366,
            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
            293, 25730, 311, 454, 34152, 4496, 904, 50724, 50724, 366,
            382, 4048, 382, 257, 361, 18459, 13065, 13, 2221, 13,
            7145, 74, 325, 38756, 311, 29822, 7563, 412, 472, 709,
            294, 264, 51122, 51122, 912, 636, 300, 2221, 13, 2741,
            5767, 1143, 281, 7319, 702, 7798, 13, 400, 2221, 13,
            2619, 4004, 811, 2709, 702, 51449, 51449, 50257
        ]
        # fmt: on
        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]

        self.assertEqual(
            output,
            [
                {
                    "text": (
                        " Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles"
                    ),
                    "timestamp": (0.0, 7.2),
                },
                {
                    "text": (
                        " are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the"
                    ),
                    "timestamp": (7.2, 15.16),
                },
                {
                    "text": " same way that Mr. Carker used to flash his teeth. And Mr. John Colier gives his",
                    "timestamp": (15.16, 21.7),
                },
            ],
        )
        # test `decode_with_timestamps`, which keeps the <|x.xx|> markers inline in the text
        output = multilingual_tokenizer.decode(INPUT_TOKENS, decode_with_timestamps=True)
        self.assertEqual(
            output,
            "<|startoftranscript|><|en|><|transcribe|><|0.00|> Lennils, pictures are a sort of upguards and atom"
            " paintings, and Mason's exquisite idles<|7.20|><|7.20|> are as national as a jingo poem. Mr. Birkut"
            " Foster's landscapes smile at one much in the<|15.16|><|15.16|> same way that Mr. Carker used to flash"
            " his teeth. And Mr. John Colier gives his<|21.70|><|21.70|><|endoftext|>",
        )
        # test a single sequence with timestamps
        # fmt: off
        INPUT_TOKENS = [
            50364, 441, 1857, 4174, 11, 5242, 366,
            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
            293, 25730, 311, 454, 34152, 4496, 904, 50724
        ]
        # fmt: on

        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
        self.assertEqual(
            output[0],
            {
                "text": " Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles",
                "timestamp": (0.0, 7.2),
            },
        )

        # test a sequence with no opening timestamp: no complete segment, so no offsets are returned
        # fmt: off
        INPUT_TOKENS = [
            441, 1857, 4174, 11, 5242, 366,
            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
            293, 25730, 311, 454, 34152, 4496, 904, 50724
        ]
        # fmt: on

        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
        self.assertEqual(output, [])
