transformers
3236 lines · 204.7 KB
1# coding=utf-8
2# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15""" Testing suite for the PyTorch Whisper model. """
16
17import copy
18import inspect
19import os
20import random
21import re
22import tempfile
23import time
24import unittest
25
26import numpy as np
27import pytest
28from huggingface_hub import hf_hub_download
29
30import transformers
31from transformers import WhisperConfig
32from transformers.testing_utils import (
33is_pt_flax_cross_test,
34require_flash_attn,
35require_torch,
36require_torch_fp16,
37require_torch_gpu,
38require_torchaudio,
39slow,
40torch_device,
41)
42from transformers.utils import cached_property, is_flax_available, is_torch_available, is_torchaudio_available
43from transformers.utils.import_utils import is_datasets_available
44
45from ...generation.test_utils import GenerationTesterMixin
46from ...test_configuration_common import ConfigTester
47from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
48from ...test_pipeline_mixin import PipelineTesterMixin
49
50
51if is_datasets_available():
52import datasets
53from datasets import Audio, load_dataset
54
55if is_torch_available():
56import torch
57
58from transformers import (
59WhisperFeatureExtractor,
60WhisperForAudioClassification,
61WhisperForCausalLM,
62WhisperForConditionalGeneration,
63WhisperModel,
64WhisperProcessor,
65set_seed,
66)
67from transformers.generation.logits_process import LogitsProcessor
68from transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder, sinusoids
69
class DummyTimestampLogitProcessor(LogitsProcessor):
    """This processor fakes the correct timestamps tokens pattern [TOK_1] [TOK_2] ... [TOK_N] [TIME_STAMP_TOK_1] [TIME_STAMP_TOK_2] [TOK_N+1] ..."""

    def __init__(
        self, timestamp_begin, vocab_size, batch_size, max_length, min_space=3, seed=0, is_length_ascending=True
    ):
        self.timestamp_begin = timestamp_begin
        self.vocab_size = vocab_size

        self.min_space_between_timestamps = min_space
        self.timestamp_tokens = torch.arange(self.timestamp_begin, self.vocab_size)
        # Fix: `Tensor.to` is not in-place — the original discarded its result, so the
        # tokens silently stayed on the default device. Keep the moved tensor.
        self.timestamp_tokens = self.timestamp_tokens.to(torch_device)
        self.is_length_ascending = is_length_ascending

        # Per-sequence state: steps since the last timestamp and the highest timestamp emitted.
        self.no_time_stamp_counter = batch_size * [0]
        self.prev_highest_timestamp = batch_size * [0]
        self.batch_size = batch_size
        self.max_length = max_length
        self.count = 0
        self.begin_index = 0

        # Pre-draw a reproducible schedule (~30% True) deciding when a timestamp may be emitted.
        self.let_pass = [[] for _ in range(batch_size)]
        for k in range(batch_size):
            random.seed(seed + k)
            for _ in range(10000):
                self.let_pass[k].append(random.randint(1, 10) <= 3)

    def set_begin_index(self, begin_index: int):
        self.begin_index = begin_index

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # we don't want to randomly sample timestamp tokens
        if input_ids.shape[-1] != self.begin_index:
            scores[:, self.timestamp_begin :] = -float("inf")

        self.no_time_stamp_counter = [x + 1 for x in self.no_time_stamp_counter]
        for k in range(input_ids.shape[0]):
            # make sure to use correct index if a batch was removed
            if self.is_length_ascending and input_ids.shape[0] < self.batch_size:
                prev_k = k + self.batch_size - input_ids.shape[0]
            else:
                prev_k = k

            if input_ids[k, -1] == self.timestamp_begin:
                self.no_time_stamp_counter[prev_k] = 0

            can_produce = self.no_time_stamp_counter[prev_k] > self.min_space_between_timestamps
            must_produce = (
                input_ids[k][2:].le(self.timestamp_begin).all() and input_ids.shape[-1] == self.max_length - 1
            )
            # produce timestamp with 30%
            if (can_produce and self.let_pass[prev_k][self.count]) or must_produce:
                self.no_time_stamp_counter[prev_k] = 0
                self.prev_highest_timestamp[prev_k] = max(input_ids[k].max() + 1, self.timestamp_tokens[0].item())

                # force a timestamp
                scores[k, :] = -float("inf")
                scores[k, self.prev_highest_timestamp[prev_k]] = 10.0

            if (
                input_ids.shape[-1] > 3
                and input_ids[k, -1].item() in self.timestamp_tokens
                and input_ids[k, -2].item() not in self.timestamp_tokens
            ):
                # force the same as before
                scores[k, :] = -float("inf")
                scores[k, input_ids[k, -1].item()] = 10.0

        self.count += 1

        if torch.isinf(scores).all():
            raise ValueError("Dummy logit processor is incorrectly set up. Scores should not be all inf.")

        return scores
144
145
146if is_torchaudio_available():
147import torchaudio
148
149
150if is_flax_available():
151import jax.numpy as jnp
152
153from transformers.modeling_flax_pytorch_utils import (
154convert_pytorch_state_dict_to_flax,
155load_flax_weights_in_pytorch_model,
156)
157
158
def prepare_whisper_inputs_dict(
    config,
    input_features,
    decoder_input_ids,
    attention_mask=None,
    decoder_attention_mask=None,
    head_mask=None,
    decoder_head_mask=None,
    cross_attn_head_mask=None,
):
    """Assemble the standard kwargs dict for a Whisper forward pass, filling in default masks."""
    if decoder_attention_mask is None:
        # Default: attend to every decoder token that is not padding.
        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)

    def _all_heads_enabled(num_layers, num_heads):
        # A head mask of ones keeps every attention head active.
        return torch.ones(num_layers, num_heads, device=torch_device)

    if head_mask is None:
        head_mask = _all_heads_enabled(config.encoder_layers, config.encoder_attention_heads)
    if decoder_head_mask is None:
        decoder_head_mask = _all_heads_enabled(config.decoder_layers, config.decoder_attention_heads)
    if cross_attn_head_mask is None:
        cross_attn_head_mask = _all_heads_enabled(config.decoder_layers, config.decoder_attention_heads)

    return {
        "input_features": input_features,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": decoder_attention_mask,
        "head_mask": head_mask,
        "decoder_head_mask": decoder_head_mask,
        "cross_attn_head_mask": cross_attn_head_mask,
    }
186
187
@require_torch
class WhisperModelTester:
    """Builds a tiny Whisper configuration plus random inputs, and hosts the shared
    model checks used by `WhisperModelTest`."""

    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=60,
        is_training=True,
        use_labels=False,
        vocab_size=200,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=4,
        input_channels=1,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=20,
        max_source_positions=30,
        max_target_positions=40,
        bos_token_id=98,
        eos_token_id=98,
        pad_token_id=0,
        num_mel_bins=80,
        decoder_start_token_id=85,
        num_conv_layers=1,
        suppress_tokens=None,
        begin_suppress_tokens=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.input_channels = input_channels
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.num_mel_bins = num_mel_bins
        self.max_position_embeddings = max_position_embeddings
        self.max_source_positions = max_source_positions
        self.max_target_positions = max_target_positions
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.decoder_start_token_id = decoder_start_token_id
        self.num_conv_layers = num_conv_layers
        self.suppress_tokens = suppress_tokens
        self.begin_suppress_tokens = begin_suppress_tokens

    def prepare_config_and_inputs(self):
        """Return (config, inputs_dict) with random features and one decoder start token per sequence."""
        input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)

        decoder_input_ids = torch.tensor(self.batch_size * [[self.decoder_start_token_id]], device=torch_device)

        config = self.get_config()
        inputs_dict = prepare_whisper_inputs_dict(
            config,
            attention_mask=None,
            input_features=input_features,
            decoder_input_ids=decoder_input_ids,
        )
        return config, inputs_dict

    def get_config(self):
        """Build a minimal WhisperConfig from the tester's hyper-parameters."""
        return WhisperConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            input_channels=self.input_channels,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            max_source_positions=self.max_source_positions,
            max_target_positions=self.max_target_positions,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            decoder_ffn_dim=self.hidden_size,
            encoder_ffn_dim=self.hidden_size,
            decoder_start_token_id=self.decoder_start_token_id,
            suppress_tokens=self.suppress_tokens,
            begin_suppress_tokens=self.begin_suppress_tokens,
        )

    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
        return config, inputs_dict

    def get_subsampled_output_lengths(self, input_lengths):
        """
        Computes the output length of the convolutional layers
        """

        # Each conv layer halves the time dimension (stride-2, rounded up).
        for _ in range(self.num_conv_layers):
            input_lengths = (input_lengths - 1) // 2 + 1

        return input_lengths

    def create_and_check_model_forward(self, config, inputs_dict, freeze_encoder=False):
        model = WhisperModel(config=config).to(torch_device).eval()

        if freeze_encoder:
            model.freeze_encoder()

        input_features = inputs_dict["input_features"]
        decoder_input_ids = inputs_dict["decoder_input_ids"]

        # first forward pass
        last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state

        # Fix: the original `assertTrue(last_hidden_state.shape, (13, 7, 16))` could never
        # fail — the tuple was interpreted as the assertion *message*, and a non-empty
        # shape is always truthy. Compare against the actual expected shape instead.
        self.parent.assertEqual(
            last_hidden_state.shape,
            (self.batch_size, decoder_input_ids.shape[1], self.hidden_size),
        )

    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
        """Check that decoding with a cache matches a full recompute on appended tokens."""
        model = WhisperModel(config=config).get_decoder().to(torch_device).eval()
        input_ids = inputs_dict["decoder_input_ids"]
        attention_mask = inputs_dict["decoder_attention_mask"]

        # first forward pass
        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)

        output, past_key_values = outputs.to_tuple()

        # create hypothetical multiple next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size).clamp(2)
        next_attn_mask = ids_tensor((self.batch_size, 3), 2)

        # append to next input_ids and
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)

        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
            "last_hidden_state"
        ]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))

    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
        """Check that a separately saved/reloaded encoder and decoder reproduce the joint outputs."""
        model = WhisperModel(config=config).to(torch_device).eval()
        outputs = model(**inputs_dict)

        encoder_last_hidden_state = outputs.encoder_last_hidden_state
        last_hidden_state = outputs.last_hidden_state

        with tempfile.TemporaryDirectory() as tmpdirname:
            encoder = model.get_encoder()
            encoder.save_pretrained(tmpdirname)
            encoder = WhisperEncoder.from_pretrained(tmpdirname).to(torch_device)

        encoder_last_hidden_state_2 = encoder(inputs_dict["input_features"])[0]

        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)

        with tempfile.TemporaryDirectory() as tmpdirname:
            decoder = model.get_decoder()
            decoder.save_pretrained(tmpdirname)
            decoder = WhisperDecoder.from_pretrained(tmpdirname).to(torch_device)

        last_hidden_state_2 = decoder(
            input_ids=inputs_dict["decoder_input_ids"],
            attention_mask=inputs_dict["decoder_attention_mask"],
            encoder_hidden_states=encoder_last_hidden_state,
        )[0]

        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
369
370
@require_torch
class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """Applies the common model/generation/pipeline test suites to Whisper."""

    # Model classes exercised by the common ModelTesterMixin tests.
    all_model_classes = (WhisperModel, WhisperForConditionalGeneration) if is_torch_available() else ()
    all_generative_model_classes = (WhisperForConditionalGeneration,) if is_torch_available() else ()
    pipeline_model_mapping = (
        {
            "audio-classification": WhisperForAudioClassification,
            "automatic-speech-recognition": WhisperForConditionalGeneration,
            "feature-extraction": WhisperModel,
            "text-generation": WhisperForCausalLM,
        }
        if is_torch_available()
        else {}
    )
    is_encoder_decoder = True
    fx_compatible = False
    test_pruning = False
    test_missing_keys = False
    # Needs higher percentages after model tester's vocab_size is changed to 200 (PR #21222)
    # `0.5` is for `test_disk_offload` (which also works for `test_model_parallelism`)
    model_split_percents = [0.5, 0.8, 0.9]

    # Whisper consumes log-mel features, not token ids.
    input_name = "input_features"
394
# TODO: Fix the failed tests
def is_pipeline_test_to_skip(
    self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
    """Skip pipeline test suites that currently fail for Whisper.

    Both suites below hit:
    RuntimeError: The size of tensor a (1500) must match the size of tensor b (30)
    at non-singleton dimension 1
    """
    failing_pipeline_suites = {
        "AutomaticSpeechRecognitionPipelineTests",
        "AudioClassificationPipelineTests",
    }
    return pipeline_test_casse_name in failing_pipeline_suites
408
def setUp(self):
    # Shared fixtures: the tiny-model tester above and the common config tester.
    self.model_tester = WhisperModelTester(self)
    self.config_tester = ConfigTester(self, config_class=WhisperConfig)
    self.maxDiff = 3000

def test_config(self):
    """Run the shared WhisperConfig sanity checks."""
    self.config_tester.run_common_tests()
416
def test_save_load_strict(self):
    """Round-trip every model class through save_pretrained/from_pretrained and
    verify that no weights come back missing."""
    config, _ = self.model_tester.prepare_config_and_inputs()
    for model_class in self.all_model_classes:
        model = model_class(config)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)
            _, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
        self.assertEqual(info["missing_keys"], [])
426
def test_model_forward(self):
    """Forward pass through WhisperModel with random features."""
    config_and_inputs = self.model_tester.prepare_config_and_inputs()
    self.model_tester.create_and_check_model_forward(*config_and_inputs)

def test_model_forward_with_frozen_encoder(self):
    """Same forward check, but with the encoder frozen first."""
    config_and_inputs = self.model_tester.prepare_config_and_inputs()
    self.model_tester.create_and_check_model_forward(*config_and_inputs, freeze_encoder=True)
434
def test_requires_grad_with_frozen_encoder(self):
    """After `freeze_encoder()`, no encoder parameter may require grad while every
    decoder parameter still must."""
    config = self.model_tester.get_config()
    for model_class in self.all_model_classes:
        model = model_class(config)
        model.freeze_encoder()

        # Some classes expose encoder/decoder at the top level, others nest them under `.model`.
        try:
            encoder_grads = [param.requires_grad for param in model.encoder.parameters()]
            decoder_grads = [param.requires_grad for param in model.decoder.parameters()]
        except AttributeError:
            encoder_grads = [param.requires_grad for param in model.model.encoder.parameters()]
            decoder_grads = [param.requires_grad for param in model.model.decoder.parameters()]

        self.assertFalse(all(encoder_grads))
        self.assertTrue(all(decoder_grads))
450
def test_requires_grad_encoder_embed_positions(self):
    """Encoder position embeddings must not be trainable."""
    config = self.model_tester.get_config()
    for model_class in self.all_model_classes:
        model = model_class(config)
        encoder = model.get_encoder()
        self.assertFalse(encoder.embed_positions.weight.requires_grad)

def test_encoder_sinusoidal_embed_positions(self):
    """Encoder position embeddings must be initialized to the sinusoid table."""
    config = self.model_tester.get_config()
    for model_class in self.all_model_classes:
        model = model_class(config)
        embeds = model.get_encoder().embed_positions.weight
        self.assertTrue(torch.allclose(embeds, sinusoids(*embeds.shape)))
464
def test_decoder_model_past_with_large_inputs(self):
    """Cached decoding must match a full recompute (see the model tester's check)."""
    config_and_inputs = self.model_tester.prepare_config_and_inputs()
    self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)

def test_encoder_decoder_model_standalone(self):
    """Separately reloaded encoder/decoder must reproduce the joint model's outputs."""
    config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
    self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
472
473def _get_input_ids_and_config(self, batch_size=3):
474config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
475input_ids = inputs_dict[self.input_name]
476
477# cut to half length & take max batch_size=batch_size
478input_ids = input_ids[:batch_size, :, :]
479
480# generate max 3 tokens
481max_length = 4
482if config.eos_token_id is not None and config.pad_token_id is None:
483# hack to allow generate for models such as GPT2 as is done in `generate()`
484config.pad_token_id = config.eos_token_id
485
486return config, input_ids, None, max_length
487
def test_inputs_embeds(self):
    """The model must accept `decoder_inputs_embeds` in place of `decoder_input_ids`."""
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        model = model_class(config).to(torch_device).eval()

        model_inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))

        # Swap the decoder token ids for their embeddings; the mask is dropped with them.
        token_ids = model_inputs.pop("decoder_input_ids", None)
        model_inputs.pop("decoder_attention_mask", None)

        embedding_layer = model.get_input_embeddings()
        model_inputs["decoder_inputs_embeds"] = embedding_layer(token_ids)

        with torch.no_grad():
            model(**model_inputs)[0]
506
# training is not supported yet
def test_training(self):
    pass

def test_training_gradient_checkpointing(self):
    pass

@unittest.skip(
    reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant(self):
    pass

@unittest.skip(
    reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant_false(self):
    pass

def test_generate_with_head_masking(self):
    # Overridden as a no-op: the common head-masking generation test is disabled here.
    pass
528
@require_torch_fp16
def test_generate_fp16(self):
    """Greedy and beam-sample generation must run end-to-end with a half-precision model."""
    config, input_dict = self.model_tester.prepare_config_and_inputs()
    config.max_target_positions = 400
    input_features = input_dict["input_features"]
    model = WhisperForConditionalGeneration(config).eval().to(torch_device)
    # Cast both the inputs and the weights to fp16 before generating.
    input_features = input_features.half()
    model.half()
    model.generate(input_features)
    model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
539
def test_generate_language(self):
    """`language` may be given as a code ("en"), a token ("<|en|>"), or a name ("English")."""
    config, input_dict = self.model_tester.prepare_config_and_inputs()
    input_features = input_dict["input_features"]
    model = WhisperForConditionalGeneration(config).to(torch_device)
    # Hack to keep the test fast and not require downloading a model with a generation_config
    model.generation_config.__setattr__("lang_to_id", {"<|en|>": 1})
    model.generation_config.__setattr__("task_to_id", {"transcribe": 2})

    # test language code
    model.generate(input_features, language="en")
    # test tokenizer code
    model.generate(input_features, language="<|en|>")
    # test language name
    model.generate(input_features, language="English")
554
555def test_forward_signature(self):
556config, _ = self.model_tester.prepare_config_and_inputs_for_common()
557
558for model_class in self.all_model_classes:
559model = model_class(config)
560signature = inspect.signature(model.forward)
561# signature.parameters is an OrderedDict => so arg_names order is deterministic
562arg_names = [*signature.parameters.keys()]
563
564expected_arg_names = [
565"input_features",
566"attention_mask",
567"decoder_input_ids",
568"decoder_attention_mask",
569]
570expected_arg_names.extend(
571["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"]
572if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names
573else ["encoder_outputs"]
574)
575self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
576
def test_hidden_states_output(self):
    """Hidden-state outputs must have one entry per layer (+ embeddings) and the
    subsampled sequence length on the encoder side."""

    def check_hidden_states_output(inputs_dict, config, model_class):
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        with torch.no_grad():
            outputs = model(**self._prepare_for_class(inputs_dict, model_class))

        hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

        # Embedding output + one per hidden layer.
        expected_num_layers = getattr(
            self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
        )
        self.assertEqual(len(hidden_states), expected_num_layers)

        if hasattr(self.model_tester, "encoder_seq_length"):
            seq_length = self.model_tester.encoder_seq_length
        else:
            seq_length = self.model_tester.seq_length

        # The conv front-end shortens the time dimension before the transformer layers.
        subsampled_seq_length = model._get_feat_extract_output_lengths(seq_length)

        self.assertListEqual(
            list(hidden_states[0].shape[-2:]),
            [subsampled_seq_length, self.model_tester.hidden_size],
        )

        if config.is_encoder_decoder:
            hidden_states = outputs.decoder_hidden_states

            self.assertIsInstance(hidden_states, (list, tuple))
            self.assertEqual(len(hidden_states), expected_num_layers)

            decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 1)

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [decoder_seq_length, self.model_tester.hidden_size],
            )

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        inputs_dict["output_hidden_states"] = True
        check_hidden_states_output(inputs_dict, config, model_class)

        # check that output_hidden_states also work using config
        del inputs_dict["output_hidden_states"]
        config.output_hidden_states = True

        check_hidden_states_output(inputs_dict, config, model_class)
629
def test_attention_outputs(self):
    """Attention outputs must have one entry per layer, with encoder lengths
    subsampled by the conv front-end, via both the kwargs and the config flags."""
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.return_dict = True

    seq_len = getattr(self.model_tester, "seq_length", None)
    decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 1)
    encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
    decoder_key_length = getattr(self.model_tester, "decoder_key_length", 1)
    encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)

    for model_class in self.all_model_classes:
        inputs_dict["output_attentions"] = True
        inputs_dict["output_hidden_states"] = False
        config.return_dict = True
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        subsampled_encoder_seq_length = model._get_feat_extract_output_lengths(encoder_seq_length)
        subsampled_encoder_key_length = model._get_feat_extract_output_lengths(encoder_key_length)

        with torch.no_grad():
            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
        attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
        self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

        # check that output_attentions also work using config
        del inputs_dict["output_attentions"]
        config.output_attentions = True
        model = model_class(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
        attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
        self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

        self.assertListEqual(
            list(attentions[0].shape[-3:]),
            [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length],
        )
        out_len = len(outputs)

        # last_hidden_state, encoder/decoder attentions, cross attentions, encoder state
        correct_outlen = 5

        # loss is at first position
        if "labels" in inputs_dict:
            correct_outlen += 1  # loss is added to beginning
        if "past_key_values" in outputs:
            correct_outlen += 1  # past_key_values have been returned

        self.assertEqual(out_len, correct_outlen)

        # decoder attentions
        decoder_attentions = outputs.decoder_attentions
        self.assertIsInstance(decoder_attentions, (list, tuple))
        self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
        self.assertListEqual(
            list(decoder_attentions[0].shape[-3:]),
            [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
        )

        # cross attentions
        cross_attentions = outputs.cross_attentions
        self.assertIsInstance(cross_attentions, (list, tuple))
        self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
        self.assertListEqual(
            list(cross_attentions[0].shape[-3:]),
            [
                self.model_tester.num_attention_heads,
                decoder_seq_length,
                subsampled_encoder_key_length,
            ],
        )

        # Check attention is always last and order is fine
        inputs_dict["output_attentions"] = True
        inputs_dict["output_hidden_states"] = True
        model = model_class(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            outputs = model(**self._prepare_for_class(inputs_dict, model_class))

        # Requesting hidden states adds one encoder and one decoder entry to the outputs.
        added_hidden_states = 2
        self.assertEqual(out_len + added_hidden_states, len(outputs))

        self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions

        self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
        self.assertListEqual(
            list(self_attentions[0].shape[-3:]),
            [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length],
        )
724
def test_resize_tokens_embeddings(self):
    """Resizing token embeddings up/down must update the config and the matrix,
    keep the model runnable, and leave the surviving rows untouched."""
    (
        original_config,
        inputs_dict,
    ) = self.model_tester.prepare_config_and_inputs_for_common()
    if not self.test_resize_embeddings:
        return

    for model_class in self.all_model_classes:
        config = copy.deepcopy(original_config)
        model = model_class(config)
        model.to(torch_device)

        if self.model_tester.is_training is False:
            model.eval()

        model_vocab_size = config.vocab_size
        # Retrieve the embeddings and clone them
        model_embed = model.resize_token_embeddings(model_vocab_size)
        cloned_embeddings = model_embed.weight.clone()

        # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
        model_embed = model.resize_token_embeddings(model_vocab_size + 10)
        self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
        # Check that it actually resizes the embeddings matrix
        self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
        # Check that the model can still do a forward pass successfully (every parameter should be resized)
        model(**self._prepare_for_class(inputs_dict, model_class))

        # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
        model_embed = model.resize_token_embeddings(model_vocab_size - 15)
        self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
        # Check that it actually resizes the embeddings matrix
        self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)

        # make sure that decoder_input_ids are resized
        if "decoder_input_ids" in inputs_dict:
            inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
        model(**self._prepare_for_class(inputs_dict, model_class))

        # Check that adding and removing tokens has not modified the first part of the embedding matrix.
        models_equal = True
        for p1, p2 in zip(cloned_embeddings, model_embed.weight):
            if p1.data.ne(p2.data).sum() > 0:
                models_equal = False

        self.assertTrue(models_equal)
772
def test_resize_embeddings_untied(self):
    """Same resize checks when input and output embeddings are untied, so the
    output projection (and its bias, if any) must be resized too."""
    (
        original_config,
        inputs_dict,
    ) = self.model_tester.prepare_config_and_inputs_for_common()
    if not self.test_resize_embeddings:
        return

    original_config.tie_word_embeddings = False

    # if model cannot untie embeddings -> leave test
    if original_config.tie_word_embeddings:
        return

    for model_class in self.all_model_classes:
        config = copy.deepcopy(original_config)
        model = model_class(config).to(torch_device)

        # if no output embeddings -> leave test
        if model.get_output_embeddings() is None:
            continue

        # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
        model_vocab_size = config.vocab_size
        model.resize_token_embeddings(model_vocab_size + 10)
        self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
        output_embeds = model.get_output_embeddings()
        self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
        # Check bias if present
        if output_embeds.bias is not None:
            self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
        # Check that the model can still do a forward pass successfully (every parameter should be resized)
        model(**self._prepare_for_class(inputs_dict, model_class))

        # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
        model.resize_token_embeddings(model_vocab_size - 15)
        self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
        # Check that it actually resizes the embeddings matrix
        output_embeds = model.get_output_embeddings()
        self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
        # Check bias if present
        if output_embeds.bias is not None:
            self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
        # Keep decoder_input_ids inside the shrunk vocabulary before the forward pass
        if "decoder_input_ids" in inputs_dict:
            inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
        # Check that the model can still do a forward pass successfully (every parameter should be resized)
        model(**self._prepare_for_class(inputs_dict, model_class))
821
def test_generate_without_input_ids(self):
    # Overridden as a no-op: the inherited test is disabled for this suite.
    pass
824
@staticmethod
def _get_encoder_outputs(
    model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1
):
    """Run the encoder once and build decoder-start ids, for generation tests that
    pass `encoder_outputs` explicitly. Returns (encoder_outputs, input_ids, None)."""
    encoder = model.get_encoder()
    encoder_outputs = encoder(
        input_ids,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
    )
    # Repeat the encoder states along the batch dim (`num_interleave` copies per sequence).
    encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave(
        num_interleave, dim=0
    )
    # Drop the feature dim, then replace the inputs with one decoder start token per sequence.
    input_ids = input_ids[:, :, 0]
    input_ids = torch.zeros_like(input_ids[:, :1], dtype=torch.long) + torch.tensor(
        [model._get_decoder_start_token_id()], device=input_ids.device
    )
    attention_mask = None
    return encoder_outputs, input_ids, attention_mask
844
def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1):
    """Validate the shapes of scores, attentions and hidden states returned by
    `generate()`, accounting for the encoder's subsampled sequence length."""
    batch_size, mel, seq_length = input_ids.shape
    subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)
    num_sequences_in_output = batch_size * num_return_sequences
    # Encoder-decoder models generate from a single start token; decoder-only
    # models prepend the prompt, hence the different length offset.
    gen_len = (
        output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length
    )

    # scores
    self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config)

    # Attentions
    # encoder
    self._check_encoder_attention_for_generate(
        output.encoder_attentions, batch_size, config, subsampled_seq_length
    )
    # decoder
    self._check_attentions_for_generate(
        num_sequences_in_output,
        output.decoder_attentions,
        min_length=1,
        max_length=output.sequences.shape[-1],
        config=config,
        use_cache=use_cache,
    )

    # Hidden States
    # encoder
    self._check_encoder_hidden_states_for_generate(
        output.encoder_hidden_states, batch_size, config, subsampled_seq_length
    )

    # decoder
    self._check_hidden_states_for_generate(
        num_sequences_in_output,
        output.decoder_hidden_states,
        min_length=1,
        max_length=output.sequences.shape[-1],
        config=config,
        use_cache=use_cache,
    )
886
@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@slow
def test_flash_attn_2_inference(self):
    """Check that Flash Attention 2 matches eager attention within tolerance.

    Each supporting model class is saved and reloaded twice (once with
    `attn_implementation="flash_attention_2"`, once eager, both in bf16) and
    the final decoder hidden states are compared.
    """
    import torch

    for model_class in self.all_model_classes:
        if not model_class._supports_flash_attn_2:
            # Bugfix: `return` here silently skipped every remaining model
            # class as well; only skip the unsupported one.
            continue

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        model = model_class(config)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)
            model_fa = model_class.from_pretrained(
                tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
            )
            model_fa.to(torch_device)

            model = model_class.from_pretrained(
                tmpdirname,
                torch_dtype=torch.bfloat16,
            )
            model.to(torch_device)

            dummy_input = inputs_dict[model.main_input_name][:1]
            if dummy_input.dtype in [torch.float32, torch.float16]:
                dummy_input = dummy_input.to(torch.bfloat16)

            decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[:1]

            outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True)
            outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True)

            logits = outputs.decoder_hidden_states[-1]
            logits_fa = outputs_fa.decoder_hidden_states[-1]

            # whisper FA2 needs very high tolerance
            assert torch.allclose(logits_fa, logits, atol=4e-1)

            # check that a forward pass in train mode (dropout active) also runs.
            # Bugfix: the train flag must be set on the FA2 model that is actually
            # called (previously `model.train()` was toggled instead, which had
            # no effect on `model_fa`).
            model_fa.train()
            _ = model_fa(dummy_input, decoder_input_ids=decoder_input_ids)
932
@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@slow
def test_flash_attn_2_inference_padding_right(self):
    """Check FA2 vs eager equivalence when the decoder attention mask masks out
    the leading positions, comparing only the attended tail positions."""
    import torch

    for model_class in self.all_model_classes:
        if not model_class._supports_flash_attn_2:
            # Bugfix: `return` here silently skipped every remaining model
            # class as well; only skip the unsupported one.
            continue

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        model = model_class(config)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)
            model_fa = model_class.from_pretrained(
                tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2"
            )
            model_fa.to(torch_device)

            model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16)
            model.to(torch_device)

            dummy_input = inputs_dict[model.main_input_name][:1]
            dummy_input = dummy_input.to(torch.float16)

            decoder_input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]], device=dummy_input.device, dtype=torch.long)
            # mask out the first three decoder positions; only the last three are attended
            decoder_attention_mask = torch.tensor(
                [[0, 0, 0, 1, 1, 1]], device=dummy_input.device, dtype=torch.long
            )

            # first compare without any decoder mask
            outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True)
            outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True)

            logits = outputs.decoder_hidden_states[-1]
            logits_fa = outputs_fa.decoder_hidden_states[-1]

            # whisper FA2 needs very high tolerance
            assert torch.allclose(logits_fa, logits, atol=4e-1)

            other_inputs = {
                "decoder_input_ids": decoder_input_ids,
                "decoder_attention_mask": decoder_attention_mask,
                "output_hidden_states": True,
            }

            outputs = model(dummy_input, **other_inputs)
            outputs_fa = model_fa(dummy_input, **other_inputs)

            logits = outputs.decoder_hidden_states[-1]
            logits_fa = outputs_fa.decoder_hidden_states[-1]

            # with the mask applied, only the unmasked tail positions are comparable
            # whisper FA2 needs very high tolerance
            assert torch.allclose(logits_fa[:, -2:], logits[:, -2:], atol=4e-1)
988
def _create_and_check_torchscript(self, config, inputs_dict):
    """Trace each model class with torch.jit, round-trip it through save/load,
    and verify the loaded module's parameters and buffers match the original."""
    if not self.test_torchscript:
        return

    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
    configs_no_init.torchscript = True
    # tracing requires the eager attention implementation
    configs_no_init._attn_implementation = "eager"
    for model_class in self.all_model_classes:
        model = model_class(config=configs_no_init)
        model.to(torch_device)
        model.eval()
        inputs = self._prepare_for_class(inputs_dict, model_class)

        try:
            model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
            input_features = inputs["input_features"]
            decoder_input_ids = inputs["decoder_input_ids"]
            decoder_attention_mask = inputs["decoder_attention_mask"]
            # prepare `attention_mask` with shape (batch_size, sequence_length)
            # NOTE(review): the mask is created with the float dtype of
            # `input_features` rather than an integer dtype — confirm this is
            # what the traced forward expects.
            attention_mask = torch.ones(
                input_features.shape[0],
                input_features.shape[-1],
                device=input_features.device,
                dtype=input_features.dtype,
            )
            traced_model = torch.jit.trace(
                model, (input_features, attention_mask, decoder_input_ids, decoder_attention_mask)
            )

        except RuntimeError:
            self.fail("Couldn't trace module.")

        with tempfile.TemporaryDirectory() as tmp_dir_name:
            pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")

            try:
                torch.jit.save(traced_model, pt_file_name)
            except Exception:
                self.fail("Couldn't save module.")

            try:
                loaded_model = torch.jit.load(pt_file_name)
            except Exception:
                self.fail("Couldn't load module.")

            model.to(torch_device)
            model.eval()

            loaded_model.to(torch_device)
            loaded_model.eval()

            model_state_dict = model.state_dict()
            loaded_model_state_dict = loaded_model.state_dict()

            # torch.jit.load serializes non-persistent buffers that the eager
            # state_dict omits; collect them so the key comparison is fair
            non_persistent_buffers = {}
            for key in loaded_model_state_dict.keys():
                if key not in model_state_dict.keys():
                    non_persistent_buffers[key] = loaded_model_state_dict[key]

            loaded_model_state_dict = {
                key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
            }

            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))

            # every non-persistent buffer of the loaded model must match some
            # buffer of the original (by value); each match is consumed once
            model_buffers = list(model.buffers())
            for non_persistent_buffer in non_persistent_buffers.values():
                found_buffer = False
                for i, model_buffer in enumerate(model_buffers):
                    if torch.equal(non_persistent_buffer, model_buffer):
                        found_buffer = True
                        break

                self.assertTrue(found_buffer)
                model_buffers.pop(i)

            # finally, every parameter tensor must be bit-identical
            models_equal = True
            for layer_name, p1 in model_state_dict.items():
                p2 = loaded_model_state_dict[layer_name]
                if p1.data.ne(p2.data).sum() > 0:
                    models_equal = False

            self.assertTrue(models_equal)
1072
def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=5e-5, name="outputs", attributes=None):
    # Whisper's PT/TF comparison proved flaky at the default tolerance, so a
    # slightly looser default is forwarded to the generic checker.
    super().check_pt_tf_outputs(
        tf_outputs, pt_outputs, model_class, tol=tol, name=name, attributes=attributes
    )
1076
def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=5e-5, name="outputs", attributes=None):
    # Whisper's PT/Flax comparison proved flaky at the default tolerance, so a
    # slightly looser default is forwarded to the generic checker.
    super().check_pt_flax_outputs(
        fx_outputs, pt_outputs, model_class, tol=tol, name=name, attributes=attributes
    )
1080
@is_pt_flax_cross_test
def test_equivalence_pt_to_flax(self):
    """Convert PyTorch weights to Flax and check both frameworks produce the
    same outputs, including after a save_pretrained / from_pretrained(from_pt=True)
    round trip."""
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    init_shape = (1,) + inputs_dict["input_features"].shape[1:]

    for model_class in self.all_model_classes:
        with self.subTest(model_class.__name__):
            fx_model_class_name = "Flax" + model_class.__name__

            if not hasattr(transformers, fx_model_class_name):
                # Bugfix: `return` here aborted the whole test at the first
                # model class without a Flax counterpart; only skip that class.
                continue

            # Output all for aggressive testing
            config.output_hidden_states = True
            config.output_attentions = self.has_attentions

            fx_model_class = getattr(transformers, fx_model_class_name)

            # load PyTorch class
            pt_model = model_class(config).eval()
            # Flax models don't use the `use_cache` option and cache is not returned as a default.
            # So we disable `use_cache` here for PyTorch model.
            pt_model.config.use_cache = False

            # load Flax class
            fx_model = fx_model_class(config, input_shape=init_shape, dtype=jnp.float32)

            # make sure only flax inputs are forward that actually exist in function args
            fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()

            # prepare inputs
            pt_inputs = self._prepare_for_class(inputs_dict, model_class)

            # remove function args that don't exist in Flax
            pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}

            # send pytorch inputs to the correct device
            pt_inputs = {
                k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items()
            }

            # convert inputs to Flax
            fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}

            fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
            fx_model.params = fx_state

            # send pytorch model to the correct device
            pt_model.to(torch_device)

            with torch.no_grad():
                pt_outputs = pt_model(**pt_inputs)
            fx_outputs = fx_model(**fx_inputs)

            fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
            pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])

            self.assertEqual(fx_keys, pt_keys)
            self.check_pt_flax_outputs(fx_outputs, pt_outputs, model_class)

            with tempfile.TemporaryDirectory() as tmpdirname:
                pt_model.save_pretrained(tmpdirname)
                fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, input_shape=init_shape, from_pt=True)

            fx_outputs_loaded = fx_model_loaded(**fx_inputs)

            fx_keys = tuple([k for k, v in fx_outputs_loaded.items() if v is not None])
            pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])

            self.assertEqual(fx_keys, pt_keys)
            self.check_pt_flax_outputs(fx_outputs_loaded, pt_outputs, model_class)
1153
@is_pt_flax_cross_test
def test_equivalence_flax_to_pt(self):
    """Load Flax weights into a PyTorch model and check both frameworks produce
    the same outputs, including after a save_pretrained / from_pretrained(from_flax=True)
    round trip."""
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    init_shape = (1,) + inputs_dict["input_features"].shape[1:]

    for model_class in self.all_model_classes:
        with self.subTest(model_class.__name__):
            fx_model_class_name = "Flax" + model_class.__name__

            if not hasattr(transformers, fx_model_class_name):
                # Bugfix: `return` here aborted the whole test at the first
                # model class without a Flax counterpart; only skip that class.
                continue

            # Output all for aggressive testing
            config.output_hidden_states = True
            config.output_attentions = self.has_attentions

            fx_model_class = getattr(transformers, fx_model_class_name)

            # load PyTorch class
            pt_model = model_class(config).eval()
            # Flax models don't use the `use_cache` option and cache is not returned as a default.
            # So we disable `use_cache` here for PyTorch model.
            pt_model.config.use_cache = False

            # load Flax class
            fx_model = fx_model_class(config, input_shape=init_shape, dtype=jnp.float32)

            # make sure only flax inputs are forward that actually exist in function args
            fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()

            # prepare inputs
            pt_inputs = self._prepare_for_class(inputs_dict, model_class)

            # remove function args that don't exist in Flax
            pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}

            # send pytorch inputs to the correct device
            pt_inputs = {
                k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items()
            }

            # convert inputs to Flax
            fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}

            pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)

            # make sure weights are tied in PyTorch
            pt_model.tie_weights()

            # send pytorch model to the correct device
            pt_model.to(torch_device)

            with torch.no_grad():
                pt_outputs = pt_model(**pt_inputs)
            fx_outputs = fx_model(**fx_inputs)

            fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
            pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])

            self.assertEqual(fx_keys, pt_keys)
            self.check_pt_flax_outputs(fx_outputs, pt_outputs, model_class)

            with tempfile.TemporaryDirectory() as tmpdirname:
                fx_model.save_pretrained(tmpdirname)
                pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True)

            # send pytorch model to the correct device
            pt_model_loaded.to(torch_device)
            pt_model_loaded.eval()

            with torch.no_grad():
                pt_outputs_loaded = pt_model_loaded(**pt_inputs)

            fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
            pt_keys = tuple([k for k, v in pt_outputs_loaded.items() if v is not None])

            self.assertEqual(fx_keys, pt_keys)
            self.check_pt_flax_outputs(fx_outputs, pt_outputs_loaded, model_class)
1233
def test_mask_feature_prob(self):
    """SpecAugment feature masking in train mode must still yield encoder
    outputs of the expected (batch, subsampled_frames, hidden) shape."""
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.mask_feature_prob = 0.2
    config.mask_feature_length = 2

    for model_class in self.all_model_classes:
        model = model_class(config)
        model.to(torch_device)
        # masking is only applied in training mode
        model.train()

        # forward pass
        encoder_last_hidden_state = model(**input_dict).encoder_last_hidden_state
        # Bugfix: `assertTrue(shape, tuple)` only tested the truthiness of the
        # shape (the tuple was silently used as the failure message); use
        # assertEqual to actually compare against the expected shape.
        self.assertEqual(encoder_last_hidden_state.shape, (13, 30, 16))
1247
def test_mask_time_prob(self):
    """SpecAugment time masking in train mode must still yield encoder
    outputs of the expected (batch, subsampled_frames, hidden) shape."""
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.mask_time_prob = 0.2
    config.mask_time_length = 2

    for model_class in self.all_model_classes:
        model = model_class(config)
        model.to(torch_device)
        # masking is only applied in training mode
        model.train()

        # forward pass
        encoder_last_hidden_state = model(**input_dict).encoder_last_hidden_state
        # Bugfix: `assertTrue(shape, tuple)` only tested the truthiness of the
        # shape (the tuple was silently used as the failure message); use
        # assertEqual to actually compare against the expected shape.
        self.assertEqual(encoder_last_hidden_state.shape, (13, 30, 16))
1261
def test_generate_with_prompt_ids_and_task_and_language(self):
    """Every generated row must start with the prompt ids followed by
    <decoder_start><language><task>."""
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    model = WhisperForConditionalGeneration(config).eval().to(torch_device)
    input_features = input_dict["input_features"]
    prompt_ids = torch.arange(5).to(torch_device)
    language = "<|de|>"
    task = "translate"
    lang_id = 6
    task_id = 7
    # register fake language/task token maps on the generation config
    model.generation_config.lang_to_id = {language: lang_id}
    model.generation_config.task_to_id = {task: task_id}

    output = model.generate(input_features, max_new_tokens=5, task=task, language=language, prompt_ids=prompt_ids)

    expected_prefix = [
        *prompt_ids.tolist(),
        model.generation_config.decoder_start_token_id,
        lang_id,
        task_id,
    ]
    for row in output.tolist():
        self.assertListEqual(row[: len(expected_prefix)], expected_prefix)
1284
def test_generate_with_prompt_ids_and_forced_decoder_ids(self):
    """Forced decoder tokens must appear right after the prompt ids and the
    decoder start token in every generated row."""
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    model = WhisperForConditionalGeneration(config).eval().to(torch_device)
    input_features = input_dict["input_features"]
    prompt_ids = torch.arange(5).to(torch_device)
    # (position, token) pairs forced at decoding positions 1-3
    forced_decoder_ids = [(1, 6), (2, 7), (3, 8)]

    output = model.generate(
        input_features, max_new_tokens=5, forced_decoder_ids=forced_decoder_ids, prompt_ids=prompt_ids
    )

    forced_tokens = [token for _rank, token in forced_decoder_ids]
    expected_prefix = [
        *prompt_ids.tolist(),
        model.generation_config.decoder_start_token_id,
        *forced_tokens,
    ]
    for row in output.tolist():
        self.assertListEqual(row[: len(expected_prefix)], expected_prefix)
1303
def test_generate_with_prompt_ids_max_length(self):
    """Generation must raise when prompt length + `max_new_tokens` exceeds
    `max_target_positions`, and succeed when it fits."""
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.max_target_positions = 7

    model = WhisperForConditionalGeneration(config).eval().to(torch_device)
    input_features = input_dict["input_features"]
    decoder_input_ids = torch.arange(5).to(torch_device)
    prompt_ids = decoder_input_ids[:4]
    max_new_tokens = 8

    # the expected message must stay in sync with the error raised by
    # WhisperForConditionalGeneration.generate
    with self.assertRaisesRegex(
        ValueError,
        f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` "
        f"is {max_new_tokens}. Thus, the combined length of "
        f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the "
        f"`max_target_positions` of the Whisper model: {config.max_target_positions}. "
        "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, "
        f"so that their combined length is less than {config.max_target_positions}.",
    ):
        model.generate(input_features, max_new_tokens=max_new_tokens, prompt_ids=prompt_ids)

    # with a small enough `max_new_tokens` the same prompt must generate fine
    model.generate(input_features, max_new_tokens=1, prompt_ids=prompt_ids)
1326
def test_generate_longform_with_prompt_ids(self):
    """In long-form generation, prompt ids must NOT leak into the returned
    sequences, for both prompt-conditioning modes and both tasks."""
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    model = WhisperForConditionalGeneration(config).eval().to(torch_device)

    prompt_ids = torch.arange(5).to(torch_device)
    model.generation_config.no_timestamps_token_id = 11
    model.generation_config.pad_token_id = 10

    # make sure prompt token ids [0-9] can't be generated
    model.generation_config.suppress_tokens = list(range(10))

    input_features = input_dict["input_features"]

    language = "<|de|>"
    lang_id = 6

    # repeat along the frame axis so the input is long enough to trigger the
    # long-form (chunked) generation path
    input_features = input_features.repeat(1, 1, 50)
    attention_mask = torch.ones_like(input_features, dtype=torch.long)[:, 0]

    for prompt_type in ["first-segment", "all-segments"]:
        for task_id, task in enumerate(["translate", "transcribe"]):
            task_id = 7 + task_id

            model.generation_config.__setattr__("lang_to_id", {language: lang_id})
            model.generation_config.__setattr__("task_to_id", {task: task_id})

            output = model.generate(
                input_features,
                attention_mask=attention_mask,
                prompt_condition_type=prompt_type,
                max_new_tokens=5,
                task=task,
                language=language,
                prompt_ids=prompt_ids,
                condition_on_prev_tokens=True,
            )
            for row in output.tolist():
                # make sure no token below 10 is in generated output => this means for long-form prompt ids should NOT be returned
                assert not any(i in row for i in model.generation_config.suppress_tokens)
1366
def _check_longform_generate_single_batch(self, condition_on_prev_tokens):
    """Exercise single-sample long-form generation end to end.

    Uses DummyTimestampLogitProcessor (defined elsewhere in this file) to force
    timestamp tokens, checks the required-argument errors, then validates the
    returned segments (ordering, timestamp presence, max length).
    """
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()

    model = WhisperForConditionalGeneration(config).eval().to(torch_device)
    input_features = input_dict["input_features"]

    # len = 250 with num_input_frames = 60
    long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)

    # force bsz=1
    long_input_features = long_input_features[:1]
    vocab_size = model.config.vocab_size

    batch_size = 1
    num_timestamp_tokens = 20
    max_length = 16
    logits_processor = [
        DummyTimestampLogitProcessor(
            vocab_size - num_timestamp_tokens,
            vocab_size,
            batch_size=batch_size,
            max_length=max_length,
            min_space=4,
        )
    ]

    # each chunk should not be longer than 10
    model.generation_config.max_length = max_length

    # if input features are long can't set return_timestamps to False
    with self.assertRaises(ValueError):
        _ = model.generate(long_input_features, logits_processor=logits_processor, return_timestamps=False)

    # if input features are long need to set generation config
    with self.assertRaises(ValueError):
        _ = model.generate(long_input_features, logits_processor=logits_processor)

    # configure a fake timestamp vocabulary at the top of the token range
    timestamp_begin = vocab_size - num_timestamp_tokens
    model.generation_config.no_timestamps_token_id = timestamp_begin - 1
    model.generation_config.eos_token_id = None
    model.config.eos_token_id = None
    model.generation_config._detect_timestamp_from_logprob = False
    # make sure that we only have the same begin token
    model.generation_config.max_initial_timestamp_index = 0
    model.generation_config.prev_bos_token_id = timestamp_begin - 3

    gen_kwargs = {
        "logits_processor": logits_processor,
        "return_segments": True,
        "condition_on_prev_tokens": condition_on_prev_tokens,
    }

    if condition_on_prev_tokens:
        # thresholds enabling temperature fallback, as in the Whisper paper
        gen_kwargs["no_speech_threshold"] = 0.6
        gen_kwargs["temperature"] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
        gen_kwargs["compression_ratio_threshold"] = 2.4
        gen_kwargs["logprob_threshold"] = -1.0

    outputs = model.generate(long_input_features, **gen_kwargs)

    segments = outputs["segments"][0]

    for _, segment in enumerate(segments):
        assert segment["start"] <= segment["end"], "start has to be smaller equal end"
        assert any(
            s > timestamp_begin for s in segment["tokens"][1:]
        ), f"At least one segment token should be a timestamp token, but not first., {segment['tokens']}"
        assert (
            segment["tokens"].shape[-1] <= max_length
        ), "make sure that no segment is larger than max generation length"
1437
def test_longform_generate_single_batch(self):
    # Single-sample long-form generation without conditioning on previous tokens.
    self._check_longform_generate_single_batch(False)
1440
def test_longform_generate_single_batch_cond_prev(self):
    # Single-sample long-form generation, conditioning each chunk on previous tokens.
    self._check_longform_generate_single_batch(True)
1443
def _check_longform_generate_multi_batch(self, condition_on_prev_tokens):
    """Exercise batched long-form generation and check it matches the
    single-sample run for the same (second) input."""
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()

    model = WhisperForConditionalGeneration(config).eval().to(torch_device)
    input_features = input_dict["input_features"].to(torch_device)

    # len = 250 with num_input_frames = 60
    long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)
    # second sample alone (batch size 1) — used as the reference run below
    input_features_2 = long_input_features[1:]
    attention_mask = torch.ones(
        (2, long_input_features.shape[-1]), dtype=input_features.dtype, device=input_features.device
    )
    # the first sample in the batch is shorter: frames past 200 are padding
    attention_mask[0, 200:] = 0

    # configure a fake timestamp vocabulary at the top of the token range
    vocab_size = model.config.vocab_size

    batch_size = 1
    num_timestamp_tokens = 20
    max_new_tokens = 16
    timestamp_begin = vocab_size - num_timestamp_tokens
    model.generation_config.no_timestamps_token_id = timestamp_begin - 1
    model.generation_config.eos_token_id = None
    model.config.eos_token_id = None
    model.generation_config._detect_timestamp_from_logprob = False
    # make sure that we only have the same begin token
    model.generation_config.max_initial_timestamp_index = 0
    model.generation_config.max_new_tokens = max_new_tokens
    model.generation_config.prev_bos_token_id = timestamp_begin - 3

    # reference: generate the second sample on its own (bsz=1, seed=1)
    logits_processor = [
        DummyTimestampLogitProcessor(
            vocab_size - num_timestamp_tokens,
            vocab_size,
            batch_size=batch_size,
            max_length=max_new_tokens,
            min_space=4,
            seed=1,
        )
    ]
    outputs_2 = model.generate(
        input_features_2,
        max_new_tokens=max_new_tokens,
        logits_processor=logits_processor,
        condition_on_prev_tokens=condition_on_prev_tokens,
        return_segments=True,
    )
    tokens_2 = outputs_2["sequences"][0]
    segments_2 = outputs_2["segments"][0]

    # batched run over both samples (bsz=2, seed=0)
    batch_size = 2
    logits_processor = [
        DummyTimestampLogitProcessor(
            vocab_size - num_timestamp_tokens,
            vocab_size,
            batch_size=batch_size,
            max_length=max_new_tokens,
            min_space=4,
            seed=0,
        )
    ]
    gen_kwargs = {
        "logits_processor": logits_processor,
        "return_segments": True,
        "condition_on_prev_tokens": condition_on_prev_tokens,
        "attention_mask": attention_mask,
        "max_new_tokens": max_new_tokens,
    }

    outputs = model.generate(long_input_features, **gen_kwargs)
    # index 1 selects the second sample, matching the reference run
    tokens = outputs["sequences"][1]
    segments = outputs["segments"][1]

    # make sure batched and non-batched is the same
    assert tokens_2.tolist() == tokens[: tokens_2.shape[-1]].tolist()

    for seg1, seg2 in zip(segments_2, segments):
        assert seg1["start"] == seg2["start"]
        assert seg1["end"] == seg2["end"]
        assert seg1["tokens"].tolist() == seg2["tokens"].tolist()
1524
def test_longform_generate_multi_batch(self):
    # Batched long-form generation without conditioning on previous tokens.
    self._check_longform_generate_multi_batch(False)
1527
def test_longform_generate_multi_batch_cond_prev(self):
    # Batched long-form generation, conditioning each chunk on previous tokens.
    self._check_longform_generate_multi_batch(True)
1530
1531
1532@require_torch
1533@require_torchaudio
1534class WhisperModelIntegrationTests(unittest.TestCase):
@cached_property
def default_processor(self):
    """Whisper-base processor shared by the integration tests (loaded once, then cached)."""
    checkpoint = "openai/whisper-base"
    return WhisperProcessor.from_pretrained(checkpoint)
1538
def _load_datasamples(self, num_samples):
    """Return the decoded waveforms of the first `num_samples` dummy LibriSpeech clips."""
    dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    # sorting by id keeps the selection deterministic; audio is decoded on access
    audio_samples = dataset.sort("id").select(range(num_samples))[:num_samples]["audio"]
    return [sample["array"] for sample in audio_samples]
1545
@slow
def test_tiny_logits_librispeech(self):
    """Compare whisper-tiny decoder hidden states (and their projection through
    the tied embedding) on one LibriSpeech sample against reference values."""
    torch_device = "cpu"  # shadows the module-level device so the reference numbers are reproducible
    set_seed(0)
    model = WhisperModel.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)
    input_speech = self._load_datasamples(1)
    feature_extractor = WhisperFeatureExtractor()
    input_features = feature_extractor(input_speech, return_tensors="pt").input_features

    with torch.no_grad():
        logits = model(
            input_features,
            # <|startoftranscript|><|en|><|transcribe|>
            decoder_input_ids=torch.tensor([[50258, 50259, 50359]]),
            output_hidden_states=False,
            output_attentions=False,
            return_dict=False,
            use_cache=False,
        )

    # fmt: off
    EXPECTED_LOGITS = torch.tensor(
        [
            2.9892, -6.7607, 5.7348, 3.6096, 0.2152, -5.7321, 4.8855, -1.6407,
            0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246,
            4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3713,
            0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841
        ]
    )
    # fmt: on
    # with return_dict=False, logits[0] is the decoder's last hidden state
    self.assertTrue(torch.allclose(logits[0][0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))

    # fmt: off
    EXPECTED_GENERATION = torch.tensor(
        [
            -1.4651, -2.6944, 2.7821, 2.3793, 4.0738, 0.0188, -3.3203, 1.9836,
            0.0520, 0.7095, 1.1063, 0.2952, -3.6786, -0.5249, 0.3105, 4.7691,
            1.1562, 1.3046, 0.5810, -0.3624, 1.7006, 1.3424, 0.9817, 2.1958,
            1.8775, -5.7046, -0.7679, 4.0113, 2.6848, 2.8609
        ]
    )
    # fmt: on

    # project through the (tied) token embedding to get vocabulary logits
    head_logits = logits[0] @ model.decoder.embed_tokens.weight.T
    self.assertTrue(torch.allclose(head_logits[0, 0, :30].cpu(), EXPECTED_GENERATION, atol=1e-4))
1591
@slow
def test_small_en_logits_librispeech(self):
    """Compare whisper-small.en vocabulary logits on one LibriSpeech sample
    against reference values."""
    set_seed(0)
    torch_device = "cpu"  # shadow the module-level device so reference numbers are reproducible

    model = WhisperModel.from_pretrained("openai/whisper-small.en")
    model.to(torch_device)

    input_speech = self._load_datasamples(1)

    feature_extractor = WhisperFeatureExtractor()  # also fixes the old "feaure_extractor" local typo
    input_features = feature_extractor(input_speech, return_tensors="pt").input_features.to(torch_device)

    outputs = model(
        input_features,
        decoder_input_ids=torch.tensor([[model.config.decoder_start_token_id]]),
        output_hidden_states=False,
        output_attentions=False,
        use_cache=False,
    )

    # project the last hidden state through the tied token embedding
    logits = outputs.last_hidden_state @ model.decoder.embed_tokens.weight.T

    # fmt: off
    EXPECTED_LOGITS = torch.tensor(
        [
            -3.6784, -7.7211, -9.5070, -11.9286, -7.6489, -9.7026, -5.6188,
            -8.0104, -4.6238, -5.1833, -9.0485, -3.4079, -5.4874, -2.6935,
            -6.3479, -7.3398, -6.9558, -7.6867, -7.4748, -8.3463, -9.9781,
            -10.8389, -10.3105, -11.7201, -9.7261, -7.1590, -5.9272, -12.4509,
            -11.1146, -8.1918
        ]
    )
    # fmt: on
    self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
1626
@slow
def test_large_logits_librispeech(self):
    """Compare whisper-large vocabulary logits on one LibriSpeech sample
    (teacher-forced with a text prompt) against reference values."""
    set_seed(0)

    torch_device = "cpu"  # shadows the module-level device so the reference numbers are reproducible
    model = WhisperModel.from_pretrained("openai/whisper-large")
    model.to(torch_device)

    input_speech = self._load_datasamples(1)

    processor = WhisperProcessor.from_pretrained("openai/whisper-large")
    # the processor tokenizes the text into `labels`, used here as decoder inputs
    processed_inputs = processor(
        audio=input_speech, text="This part of the speech", add_special_tokens=False, return_tensors="pt"
    )
    input_features = processed_inputs.input_features.to(torch_device)
    decoder_input_ids = processed_inputs.labels.to(torch_device)

    logits = model(
        input_features,
        decoder_input_ids=decoder_input_ids,
        output_hidden_states=False,
        output_attentions=False,
        use_cache=False,
    )

    # project the last hidden state through the (tied) token embedding
    logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T

    # fmt: off
    EXPECTED_LOGITS = torch.tensor(
        [
            2.1382, 0.9381, 4.4671, 3.5589, 2.4022, 3.8576, -0.6521, 2.5472,
            1.8301, 1.9957, 2.3432, 1.4678, 0.5459, 2.2597, 1.5179, 2.5357,
            1.1624, 0.6194, 1.0757, 1.8259, 2.4076, 1.6601, 2.3503, 1.3376,
            1.9891, 1.8635, 3.8931, 5.3699, 4.4772, 3.9184
        ]
    )
    # fmt: on

    self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
1666
@slow
def test_tiny_en_generation(self):
    """whisper-tiny.en beam-search transcription of one LibriSpeech sample must
    match the reference transcript exactly."""
    torch_device = "cpu"  # shadows the module-level device for reproducibility
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    model.to(torch_device)
    # English-only checkpoint: start generation from <|startoftranscript|>
    model.config.decoder_start_token_id = 50257

    input_speech = self._load_datasamples(1)
    input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
        torch_device
    )

    generated_ids = model.generate(input_features, num_beams=5, max_length=20)
    # special tokens are kept on purpose and are part of the expected string
    transcript = processor.tokenizer.batch_decode(generated_ids)[0]

    EXPECTED_TRANSCRIPT = (
        "<|startoftranscript|><|notimestamps|> Mr. Quilter is the apostle of the middle"
        " classes, and we are glad to"
    )
    self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
1689
@slow
def test_tiny_generation(self):
    """Multilingual whisper-tiny beam-search transcription of one LibriSpeech
    sample must match the reference transcript exactly."""
    torch_device = "cpu"  # shadows the module-level device for reproducibility
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)

    input_speech = self._load_datasamples(1)
    input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
        torch_device
    )

    generated_ids = model.generate(input_features, num_beams=5, max_length=20)
    # special tokens (language/task markers) are kept and part of the expected string
    transcript = processor.tokenizer.decode(generated_ids[0])

    EXPECTED_TRANSCRIPT = (
        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle"
        " classes and we are glad"
    )
    self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
1711
@slow
def test_large_generation(self):
    """whisper-large greedy transcription (explicit English transcribe task) of
    one LibriSpeech sample must match the reference transcript."""
    torch_device = "cpu"  # shadows the module-level device for reproducibility
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-large")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
    model.to(torch_device)

    input_speech = self._load_datasamples(1)
    input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
        torch_device
    )

    generated_ids = model.generate(
        input_features, do_sample=False, max_length=20, language="<|en|>", task="transcribe"
    )
    transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
    self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
1732
@slow
def test_large_generation_multilingual(self):
    # whisper-large on a Japanese Common Voice sample: transcribe in Japanese,
    # force-transcribe in English, then translate Japanese -> English.
    torch_device = "cpu"
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-large")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
    model.to(torch_device)

    token = os.getenv("HF_HUB_READ_TOKEN", True)
    ds = load_dataset("mozilla-foundation/common_voice_6_1", "ja", split="test", streaming=True, token=token)
    ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))

    input_speech = next(iter(ds))["audio"]["array"]
    input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features
    input_features = input_features.to(torch_device)

    # (language token, task, expected transcript) triples, checked in order.
    cases = [
        ("<|ja|>", "transcribe", "木村さんに電話を貸してもらいました"),
        ("<|en|>", "transcribe", " Kimura-san called me."),
        ("<|ja|>", "translate", " I borrowed a phone from Kimura san"),
    ]
    for language, task, expected in cases:
        generated_ids = model.generate(
            input_features, do_sample=False, max_length=20, language=language, task=task
        )
        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        self.assertEqual(transcript, expected)
1773
@slow
def test_large_batched_generation(self):
    """Batched translation with whisper-large: check both the raw generated ids and the decoded text."""
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-large")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")

    input_speech = self._load_datasamples(4)
    input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features
    generated_ids = model.generate(input_features, max_length=20, task="translate")

    # Pinned generated sequences, one row per input sample (capped at max_length=20).
    # NOTE(review): despite the name, these are token ids, not logits.
    # fmt: off
    EXPECTED_LOGITS = torch.tensor(
        [
            [50258, 50259, 50358, 50363, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 293, 321, 366, 5404],
            [50258, 50259, 50358, 50363, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50257],
            [50258, 50259, 50358, 50363, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904],
            [50258, 50259, 50358, 50363, 634, 575, 12525, 22618, 1968, 6144, 35617, 20084, 1756, 311, 589, 307, 534, 10281, 934, 439]
        ]
    )
    # fmt: on

    self.assertTrue(torch.allclose(generated_ids, EXPECTED_LOGITS))

    # Expected decoded text once special tokens are stripped.
    # fmt: off
    EXPECTED_TRANSCRIPT = [
        " Mr. Quilter is the apostle of the middle classes and we are glad",
        " Nor is Mr. Quilter's manner less interesting than his matter.",
        " He tells us that at this festive season of the year, with Christmas and roast",
        " He has grave doubts whether Sir Frederick Layton's work is really Greek after all",
    ]
    # fmt: on

    transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
    self.assertListEqual(transcript, EXPECTED_TRANSCRIPT)
1808
@slow
def test_tiny_en_batched_generation(self):
    """Batched (4-sample) generation with tiny.en: check token ids and decoded text."""
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    model.to(torch_device)

    input_speech = self._load_datasamples(4)
    input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
        torch_device
    )
    generated_ids = model.generate(input_features, max_length=20).to("cpu")

    # Pinned generated sequences, one row per input sample.
    # NOTE(review): despite the name, these are token ids, not logits.
    # fmt: off
    EXPECTED_LOGITS = torch.tensor(
        [
            [50257, 50362, 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262, 3504, 6097, 11, 290, 356, 389, 9675, 284],
            [50257, 50362, 5414, 318, 1770, 13, 2264, 346, 353, 338, 5642, 1342, 3499, 621, 465, 2300, 13, 50256, 50256, 50256],
            [50257, 50362, 679, 4952, 514, 326, 379, 428, 43856, 1622, 286, 262, 614, 11, 351, 6786, 290, 32595, 12023, 28236],
            [50257, 50362, 679, 468, 12296, 17188, 1771, 7361, 26113, 18881, 1122, 338, 670, 318, 1107, 8312, 706, 477, 290, 460]
        ]

    )
    # fmt: on

    self.assertTrue(torch.allclose(generated_ids, EXPECTED_LOGITS))

    # Expected decoded text once special tokens are stripped.
    # fmt: off
    EXPECTED_TRANSCRIPT = [
        " Mr. Quilter is the apostle of the middle classes, and we are glad to",
        " Nor is Mr. Quilter's manner less interesting than his matter.",
        " He tells us that at this festive season of the year, with Christmas and roast beef looming",
        " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can",
    ]
    # fmt: on

    transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
    self.assertListEqual(transcript, EXPECTED_TRANSCRIPT)
1847
@slow
def test_tiny_timestamp_generation(self):
    """Timestamped generation on ~30s of concatenated audio: check the generated
    ids and the offset (segment) decoding produced by the tokenizer."""
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)

    # Concatenate four samples into one long utterance so that several
    # timestamped segments are produced.
    input_speech = np.concatenate(self._load_datasamples(4))
    input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
        torch_device
    )

    generated_ids = model.generate(input_features, max_length=448, return_timestamps=True).to("cpu")

    # Pinned output ids; values >= 50364 are timestamp tokens interleaved with text.
    EXPECTED_OUTPUT = torch.tensor([50258, 50259, 50359, 50364, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50692, 50692, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50926, 50926, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 51208, 51208, 949, 505, 11, 14138, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51552, 51552, 634, 575, 12525, 22618, 1968, 6144, 35617, 7354, 1292, 6, 589, 307, 534, 10281, 934, 439, 11, 293, 51836, 51836, 50257]) # fmt: skip

    self.assertTrue(torch.allclose(generated_ids, EXPECTED_OUTPUT))

    # Expected output of output_offsets=True decoding: full text plus
    # per-segment (start, end) timestamps in seconds.
    EXPECTED_TRANSCRIPT = [
        {
            "text": (
                " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is"
                " Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season"
                " of the year, with Christmas and roast beef looming before us, similarly drawn from eating and"
                " its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins'"
                " work is really Greek after all, and"
            ),
            "offsets": [
                {
                    "text": (
                        " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."
                    ),
                    "timestamp": (0.0, 6.5600000000000005),
                },
                {
                    "text": " Nor is Mr. Quilter's manner less interesting than his matter.",
                    "timestamp": (6.5600000000000005, 11.24),
                },
                {
                    "text": (
                        " He tells us that at this festive season of the year, with Christmas and roast beef"
                        " looming"
                    ),
                    "timestamp": (11.24, 16.88),
                },
                {
                    "text": (
                        " before us, similarly drawn from eating and its results occur most readily to the mind."
                    ),
                    "timestamp": (16.88, 23.76),
                },
                {
                    "text": (
                        " He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and"
                    ),
                    "timestamp": (23.76, 29.44),
                },
            ],
        }
    ]

    transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True)
    self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
1911
@slow
def test_tiny_token_timestamp_generation(self):
    """Per-token timestamps (cross-attention alignment) must match pinned values
    and have the same shape as the generated sequences."""
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)
    # [layer, head] pairs whose cross-attention weights are used to align
    # tokens to audio time for token-level timestamps.
    model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]

    input_speech = self._load_datasamples(4)
    input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
        torch_device
    )

    generate_outputs = model.generate(
        input_features, max_length=448, return_timestamps=True, return_token_timestamps=True
    )

    # One timestamp per generated token.
    self.assertEqual(generate_outputs.sequences.shape, generate_outputs.token_timestamps.shape)

    # Pinned per-token timestamps in seconds, one row per input sample.
    # fmt: off
    EXPECTED_OUTPUT = torch.tensor([
        [ 0.0000, 0.0000, 0.0000, 0.0000, 0.4800, 0.8200, 0.9600, 1.1200, 1.1200, 1.2200, 1.5000, 1.7200, 2.0000, 2.3400, 2.5000, 2.6600, 3.1800, 3.5600, 3.6800, 3.8000, 4.1000, 4.3000, 4.5800, 4.9400, 5.3800, 12.4200, 12.8400, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9400, 26.9400, 26.9400, 26.9400, 29.8400 ],
        [ 0.0000, 0.0000, 0.0000, 0.0000, 0.5200, 0.9000, 1.1400, 1.4200, 1.5200, 1.6800, 1.6800, 1.8800, 2.1000, 2.2200, 2.6200, 3.1400, 3.5800, 3.9600, 4.4000, 17.3000, 17.3000, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7400, 26.7400, 26.7400, 26.7400, 26.7400, 26.7400, 28.0000 ],
        [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7600, 1.0000, 1.4200, 1.8000, 1.9400, 2.1800, 2.5200, 3.0200, 3.3200, 3.5400, 3.9400, 4.5600, 4.9200, 5.2800, 5.5600, 5.9000, 6.1600, 6.3000, 6.4800, 6.4800, 6.6400, 7.8200, 7.9600, 8.2200, 8.6000, 8.9200, 9.2200, 9.5200, 9.7200, 10.0600, 10.5400, 10.8800, 11.2600, 11.5400, 11.7400, 12.0800, 15.6800, 15.6800],
        [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7400, 1.0400, 1.3200, 1.6800, 2.1400, 2.4800, 2.7800, 3.0800, 3.1600, 3.4000, 3.6000, 4.0200, 4.2200, 4.8600, 5.2400, 5.7400, 6.3400, 6.6200, 6.7600, 6.7600, 6.8600, 7.2400, 7.4200, 7.6800, 7.9200, 8.4800, 8.7600, 9.2000, 9.2000, 9.4200, 15.8200, 15.8200, 29.6400, 29.6600, 29.6600, 29.6600, 29.6600, 29.7600]
    ])
    # fmt: on

    self.assertTrue(torch.allclose(generate_outputs.token_timestamps.to("cpu"), EXPECTED_OUTPUT))
1941
@slow
def test_tiny_token_timestamp_batch_generation(self):
    # Token timestamps must be produced for every returned beam sequence, and
    # the task/language prompt positions must be excluded from them.
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)
    model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]

    num_samples, num_return_sequences = 4, 2

    samples = self._load_datasamples(num_samples)
    features = processor.feature_extractor(raw_speech=samples, return_tensors="pt").input_features
    features = features.to(torch_device)

    outputs = model.generate(
        features,
        max_length=448,
        return_timestamps=True,
        return_token_timestamps=True,
        num_beams=3,
        num_return_sequences=num_return_sequences,
    )

    # task id and lang id prompts should not have timestamp tokens
    self.assertEqual(outputs.sequences.shape[-1] - 2, outputs.token_timestamps.shape[-1])

    self.assertEqual(len(outputs.sequences), num_return_sequences * num_samples)
1971
@slow
def test_tiny_token_timestamp_generation_longform(self):
    """Long-form (>30s) generation with token timestamps: each returned segment
    must carry one timestamp per token, matching pinned values for batch 0."""
    set_seed(0)
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)
    # [layer, head] pairs used for cross-attention timestamp alignment.
    model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]

    input_speech = self._load_datasamples(5)
    long_input_speech = np.concatenate(input_speech, dtype=np.float32)
    inputs = processor.feature_extractor(
        raw_speech=long_input_speech,
        return_tensors="pt",
        truncation=False,  # False so the audio isn't truncated and whole audio is sent to the model
        return_attention_mask=True,
        padding=True,
    )

    inputs = inputs.to(torch_device)
    generate_outputs = model.generate(**inputs, return_segments=True, return_token_timestamps=True)

    # Every segment's token_timestamps must have the same shape as its tokens.
    token_timestamps_shape = [
        [segment["token_timestamps"].shape for segment in segment_list]
        for segment_list in generate_outputs["segments"]
    ]
    tokens_shape = [
        [segment["tokens"].shape for segment in segment_list] for segment_list in generate_outputs["segments"]
    ]
    self.assertListEqual(tokens_shape, token_timestamps_shape)

    # Pinned per-token timestamps (seconds), one tensor per segment of batch 0.
    # fmt: off
    EXPECTED_OUTPUT = [
        torch.tensor([0.0000, 0.4200, 0.8200, 0.9400, 1.1200, 1.1200, 1.2200, 1.5000, 1.7200, 2.0400, 2.3400, 2.5200, 2.6600, 3.2000, 3.4400, 3.5600, 3.6800, 3.8200, 4.1000, 4.3000, 4.5800, 4.9400, 5.4000, 6.3600]),
        torch.tensor([ 6.5400, 6.5400, 6.7400, 6.9600, 7.2600, 7.3400, 7.5800, 7.5800, 7.6400, 7.8400, 8.1000, 8.5000, 9.0000, 9.4800, 9.7200, 10.2600, 11.1000]),
        torch.tensor([11.2200, 11.2200, 11.4200, 11.6600, 12.0800, 12.4400, 12.5800, 12.8400, 13.1800, 13.6800, 14.0000, 14.2200, 14.6200, 14.9800, 15.2200, 15.6000, 15.9400, 16.2000, 16.5600, 16.8400, 16.9800]),
        torch.tensor([16.9800, 16.9800, 17.3200, 18.1600, 18.6400, 18.8600, 19.2800, 19.5600, 19.8800, 20.1800, 20.3800, 20.7200, 21.1600, 21.5400, 21.9000, 22.2000, 22.4200, 22.8600, 23.7000]),
        torch.tensor([23.7000, 23.7000, 23.9400, 24.1800, 24.3800, 24.8400, 25.2800, 25.6600, 25.9200, 26.2600, 26.4000, 26.5800, 26.7600, 27.1400, 27.3800, 28.0400, 28.3800, 28.8200, 29.3400, 29.5200]),
        torch.tensor([29.4400, 29.4400, 29.7000, 30.0800, 30.3800, 30.5400, 30.8200, 31.0600, 31.6600, 31.9200, 32.3000, 32.4800, 32.6200, 33.6800]),
        torch.tensor([33.8000, 33.8000, 33.9800, 33.9800, 34.1800, 34.4400, 34.6200, 35.0000, 35.2200, 35.3200, 35.5600, 35.9200, 36.3800, 36.6200, 36.6600, 36.9600, 37.3400, 37.9800, 38.5800, 38.7200, 38.9800, 39.4400, 39.5800, 39.8000, 40.1200, 40.2600]),
        torch.tensor([40.5200, 40.5200, 40.6200, 41.1000, 41.5400, 41.9200, 42.1000, 42.3200, 42.3200, 43.0600, 44.6000]),
        torch.tensor([44.7000, 44.7000, 44.8600, 44.9400, 45.1400, 45.1400, 45.2800, 45.6200, 45.9000, 46.2600, 47.1600, 47.4800, 47.7400, 48.1000, 48.2800, 48.4000, 48.6200, 48.8400, 49.0400, 49.2800, 49.4800, 49.6600, 49.9400, 50.5400]),
        torch.tensor([50.5400, 50.5400, 50.6600, 50.8800, 51.2400, 51.7200, 52.8400]),
        torch.tensor([52.9600, 52.9600, 53.0400, 53.2600, 53.4200, 53.5800, 53.9200, 54.1200, 54.7200, 54.9400, 55.2600, 55.6200, 55.9800, 56.5600, 56.8000, 56.9200, 57.3600, 57.9200, 58.1800, 58.5000, 58.6400, 58.8200]),
        torch.tensor([58.6800, 58.6800, 59.1400, 59.5400, 59.9200, 60.1600, 60.3800, 60.8200, 61.6200, 62.2600, 75.2000]),
    ]
    # fmt: on

    # Compare batch 0's segments against the pinned reference.
    for segment, exp_segment in zip(generate_outputs["segments"][0], EXPECTED_OUTPUT):
        self.assertTrue(torch.allclose(segment["token_timestamps"], exp_segment))
2021
@slow
def test_tiny_specaugment_librispeech(self):
    """Forward pass with SpecAugment enabled (training mode) matches pinned logits."""
    torch_device = "cpu"
    set_seed(0)
    # Apply SpecAugment
    model = WhisperModel.from_pretrained("openai/whisper-tiny", apply_spec_augment=True)
    # Set model to training mode to enable SpecAugment
    model.train()
    model.to(torch_device)
    input_speech = self._load_datasamples(1)
    feature_extractor = WhisperFeatureExtractor()
    input_features = feature_extractor(input_speech, return_tensors="pt").input_features

    with torch.no_grad():
        logits = model(
            input_features,
            # decoder prompt: <|startoftranscript|><|en|><|transcribe|>
            decoder_input_ids=torch.tensor([[50258, 50259, 50359]]),
            output_hidden_states=False,
            output_attentions=False,
            return_dict=False,
            use_cache=False,
        )

    # Pinned first 30 values of the first decoder position's hidden output.
    # fmt: off
    EXPECTED_LOGITS = torch.tensor(
        [
            0.9362, -4.7105, 5.0879, 3.9642, 1.0013, -6.0096, 4.7285, -3.1847,
            -0.8648, 1.9631, 6.2653, 3.6936, 0.3575, -4.5818, 3.0564, 7.8712,
            2.9951, 0.6848, 9.9497, -2.6638, 1.1571, -6.8546, -1.4333, -7.7584,
            1.1200, 3.9030, 4.4655, -4.4919, -1.1703, 9.6241
        ]
    )
    # fmt: on
    self.assertTrue(torch.allclose(logits[0][0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
2056
@slow
def test_generate_with_prompt_ids(self):
    # Prompting with "Leighton" should steer the spelling of the name in the
    # transcription (Layton -> Leighton); the rest of the text is unchanged.
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)
    sample = self._load_datasamples(4)[-1:]
    features = processor(sample, return_tensors="pt").input_features.to(torch_device)

    ids_without_prompt = model.generate(features)
    prompt_ids = processor.get_prompt_ids("Leighton", return_tensors="pt").to(torch_device)
    ids_with_prompt = model.generate(features, prompt_ids=prompt_ids)

    expected_without_prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>"
    expected_with_prompt = "<|startofprev|> Leighton<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Leighton's work is really Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>"

    decoded_without_prompt = processor.decode(ids_without_prompt[0])
    decoded_with_prompt = processor.decode(ids_with_prompt[0])

    self.assertEqual(decoded_without_prompt, expected_without_prompt)
    self.assertEqual(decoded_with_prompt, expected_with_prompt)
2077
@slow
def test_language_detection(self):
    """``detect_language`` must identify English for the LibriSpeech sample and
    Hindi for the hub-hosted Hindi clip.

    Fix: the original used bare ``assert`` statements, which are stripped under
    ``python -O`` and give no diff on failure; replaced with ``self.assertEqual``
    to match the unittest style used by the other tests in this file.
    """
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)
    input_speech = self._load_datasamples(4)[-1:]
    input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device)

    lang_id = model.detect_language(input_features)[0].item()

    # Invert the generation config's language -> token-id map so the detected
    # id can be compared against a readable language token.
    ids_to_lang = {v: k for k, v in model.generation_config.lang_to_id.items()}

    self.assertEqual(ids_to_lang[lang_id], "<|en|>")

    # Hindi sample from the hub, resampled to Whisper's 16 kHz input rate.
    audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset")

    raw_audio, sr = torchaudio.load(audio)
    input_speech = torchaudio.transforms.Resample(sr, 16_000)(raw_audio).numpy()

    input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device)

    lang_id = model.detect_language(input_features)[0].item()

    self.assertEqual(ids_to_lang[lang_id], "<|hi|>")
2102
@slow
def test_default_multilingual_transcription_short_form(self):
    """Short-form generation should auto-detect Hindi by default, and switch to
    English once ``forced_decoder_ids`` pins the English language token.

    Fix: the original used bare ``assert`` statements, which are stripped under
    ``python -O`` and give no string diff on failure; replaced with
    ``self.assertEqual`` to match the rest of the file.
    """
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)

    audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset")

    raw_audio, sr = torchaudio.load(audio)
    input_speech = torchaudio.transforms.Resample(sr, 16_000)(raw_audio).numpy()

    input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device)

    # model.generation_config.forced_decoder_ids defaults to [1, null] for lang_token
    sequences = model.generate(input_features)

    transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0]

    self.assertEqual(
        transcription,
        "<|startoftranscript|><|hi|><|transcribe|><|notimestamps|> Mirchi mein ki tene vibinda prajatiya hai<|endoftext|>",
    )

    # set forced_decoder_ids to English (50259 is the <|en|> token id)
    model.generation_config.forced_decoder_ids[0][-1] = 50259

    sequences = model.generate(input_features)
    transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0]

    self.assertEqual(
        transcription,
        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> MIRCHI MET, which is the name of the Bible.<|endoftext|>",
    )
2136
@slow
def test_default_multilingual_transcription_long_form(self):
    """Long-form (>30s) generation: Hindi is auto-detected by default; forcing
    the English language token (50259) switches the output to English."""
    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
    model.to(torch_device)

    audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset")

    raw_audio, sr = torchaudio.load(audio)
    input_speech = torchaudio.transforms.Resample(sr, 16_000)(raw_audio)

    # Repeat the clip 10x so the audio exceeds 30s and exercises the
    # long-form generation path (truncation disabled below).
    input_speech = input_speech.repeat(1, 10).numpy()
    input_features = processor(
        input_speech, return_tensors="pt", padding="longest", truncation=False
    ).input_features.to(torch_device)

    # model.generation_config.forced_decoder_ids defaults to [1, null] for lang_token
    sequences = model.generate(input_features)

    transcription = processor.batch_decode(sequences)[0]

    assert transcription == " मिर्ची में कितने विबिन्द प्रजातियां हैं? मिर्ची में कितने विबिन्द प्रजातियां हैं?"

    # set forced_decoder_ids to English
    model.generation_config.forced_decoder_ids[0][-1] = 50259

    sequences = model.generate(input_features)
    transcription = processor.batch_decode(sequences)[0]

    assert (
        transcription
        == " How many different species are there in the chilli? How many different species are there in the chili?"
    )
2170
@slow
def test_generate_with_prompt_ids_and_forced_decoder_ids(self):
    # Prompt ids and explicit task/language arguments must both be reflected
    # in the decoded output: the prompt text and both special tokens appear.
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.to(torch_device)
    sample = self._load_datasamples(1)
    features = processor(sample, return_tensors="pt").input_features.to(torch_device)

    task, language = "translate", "de"
    expected_tokens = [f"<|{task}|>", f"<|{language}|>"]
    prompt = "test prompt"
    prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device)

    output = model.generate(features, task=task, language=language, prompt_ids=prompt_ids)
    text = processor.decode(output[0])

    self.assertTrue(prompt in text)
    self.assertTrue(all(token in text for token in expected_tokens))
2189
@slow
def test_generate_with_prompt_ids_and_no_non_prompt_forced_decoder_ids(self):
    # With all forced decoder ids cleared, the prompt text must still make it
    # into the generated transcription.
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    model.to(torch_device)
    sample = self._load_datasamples(1)
    features = processor(sample, return_tensors="pt").input_features.to(torch_device)
    prompt = "test prompt"
    prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device)

    # Drop the default forced decoder ids on both config objects.
    model.generation_config.forced_decoder_ids = None
    model.config.forced_decoder_ids = None

    output = model.generate(features, prompt_ids=prompt_ids, return_timestamps=True)
    text = processor.decode(output[0])

    self.assertTrue(prompt in text)
2207
@slow
@require_torch_gpu
def test_speculative_decoding_distil(self):
    """Assisted (speculative) decoding with a distil-whisper assistant must match
    the unassisted transcription and be measurably faster.

    Fix: bare ``assert`` statements were replaced by unittest assertions so the
    checks survive ``python -O`` and failures report both compared values; the
    timing message is preserved via ``assertGreater``'s ``msg`` argument.
    """
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v2"
    model = WhisperForConditionalGeneration.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(torch_device)

    processor = WhisperProcessor.from_pretrained(model_id)

    assistant_model_id = "distil-whisper/distil-large-v2"
    assistant_model = WhisperForCausalLM.from_pretrained(
        assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    assistant_model.to(torch_device)

    dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    sample = dataset[0]["audio"]

    input_features = (
        processor(sample["array"], return_tensors="pt").input_features.to(torch_device).to(torch.float16)
    )

    # warm up assisted decoding
    _ = model.generate(input_features, assistant_model=assistant_model)
    # warm up non-assisted decoding
    _ = model.generate(input_features)

    # assisted decoding
    start_time = time.time()
    tokens = model.generate(input_features, assistant_model=assistant_model)
    total_time_assist = time.time() - start_time

    transcription_ass = processor.batch_decode(tokens, skip_special_tokens=True)

    # non-assisted decoding
    start_time = time.time()
    tokens = model.generate(input_features)
    total_time_non_assist = time.time() - start_time

    transcription_non_ass = processor.batch_decode(tokens, skip_special_tokens=True)

    self.assertEqual(transcription_ass, transcription_non_ass)
    self.assertEqual(
        transcription_ass,
        [" Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."],
    )
    self.assertGreater(total_time_non_assist, total_time_assist, "Make sure that assistant decoding is faster")
2257
@slow
@require_torch_gpu
def test_speculative_decoding_non_distil(self):
    """Assisted decoding with a full (non-distil) whisper-tiny assistant must
    match the unassisted transcription and be measurably faster.

    Fix: bare ``assert`` statements were replaced by unittest assertions so the
    checks survive ``python -O`` and failures report both compared values,
    consistent with the distil variant of this test.
    """
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v2"
    model = WhisperForConditionalGeneration.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(torch_device)

    processor = WhisperProcessor.from_pretrained(model_id)

    assistant_model_id = "openai/whisper-tiny"
    assistant_model = WhisperForConditionalGeneration.from_pretrained(
        assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    assistant_model.to(torch_device)

    dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    sample = dataset[0]["audio"]

    input_features = (
        processor(sample["array"], return_tensors="pt").input_features.to(torch_device).to(torch.float16)
    )

    # warm up assisted decoding
    _ = model.generate(input_features, assistant_model=assistant_model)
    # warm up non-assisted decoding
    _ = model.generate(input_features)

    # assisted decoding
    start_time = time.time()
    tokens = model.generate(input_features, assistant_model=assistant_model)
    total_time_assist = time.time() - start_time

    transcription_ass = processor.batch_decode(tokens, skip_special_tokens=True)

    # non-assisted decoding
    start_time = time.time()
    tokens = model.generate(input_features)
    total_time_non_assist = time.time() - start_time

    transcription_non_ass = processor.batch_decode(tokens, skip_special_tokens=True)

    self.assertEqual(transcription_ass, transcription_non_ass)
    self.assertEqual(
        transcription_ass,
        [" Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."],
    )
    self.assertGreater(total_time_non_assist, total_time_assist, "Make sure that assistant decoding is faster")
2307
@slow
def test_whisper_longform_single_batch(self):
    """Long-form transcription of the whole dummy LibriSpeech split in one batch:
    the decoded text must match a pinned transcript, stripping the timestamp
    tokens must reproduce the same text, and timestamps must be non-decreasing."""
    # fmt: off
    EXPECTED_TEXT = [' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter\'s manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton\'s work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell\'s pictures are a sort of up-gards and atom paintings, and Mason\'s exquisite idles are as national as a jingo poem. Mr. Birk at Foster\'s landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampoo or a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes the customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mantelboard. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon\'s body trickling into the tight-lowing cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered his muscles into complete relaxation. Oli\'s heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, The thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I\'m here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you\'re being a fool. out, through his silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry, and victory to the stronger. man who entered the twenties had his own training tricks. They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. had died before during the 20s and death during the last round was in some ways easier than defeat. Breathing deeply, Breon\'s softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent\'s face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that\'s rested aside, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone, and gone for good," answered Polychrom, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with says he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn\'t work too hard, said Shaggy. He doesn\'t work at all. In fact, there\'s nothing he can do in these dominions as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we\'ve turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The middle forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I\'m quite sure he didn\'t. That\'s funny, remarked Betsy thoughtfully. I don\'t believe Anne knew any magic, or she\'d have worked it before. I do not know, confess Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as Virgato used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Virgato\'s discarded ruby crown and holding in his hand to scepter which reggative head so often thrown at his head.']
    # fmt: on

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    model = model.to(torch_device)

    # Concatenate every validation clip into a single long waveform.
    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean")
    one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)

    input_features = processor(one_audio, return_tensors="pt", truncation=False, padding="longest")[
        "input_features"
    ]
    input_features = input_features.to(device=torch_device)

    result = model.generate(input_features, return_timestamps=True)
    decoded = processor.batch_decode(result, skip_special_tokens=True)

    assert decoded == EXPECTED_TEXT

    decoded_with_timestamps = processor.batch_decode(result, skip_special_tokens=True, decode_with_timestamps=True)

    # Removing the <|t.tt|> timestamp tokens must give back the plain text.
    no_timestamp_matches = re.split(r"<\|[\d\.]+\|>", decoded_with_timestamps[0])

    assert ["".join(no_timestamp_matches)] == EXPECTED_TEXT

    timestamp_matches = re.findall(r"<\|[\d\.]+\|>", decoded_with_timestamps[0])

    timestamp_floats = [float(t[2:-2]) for t in timestamp_matches]

    # Timestamps must be monotonically non-decreasing across the whole output.
    is_increasing = all(timestamp_floats[i] <= timestamp_floats[i + 1] for i in range(len(timestamp_floats) - 1))

    assert is_increasing
2344
@slow
def test_whisper_longform_prompt_ids(self):
    """Check how `prompt_ids` steer long-form generation.

    The prompt deliberately misspells "Mr. Quilter" as "Mr. Kilter". With
    `prompt_condition_type="first-segment"` only the opening window is biased
    (so "kilter" appears early but "ruggedo" near the end is mis-transcribed),
    while `"all-segments"` conditions every window (both words come out right).
    """
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    model = model.to(torch_device)

    # Misspelled on purpose: nudge the model toward "Mr. Kilter" instead of "Mr. Quilter".
    prompt_ids = processor.get_prompt_ids("Mr. Kilter, Ruggedo.", return_tensors="pt").to(torch_device)

    dataset = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean")
    validation = dataset["validation"]
    # One long waveform built from every validation clip -> long-form generation path.
    long_audio = np.concatenate([sample["array"] for sample in validation["audio"]], dtype=np.float32)

    first_text = validation[0]["text"].lower()
    last_text = validation[-1]["text"].lower()

    features = processor(long_audio, return_tensors="pt", truncation=False, padding="longest")["input_features"]
    features = features.to(device=torch_device)

    def transcribe(condition_type):
        # Long-form generation with prompting, decoded back to plain text.
        tokens = model.generate(
            features,
            prompt_ids=prompt_ids,
            return_timestamps=True,
            prompt_condition_type=condition_type,
            condition_on_prev_tokens=True,
        )
        return processor.batch_decode(tokens, skip_special_tokens=True)

    decoded_first_segment = transcribe("first-segment")
    decoded_all_segments = transcribe("all-segments")

    # Sanity check: the reference transcripts really contain the words we track.
    assert "quilter" in first_text
    assert "ruggedo" in last_text

    # "first-segment" conditioning changes "quilter" -> "kilter" at the start,
    # but does not transcribe "ruggedo" correctly at the end.
    assert "kilter" in decoded_first_segment[0][: len(first_text)].lower()
    assert "ruggedo" not in decoded_first_segment[0][-len(last_text) :].lower()

    # "all-segments" conditioning gets both "kilter" and "ruggedo" right.
    assert "kilter" in decoded_all_segments[0][: len(first_text)].lower()
    assert "ruggedo" in decoded_all_segments[0][-len(last_text) :].lower()
2394
@slow
def test_whisper_longform_single_batch_prev_cond(self):
    """Long-form transcription of a single concatenated audio, with
    `condition_on_prev_tokens=True` and temperature-fallback generation
    options, pinned against an exact whisper-tiny.en transcript.
    """
    # fmt: off
    EXPECTED_TEXT = [""" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grieved doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite itals are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. When Mr. John Collier gives his sitter a cheerful slap in the back, before he says like a shampooer and a Turkish bath, next man it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. He tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in felicitous grace that many faces are feeling. Unfortunately his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tupper of painting. By Harry Quilter M. A. A man said to the universe, Sir, I exist. Sweat covered Breon's body trickling into the tight-lowing cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retroveilities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you're being a fool. But there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Your man who entered the twenties had his own training tricks. They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Breon's death was in some ways easier than defeat. Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that's rested aside, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggido long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe Anne knew any magic, or she'd have worked it before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it, just as we're good to be used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing Regidos discarded Ruby crown, and holding in his hand to scepter which Regidos had so often thrown at his head."""]
    # fmt: on

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    model = model.to(torch_device)

    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean")
    # All validation clips joined into a single long waveform (long-form path).
    one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)

    # truncation=False + padding="longest" keeps the full-length features
    # instead of clipping to a single 30s window.
    input_features = processor(one_audio, return_tensors="pt", truncation=False, padding="longest")[
        "input_features"
    ]
    input_features = input_features.to(device=torch_device)

    # Temperature tuple plus the three thresholds mirror OpenAI-style fallback
    # heuristics (retry segments at higher temperature when quality checks fail).
    gen_kwargs = {
        "return_timestamps": True,
        "no_speech_threshold": 0.6,
        "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
        "compression_ratio_threshold": 1.35,
        "condition_on_prev_tokens": True,
        "logprob_threshold": -1.0,
    }

    # Non-zero fallback temperatures sample -> seed for a reproducible transcript.
    torch.manual_seed(0)
    result = model.generate(input_features, **gen_kwargs)
    decoded = processor.batch_decode(result, skip_special_tokens=True)

    assert decoded == EXPECTED_TEXT
2427
@slow
def test_whisper_longform_multi_batch(self):
    """Batched long-form generation must match per-sample generation exactly.

    Four differently-sized slices of the same concatenated audio are
    transcribed one-by-one and then together as a padded batch; the batched
    outputs must equal both the single-sample outputs and pinned transcripts.
    """
    # fmt: off
    EXPECTED_TEXT_1 = [" Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing a poster or near the fire, and the ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only unfortunately his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. a Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon's body trickling into the tight-wing cloth that was the only germany war. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were, triggered his muscles into complete relaxation. Oily his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenty's he must have drawn his gun, because the intruder said quickly, but that away you're being a fool. Out there was silence then, and still wondering, Breon was once more asleep. Ten seconds he asked the handler who was needing his aching muscles. a red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were andextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the twenties and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled the mazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue, pre-inscented and new to fifth point was his. Then the powerful twist that's rest of the side, in and under the guard, because you were sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, a cooing dove. He has gone and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, return Calico. Where is my brother now? choir-dshaggy, in the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh, no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe and knew any magic, or she'd have worked it before. I do not know, confess shaggy. True, a great calico. Calico went to the big gong and pounded on it, just as Virgado used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Virgados discarded Ruby Crown, and holding in his hand to scepter, which Virgado had so often thrown at his head. head."]
    EXPECTED_TEXT_2 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker."]
    EXPECTED_TEXT_3 = [" possible. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grieved doubts whether Sir Frederick Layton's work is really greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-guards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath, next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. Under general principles of art, Mr. Quilter writes with equal lucidity. Painting, he tells us, is of a different quality to mathematics and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire. any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tupper of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon's body trickling into the titling cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes. Even to soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered as muscles into complete relaxation. Oily his heart and lungs worked on at a strong measured rate. He was in In reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenty's he must have drawn his gun, because the intruder said quickly, but that away you're being a fool. Out there was silence then, and still wondering, Breon was once more asleep. Ten seconds he asked the handler who was needing his aching muscles. a red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were andextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the twenties and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue, re-insunced it and knew the fifth point was his. Then the powerful twist that's rest of the side, in and under the guard, because you were sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, a cooing dove. He has gone and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled and disgraced, and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now? quared shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. And that's funny, remarked Betsy thoughtfully. I don't believe Anne knew any magic, or she'd have worked it before. I do not know, confess Shaggy. True, a great calico. Calico went to the big gong and pounded on it, just as we're good to have used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the thrown wearing ruggedos discarded ruby crown and holding in his hand to septor which ruggedo had so often thrown at his head."]
    EXPECTED_TEXT_4 = [' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter\'s manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton\'s work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell\'s pictures are a sort of up-gards and atom paintings, and Mason\'s exquisite idles are as national as a jingo poem. Mr. Birk at Foster\'s landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampoo or a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes the customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mantelboard. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon\'s body trickling into the tight-lowing cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered his muscles into complete relaxation. Oli\'s heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, The thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I\'m here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you\'re being a fool. out, through his silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry, and victory to the stronger. man who entered the twenties had his own training tricks. They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. had died before during the 20s and death during the last round was in some ways easier than defeat. Breathing deeply, Breon\'s softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent\'s face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that\'s rested aside, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone, and gone for good," answered Polychrom, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with says he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn\'t work too hard, said Shaggy. He doesn\'t work at all. In fact, there\'s nothing he can do in these dominions as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we\'ve turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The middle forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I\'m quite sure he didn\'t. That\'s funny, remarked Betsy thoughtfully. I don\'t believe Anne knew any magic, or she\'d have worked it before. I do not know, confess Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as Virgato used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Virgato\'s discarded ruby crown and holding in his hand to scepter which reggative head so often thrown at his head.']
    # fmt: on

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    model = model.to(torch_device)

    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean")
    # All validation clips joined into a single long waveform.
    one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
    # Four overlapping slices of different lengths so the batched call must pad.
    audios = []
    audios.append(one_audio[110000:])
    audios.append(one_audio[:800000])
    audios.append(one_audio[80000:])
    audios.append(one_audio[:])

    # First: transcribe each slice on its own (batch size 1), no padding needed.
    decoded_single = []
    for audio in audios:
        inputs = processor(audio, return_tensors="pt", truncation=False)
        inputs = inputs.to(device=torch_device)

        result = model.generate(**inputs, return_timestamps=True)
        decoded_single.append(processor.batch_decode(result, skip_special_tokens=True))

    # Then: transcribe all four together; padding to the longest slice requires
    # an attention mask so padded frames are ignored.
    inputs = processor(
        audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True
    )
    inputs = inputs.to(device=torch_device)

    result = model.generate(**inputs, return_timestamps=True)
    decoded_all = processor.batch_decode(result, skip_special_tokens=True)

    # make sure single & batch is exactly the same
    assert decoded_all[0:1] == decoded_single[0]
    assert decoded_all[1:2] == decoded_single[1]
    assert decoded_all[2:3] == decoded_single[2]
    assert decoded_all[3:4] == decoded_single[3]

    # exact match
    assert decoded_all[0:1] == EXPECTED_TEXT_1
    assert decoded_all[1:2] == EXPECTED_TEXT_2
    assert decoded_all[2:3] == EXPECTED_TEXT_3
    assert decoded_all[3:4] == EXPECTED_TEXT_4
2476
@slow
def test_whisper_longform_multi_batch_prev_cond(self):
    """Long-form transcription with ``condition_on_prev_tokens=True`` (multilingual whisper-tiny).

    Four long audio inputs are built by slicing one concatenation of the dummy
    LibriSpeech validation split; each is generated on its own and checked for
    an exact match against its expected transcript. Unlike the ``.en`` variant
    of this test above, no batched run is compared here.
    """
    # fmt: off
    EXPECTED_TEXT_1 = [" Mr. Quilters manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca. The Nils, pictures are sort of upguards and atom paintings and Mason's exquisite itals are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. On the general principles of art, Mr. Quilters writes with equal lucidity. Painting he tells us is of a different quality to mathematics and finish in art is adding more effect. As for etchings, there are of two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostorer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin, for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many phases of feeling only, unfortunately, his own work never does get good. Mr. Quilters has missed his chance, for he has failed even to make himself the tougher of painting. My hair equal to M.A. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-wing cloth that was the only garment he wore. The cut on his chest still dripping blood. The ache of his overstrain dyes. Even the soaring arena around him with thousands of spectators, retrievalidies not worth thinking about. His instant panic was followed by a small sharp blow, high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance. And brand is the one I must see. Now stand aside. To 20s, he must have drawn his gun because the intruder said quickly. But that away, he'd be no fool. Out, the resoundance then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were inexplicably linked into one. This strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the other hypnotic phrases that triggered the process. In the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our role. Brienne sensed it and knew the fifth point was his. Then the powerful twist that's right to the side, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stoutchanges as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they're asking for you. I begged Ruggano a long ago to send him away, but he would not do so. I also offered to help you run into escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard since shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico, whereas my brother now inquired shaggy in the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked to Bedsey thoughtfully. I don't believe Anne knew any magic or she'd have worked before. I do not know, confessed shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as Ruggano used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing Ruggano's discarded ruby crown. And holding in his hand the scepter which Ruggano had so often thrown at his head."]
    EXPECTED_TEXT_2 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and can discover in it but little of rocky Ithaca. Lennials, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker"]
    EXPECTED_TEXT_3 = [" gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating in its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins work is really Greek after all and can discover in it but little of rocky ithaka. Lennils, pictures, are a sort of upguards and atom paintings and Mason's exquisite itals are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. Under general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics and finish in art is adding more effect. As for etchings, thereof two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostoror. Near the fire, any ornaments spread brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many faces are feeling, only unfortunately his own work never does get good. Mr. Quilter has missed his chance. For he has failed even to make himself the tougher of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat covered Brienne's body trickling into the tight-wing cloth that was the only garment you wore. The cut on his chest still dripping blood. The ache of his overstrained eyes. Even the soaring arena around him with thousands of spectators, retrievalidies not worth thinking about. His instant panic was followed by a small sharp blow, high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered his muscles into complete relaxation. Only his heart and lungs worked on at a strong measured rate. He was in reverie, sliding out on the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The 20s, he must have drawn his gun because the intruder said quickly, but that away here being a fool. Out, there is silence then, and still wondering, Brienne was once more asleep. 10 seconds, he asked the handler who was needing his aching muscles. I've read here at Mountain of a Man with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were anextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne's softly spoke the odd hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled up the maze at the sudden fury of the attack, then smiled. He said it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our ol' Brienne sensed it and knew the fifth point was his. Then the powerful twist that's right to decide, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they're asking for you. I begged Brienne to long ago to send him away, but he would not do so. I also offered to help you brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico, whereas my brother now inquired Shaggy in the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked to bed see you thoughtfully. I don't believe Anne knew any magic or she'd have worked it before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gone and pounded on it, just as we're good or used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gone and then sat in the throne, wearing reggos, discarded ruby crown, and holding in his hand to scepter which reggos hand so often thrown at his head."]
    EXPECTED_TEXT_4 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and can discover in it but little of rocky Ithaca. Lennils, pictures, are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. On the general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, thereof two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostorer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin, for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many phases of feeling only, unfortunately, his own work never does, get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tougher of painting. By Harry Quilter, M.A. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-wing cloth that was the only garment you wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators were trivialities not worth thinking about. His instant panic was followed by a small sharp blow, high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance. And brand is the one I must see. Now stand aside. To 20s, he must have drawn his gun because the intruder said quickly, but that away, he could be no fool. Out, there was silence then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. I've read here at Mountain of a Man, with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the other hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from Irohog. Brienne sensed it and knew the fifth point was his. Then the powerful twist that's for us to decide, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stoutchanges as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they are asking for you. I begged Ruggano a long ago to send him away, but he would not do so. I also offered to help you brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard since shaggy. He doesn't work at all. In fact, there is nothing he can do in these dominions, as well as our nooms, whose numbers are so great that it worries us to keep them all busy. And exactly we've turned Calico, where is my brother now in Quaragejji, in the metal forest? Where is that? The metal forest is in the great donned cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked to Bedzeeth thoughtfully. I don't believe Anne knew any magic or she'd have worked before. I do not know, confessed shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as we're good to have used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing reggos, discarded ruby crown. And holding in his hand to scepter which reggos had so often thrown at his head."]
    # fmt: on

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model = model.to(torch_device)

    # Build four long-form inputs by slicing one concatenation of the whole
    # dummy LibriSpeech validation split (overlapping slices of varying length).
    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean")
    one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
    audios = []
    audios.append(one_audio[110000:])
    audios.append(one_audio[:800000])
    audios.append(one_audio[80000:])
    audios.append(one_audio[:])

    # Generation options for long-form decoding: timestamps, zero temperature,
    # the no-speech / compression-ratio / log-prob fallback thresholds, and
    # conditioning on previously generated tokens.
    gen_kwargs = {
        "return_timestamps": True,
        "no_speech_threshold": 0.6,
        "temperature": 0.0,
        "compression_ratio_threshold": 1.35,
        "condition_on_prev_tokens": True,
        "logprob_threshold": -1.0,
    }

    decoded_single = []
    for audio in audios:
        # truncation=False keeps the full-length features (long-form input).
        inputs = processor(audio, return_tensors="pt", truncation=False)
        inputs = inputs.to(device=torch_device)

        result = model.generate(**inputs, **gen_kwargs)
        decoded_single.append(processor.batch_decode(result, skip_special_tokens=True))

    # exact match
    assert decoded_single[0] == EXPECTED_TEXT_1
    assert decoded_single[1] == EXPECTED_TEXT_2
    assert decoded_single[2] == EXPECTED_TEXT_3
    assert decoded_single[3] == EXPECTED_TEXT_4
2520
@slow
def test_whisper_longform_multi_batch_hard(self):
    """Long-form transcription of 8 real long audios (distil-whisper/meanwhile), whisper-tiny.en.

    Each sample is transcribed individually and then all 8 are transcribed as
    one padded batch; the test asserts (1) batched output equals per-sample
    output and (2) both match the expected transcripts exactly.
    """
    # fmt: off
    EXPECTED_TEXT = [
        " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile.",
        " Folks, I spend a lot of time right over there, night after night after night, actually. Carefully selecting for you the day's noosiest, most aerodynamic headlines, stress testing, and those topical anti-lock breaks and power steering, painstakingly stitching, leather seating so soft, it would make JD power and her associates blush to create the luxury sedan that is my nightly monologue. But sometimes, you sometimes, folks. I lurched a consciousness in the back of an abandoned school and slap myself awake with a crusty floor mat. Before using a mouse-bitten timing belt to strap some old plywood to a couple of discarded oil drums, then by the light of a heathen moon, render a gas tank out of an empty big gulp, fill with white claw and denatured alcohol, then light a match and let her rip and the demented one man soapbox derby of news that is my segment. Me, Guadalupe! No!",
        " Ladies and gentlemen, you know, I spent a lot of time right over there Raising the finest Holstein news cattle firmly yet tenderly milking the latest headlines from their jokes swollen teats Churning the daily stories into the decadent proven-style style triple cream breed that is my nightly monologue But sometimes sometimes folks I stagger home hungry after being released by the police and Root around in the neighbor's trash can for an old milk carton scrape out the blooming dairy residue into the remains of a wet cheese rod I won from a rat in a pre-donned street fight. Put it in a discarded paint can to leave it to ferment next to a trash fire then hunker down and hallucinate while eating the listeria laden demon custard of news that is my segment. You mean one of them.",
        " Folks, if you watch this show, you know I spend most of my time right over there carefully sorting through the day's biggest stories and selecting only the most subtle and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the Ichol Gregoire Ferrandi, who carefully dye them in a palette of bright zesty shades and adorn them in the finest and most topical inlay work using hand tools and double magnifying glasses, then assemble them according to now classic and elegant geometry using our signature saddles stitching. In line it with bees, wax, coated linen, finely attached a mallet, hammered strap, pearled hardware, and close-shit to create for you the one-of-a-kind hoke couture, Erme's Birkin bag that is my monologue. But sometimes, sometimes folks, sometimes. Sometimes I wake up in the last car of an abandoned roller coaster at Coney Island where I'm I'm hiding from the triads. I have some engine lubricants out of a safe way bag and stagger down the shore to tear the sail off a beach schooner. Then I rip the coaxial cable out of an RV and elderly couple from Utah, Hank, and Mabel lovely folks. And use it to stitch the sail into a loose pouch like a rock sack. And I stow away in the back of a garbage truck to the junkyard where I pick through to the debris for only the broken toys that make me the saddest until I have loaded for you. The Hobo Fugitives bug out, bindle of news that is my segment. Me one!",
        " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's biggest stories right over there. Meticulously selecting the most topical chakra affirming scented candles, and using Feng Shui to perfectly align the joke energy in the exclusive boutique yoga retreat that is my monologue. But sometimes just sometimes I go to the dumpster behind the waffle house at three in the morning, take off my shirt, cover myself, and used fry oil, wrap my hands with some double-duct tape by stole from the broken car window. Pound a six-pack of blueberry hard-seltzer and a sack of pills I stole from a parked ambulance. Then arm wrestle a raccoon in the back alley vision quest of news that is my segment. Meanwhile!",
        " You know, folks, I spend most of my time right over there. Mining the day's biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels. Then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest and glory. Then, using the Germanic tradition press-black process, I place thin sheets of foil against the scenes and by hammering or otherwise applying pressure from the back, I project these scenes into a pair of cheat cards in a faceplate and, finally, using fluted strips of white alloyed molding, I divide the designs into framed panels and hold it all together using bronze rivets to create the beautiful and intimidating, Anglo-Saxon battle helm that is my nightly monologue. Sometimes, sometimes folks. Sometimes, just sometimes, I come into my sense as fully naked on the deck of a pirate besieged melee container ship that picked me up floating on the detached door of a portapotty in the Indian Ocean. Then after a sunstroke-induced realization of the crew of this ship plans to sell me an exchange for a bag of oranges to fight off scurvy, I lead a mutiny using only a PVC pipe at a pool chain that accepting my new role as Captain and declaring myself king of the windarc seas. I grab a dirty mop bucket covered in barnacles and adorn it with the teeth of the vanquished to create the sopping wet pirate crown of news that is my segment. Meanwhile!",
        " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's Newsiest most topical flower eggs milk and butter and Stranding into a fine batter to make delicate and informative comedy pancakes Then I glaze them in the juice and zest of the most relevant midnight Valencia oranges and douse it all and a fine Dela main de voyage cognac Before prom baying and basting them tables. I deserve for you the James Beard award worthy crepe suzzette That is my nightly monologue, but sometimes just sometimes folks. I wake up in the baggage hold of Greyhound bus. It's being hoisted by the scrap yard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps and busted open bags of starfruit candies and expired eggs. Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strain, pair of sweatpants and as oven mitts to extract and serve the demented transience poundcake of news that is my segment. Me, Guadalupe!",
        " Folks, if you watched the show and I hope you do, I spent a lot of time right over there. Tiredlessly studying the lineage of the days most important thoroughbred stories and whole-stiner headlines, working with the best trainers, money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen. That is my nightly monologue, but sometimes, sometimes, folks, I break into an unincorporated veterinary genetics lab and grab whatever test tubes I can find and then under a grow light I got from a discarded chia pet. I mixed the pilfered DNA of a horse and whatever was in a tube labeled Keith Colan extra. Slurrying the concoction with caffeine pills and a microwave red bull, I screamed, sang a prayer to Janice, initiator of human life and God of transformation as a half horse, half man, freak. Seizes to life before me and the hideous collection of loose animal parts and corrupted man tissue that is my segment. Meanwhile!"
    ]
    # fmt: on

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    model = model.to(torch_device)

    # Resample the dataset audio to Whisper's 16 kHz input rate.
    ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
    ds = ds.cast_column("audio", Audio(sampling_rate=16000))

    num_samples = 8

    audio = ds[:num_samples]["audio"]
    audios = [x["array"] for x in audio]

    # First pass: transcribe every sample on its own.
    # NOTE(review): the loop variable rebinds the `audio` list above — harmless
    # here since `audios` was already built, but easy to trip over.
    decoded_single = []
    for audio in audios:
        inputs = processor(audio, return_tensors="pt", truncation=False, sampling_rate=16_000)
        inputs = inputs.to(device=torch_device)

        result = model.generate(**inputs, return_timestamps=True)
        decoded_single += processor.batch_decode(result, skip_special_tokens=True)

    # Second pass: all samples in one padded batch (attention mask marks padding).
    inputs = processor(
        audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True
    )
    inputs = inputs.to(device=torch_device)

    result = model.generate(**inputs, return_timestamps=True)
    decoded_all = processor.batch_decode(result, skip_special_tokens=True)

    # Batched decoding must match single-sample decoding and the expected text.
    for i in range(num_samples):
        assert decoded_all[i] == decoded_single[i]
        assert decoded_all[i] == EXPECTED_TEXT[i]
2567
    @slow
    def test_whisper_longform_multi_batch_hard_prev_cond(self):
        """Long-form batched transcription with the OpenAI-style fallback heuristics enabled
        (temperature fallback schedule, no-speech / compression-ratio / logprob thresholds)
        and conditioning on previously generated tokens (`condition_on_prev_tokens=True`).

        Decodes 8 long audio samples from the `distil-whisper/meanwhile` dataset in a single
        padded batch and checks each transcript against a pinned expected string.
        """
        # fmt: off
        EXPECTED_TEXT = [
            " Folks, if you watch the show, you know I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories, developing the central headline pawns, definitely maneuvering an oh-so-topical night to F6, faming of classic Sicilian, named or variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a Fisher show's in lip-nitsky attack that culminates in the elegant lethal slow played all pass on checkmate that is my nightly monologue, but sometimes sometimes folks I sometimes I start a little wake-up side down in the monkey bars of a condemned playground on a super fun site, get all hepped up on goofballs, rummage that would discard a tag bag of defective toys, yank out a fistball of disembodied doll limbs, toss them on a stain kid's place mad from a defunked denies, set up a table inside a rusty cargo container down by the warf and challenge toothless drifters to the godless bughouse blitz of tournament that is my segment.",
            " Folks, I spent a lot of time right over there night after night, actually. Carefully selecting for you the day's newsiest, most aerodynamic headlines, stress testing on those topical anti-lock breaks and power steering, painstakingly stitching, leather seating, so soft, it would make JD power and her associates blush. To create the luxury sedan that is my nightly monologue, but sometimes I just sometimes focus. I lurched to consciousness in the back of an abandoned school bus and slapped myself awake with a crusty floor mat. Before using a mouse-bitten timing belt to strap some old plywood to a couple of discarded oil drums, then by the light of a heathen-moon render a gas tank out of an empty big gulp, filled with white claw and de-natured alcohol, then light a match, letter-ripping the dis-mented one-man soapbox derby of news that is my segment.",
            " Ladies and gentlemen, you know, I spent a lot of time right over there, raising the finest hosting news cattle firmly, yet tenderly milking the latest headlines from their jokes, swollen teats, churning the daily stories into the decadent Provincil style triple cream-breed. It is my nightly monologue, but sometimes sometimes I stagger home hungry after being released by the police and root around in the neighbors trash can for an old milk carton scrape out the blooming dairy residue into the remains of a wet cheese rind I won from a rat and a pre-drawn street fight. Put it into discarded paint can to leave it to ferment next to a trash fire than a hunker down in hallucinate while eating the lusteria latent demon custard of news that is my segment.",
            " Folks, you watched this show, you know I spend most of my time right over there, carefully sorting through the days, big stories, and selecting only the most subtle, and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the Ickel Greg Waferandi, who carefully died them in a pallet of bright, zesty shades, and adorn them in the finest most topical inlay work, using hand tools and double magnifying glasses, then assemble them according to now classic and elegant geometry using our signature saddle stitching, and line it with bees, wax, coated linen, and finally attach a mallet hammered strap, perled hardware, and close-shet to create for you the one of a kind hope, kutur, earn-may is burkin bag that is my monologue, but sometimes, sometimes, sometimes. Sometimes, sometimes I wake up in the last car of an abandoned roller coaster at Kony Island, where I'm hiding from the triads, I have some engine lubricants out of a safe way bag and staggered down the shore to tear the sail off a beach sooner than I ripped the coaxial cable out of an RV and elderly couple from Utah, Hank, and Mabel Lovelyfokes, and use it to stitch the sail into a loose pouch like rock sack, and I stole a bag of a garbage truck to the junkyard, where I picked through to the debris for only the broken toys that make me the saddest, until I have loaded for you. The hobo fugitives bug out Bindle of news that is my segment.",
            " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's big stories right over there. meticulously selecting the most topical chakra affirming scented candles, using Feng Shui, to perfectly align the joke energy in the exclusive boutique yoga retreat that is my monologue, but sometimes just sometimes, I go to the dumpster behind the waffle house at three in the morning, take off my shirt, cover myself and use fry oil, wrap my hands and some old duct tape I stole from a broken car window, pound a six pack of blueberry hard-seller and a second pill, as I stole from a park damsel, and it's then arm wrestle a raccoon in the back alley vision quest of news that is my segment.",
            " You know, folks, I spend most of my time right over there. Mining the days, biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels, then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest and glory. Then, using the Germanic tradition press, black process, I place thin sheets of foil against the scenes and by hammering or otherwise applying pressure from the back, I project these scenes into a pair of cheat cards and a face plate, and finally using fluted strips of white alloyed molding I divide the designs into framed panels and hold it all together using bronze rivets to create the beautiful and intimidating Anglo-Saxon battle helm that is my nightly monologue. Sometimes, sometimes, folks. Sometimes, just sometimes, I come to my senses fully naked on the deck of a pirate, beceived, melee, container ship that picked me up floating on the detainees. Then after I sunstroke in juice, realization of the crew of this ship plans to sell me and exchange for a bag of oranges to fight off scurvy, I lead a mutiny using only a PVC pipe in a pool chain that accepting my new role as captain and declaring myself king of the wind arc seas. I grab a dirty muck bucket covered in barnacles and a dornet with the teeth of the vanquished to create the softening wet pirate crown of news that is my segment. I'm going to use the white paper to create the softened white paper to create the softened white paper to create the softened white pirate crown of news that is my segment. Meanwhile.",
            " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's newsiest, most topical flower eggs, milk and butter. And straining into a fine batter to make delicate and informative comedy pancakes, then I glaze them in the juice and zest of the most relevant midnight valencio oranges. And doubts at all, and I find delimane de voyage cognac, before from bang and basting them tables, I deserve you the James Beard Award worthy creeps to ZET. That is my nightly monologue, but sometimes sometimes folks I wake up in the baggage hole of Greyhound bus, it's being hoisted by the scrapyard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps, busted open bags of starfruit candies and expired eggs. Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strained pair of sweatpants and as ovenmets to extract and serve the demented transients pound cake of news that is my segment.",
            " Folks, if you watch the show and I hope you do, I spend a lot of time right over there. Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines, working with the best trainers money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen that is my nightly monologue. But sometimes sometimes folks I break into an unincorporated veterinary genetics lab. And grab whatever test tubes I can find and then under a grow light I got from it a discarded chia pet. I mixed the pill for DNA of a horse and whatever was in a tube labeled Keith Cole and extra. Sloering the concoction with caffeine pills and a microwave bread bowl, I screamed sing a prayer to Janice initiator of human life and God of transformation as a half horse, half man freak, seasons to life before me. And the hideous collection of loose animal parts and corrupted men tissue that is my segment.",
        ]
        # fmt: on

        # multilingual tiny checkpoint (not the English-only `.en` variant used elsewhere)
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
        model = model.to(torch_device)

        ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
        # resample on the fly to Whisper's expected 16 kHz
        ds = ds.cast_column("audio", Audio(sampling_rate=16000))

        num_samples = 8

        audio = ds[:num_samples]["audio"]
        audios = [x["array"] for x in audio]

        # pad all samples to the longest one; the attention mask tells generate()
        # where the padding is so long-form chunking can stop at the real audio end
        inputs = processor(
            audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True
        )
        inputs = inputs.to(device=torch_device)

        # OpenAI-style fallback knobs: retry decoding at increasing temperature when the
        # compression-ratio / logprob checks fail, and condition on previous-segment tokens
        gen_kwargs = {
            "return_timestamps": True,
            "no_speech_threshold": 0.6,
            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
            "compression_ratio_threshold": 1.35,
            "condition_on_prev_tokens": True,
            "logprob_threshold": -1.0,
            "num_beams": 5,
        }

        # fixed seed: temperature fallback samples, so decoding is otherwise stochastic
        torch.manual_seed(0)
        result = model.generate(**inputs, **gen_kwargs)
        decoded_all = processor.batch_decode(result, skip_special_tokens=True)

        for i in range(num_samples):
            assert decoded_all[i] == EXPECTED_TEXT[i]
2616
2617
def prepare_whisper_encoder_inputs_dict(config, input_features, head_mask=None):
    """Assemble the kwargs dict fed to the encoder-only Whisper models in tests.

    When ``head_mask`` is omitted, an all-ones mask is created on ``torch_device``
    (one entry per attention head in each encoder layer, i.e. no head is pruned).
    """
    mask = head_mask
    if mask is None:
        mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device)
    return {"input_features": input_features, "head_mask": mask}
2622
2623
2624@require_torch
class WhisperEncoderModelTester:
    """Builds tiny Whisper encoder-only configs and random inputs for `WhisperEncoderModelTest`.

    All sizes are kept very small so the common model tests run quickly on CPU.
    """

    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=60,
        is_training=True,
        use_labels=True,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=4,
        input_channels=1,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=20,
        max_source_positions=30,
        num_mel_bins=80,
        num_conv_layers=1,
        suppress_tokens=None,
        begin_suppress_tokens=None,
        classifier_proj_size=4,
        num_labels=2,
        is_encoder_decoder=False,
        is_decoder=False,
    ):
        # `parent` is the unittest.TestCase driving the checks (used for its assert helpers)
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_labels = use_labels
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.input_channels = input_channels
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.num_mel_bins = num_mel_bins
        self.max_position_embeddings = max_position_embeddings
        self.max_source_positions = max_source_positions
        self.num_conv_layers = num_conv_layers
        self.suppress_tokens = suppress_tokens
        self.begin_suppress_tokens = begin_suppress_tokens
        self.classifier_proj_size = classifier_proj_size
        self.num_labels = num_labels
        self.is_encoder_decoder = is_encoder_decoder
        self.is_decoder = is_decoder

    def get_config(self):
        """Return a tiny `WhisperConfig` mirroring this tester's hyper-parameters."""
        return WhisperConfig(
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            input_channels=self.input_channels,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            max_source_positions=self.max_source_positions,
            decoder_ffn_dim=self.hidden_size,
            encoder_ffn_dim=self.hidden_size,
            suppress_tokens=self.suppress_tokens,
            begin_suppress_tokens=self.begin_suppress_tokens,
            classifier_proj_size=self.classifier_proj_size,
            num_labels=self.num_labels,
            is_encoder_decoder=self.is_encoder_decoder,
            is_decoder=self.is_decoder,
        )

    def prepare_config_and_inputs(self):
        """Return (config, inputs_dict) with random mel features of shape (batch, mel_bins, seq_len)."""
        input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length])

        config = self.get_config()
        inputs_dict = prepare_whisper_encoder_inputs_dict(
            config,
            input_features=input_features,
        )
        return config, inputs_dict

    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
        return config, inputs_dict

    def get_subsampled_output_lengths(self, input_lengths):
        """Compute the output length after the encoder's strided convolutional layers."""
        for _ in range(self.num_conv_layers):
            # each conv layer halves the sequence length (stride 2, rounded up)
            input_lengths = (input_lengths - 1) // 2 + 1

        return input_lengths

    @property
    def encoder_seq_length(self):
        # sequence length as seen by the transformer layers, after conv subsampling
        return self.get_subsampled_output_lengths(self.seq_length)

    def create_and_check_model_forward(self, config, inputs_dict, use_weighted_layer_sum=False):
        """Run a forward pass of `WhisperForAudioClassification` and check the logits shape."""
        config.use_weighted_layer_sum = use_weighted_layer_sum
        model = WhisperForAudioClassification(config=config)
        model.to(torch_device).eval()

        input_features = inputs_dict["input_features"]

        with torch.no_grad():
            logits = model(input_features).logits

        # Bug fix: the original `assertTrue(logits.shape, (13, 2))` passed the tuple as the
        # failure *message*, so the check succeeded for any non-empty shape. Assert the real
        # (batch_size, num_labels) classification-logits shape instead.
        self.parent.assertEqual(logits.shape, (self.batch_size, self.num_labels))
2735
2736
@require_torch
class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """Common-test suite for the encoder-only `WhisperForAudioClassification` model.

    Runs the shared mixin test batteries against a tiny random config built by
    `WhisperEncoderModelTester`, with encoder-specific overrides and PT<->Flax
    equivalence checks below.
    """

    all_model_classes = (WhisperForAudioClassification,) if is_torch_available() else ()
    is_encoder_decoder = False
    fx_compatible = False
    test_pruning = False
    test_missing_keys = False

    # name of the main model input (mel features rather than token ids)
    input_name = "input_features"

    def setUp(self):
        self.model_tester = WhisperEncoderModelTester(self)
        self.config_tester = ConfigTester(self, config_class=WhisperConfig)
        self.maxDiff = 3000

    def test_config(self):
        # exercise the shared WhisperConfig sanity checks
        self.config_tester.run_common_tests()

    def test_forward_signature(self):
        # the first positional args of forward() must match the encoder-only interface
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["input_features", "head_mask", "encoder_outputs"]
            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)

    def test_forward_pass(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_forward(*config_and_inputs)

    def test_forward_pass_weighted_layer_sum(self):
        # same forward check, but averaging hidden states across layers
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_forward(*config_and_inputs, use_weighted_layer_sum=True)

    @unittest.skip(reason="Some undefined behavior encountered with tiny versions of this model. Skip for now.")
    def test_cpu_offload(self):
        pass

    @unittest.skip(reason="Some undefined behavior encountered with tiny versions of this model. Skip for now.")
    def test_disk_offload_bin(self):
        pass

    @unittest.skip(reason="Some undefined behavior encountered with tiny versions of this model. Skip for now.")
    def test_disk_offload_safetensors(self):
        pass

    @unittest.skip(reason="Some undefined behavior encountered with tiny versions of this model. Skip for now.")
    def test_model_parallelism(self):
        pass

    # input embeds is meaningless for an encoder-only acoustic model
    def test_inputs_embeds(self):
        pass

    # the equivalent test is passing the encoder outputs directly to the model
    def test_encoder_outputs(self):
        # forward through the full model, then forward again feeding the pre-computed
        # encoder outputs; both paths must produce identical results
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))

            with torch.no_grad():
                outputs = model(**inputs)[0]

            encoder = model.encoder

            # forward the encoder alone with only the args it accepts
            encoder_inputs = {"input_features": inputs["input_features"]}
            del inputs["input_features"]

            if "head_mask" in inputs:
                encoder_inputs["head_mask"] = inputs["head_mask"]
            if "attention_mask" in inputs:
                encoder_inputs["attention_mask"] = inputs["attention_mask"]
            if "output_attentions" in inputs:
                encoder_inputs["output_attentions"] = inputs["output_attentions"]

            with torch.no_grad():
                inputs["encoder_outputs"] = encoder(**encoder_inputs)
                outputs_embeds = model(**inputs)[0]

            self.assertTrue((outputs_embeds == outputs).all())

    # Needs to override as the encoder input embedding is a Conv1d
    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Conv1d))
            model.set_input_embeddings(torch.nn.Conv1d(10, 10, 3))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, torch.nn.Conv1d))

    # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings
    def test_resize_tokens_embeddings(self):
        pass

    @is_pt_flax_cross_test
    def test_equivalence_pt_to_flax(self):
        # Load the PT weights into the Flax twin model and compare outputs, both for the
        # in-memory conversion and for a save_pretrained/from_pretrained round trip.
        # NOTE(review): `jnp` and `convert_pytorch_state_dict_to_flax` are presumably imported
        # at module level under the flax availability guard — not visible in this chunk.
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        init_shape = (1,) + inputs_dict["input_features"].shape[1:]

        for model_class in self.all_model_classes:
            with self.subTest(model_class.__name__):
                fx_model_class_name = "Flax" + model_class.__name__

                if not hasattr(transformers, fx_model_class_name):
                    # no flax model exists for this class
                    # NOTE(review): this `return` exits the whole test, skipping any later
                    # model classes too (a `continue` would only skip this one)
                    return

                # Output all for aggressive testing
                config.output_hidden_states = True
                config.output_attentions = self.has_attentions

                fx_model_class = getattr(transformers, fx_model_class_name)

                # load PyTorch class
                pt_model = model_class(config).eval()
                # Flax models don't use the `use_cache` option and cache is not returned as a default.
                # So we disable `use_cache` here for PyTorch model.
                pt_model.config.use_cache = False

                # load Flax class
                fx_model = fx_model_class(config, input_shape=init_shape, dtype=jnp.float32)

                # make sure only flax inputs are forward that actually exist in function args
                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()

                # prepare inputs
                pt_inputs = self._prepare_for_class(inputs_dict, model_class)

                # remove function args that don't exist in Flax
                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}

                # send pytorch inputs to the correct device
                pt_inputs = {
                    k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items()
                }

                # convert inputs to Flax
                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}

                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
                fx_model.params = fx_state

                # send pytorch model to the correct device
                pt_model.to(torch_device)

                with torch.no_grad():
                    pt_outputs = pt_model(**pt_inputs)
                fx_outputs = fx_model(**fx_inputs)

                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])

                self.assertEqual(fx_keys, pt_keys)
                self.check_pt_flax_outputs(fx_outputs, pt_outputs, model_class)

                # second pass: load the Flax model from the saved PT checkpoint (from_pt=True)
                with tempfile.TemporaryDirectory() as tmpdirname:
                    pt_model.save_pretrained(tmpdirname)
                    fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, input_shape=init_shape, from_pt=True)

                fx_outputs_loaded = fx_model_loaded(**fx_inputs)

                fx_keys = tuple([k for k, v in fx_outputs_loaded.items() if v is not None])
                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])

                self.assertEqual(fx_keys, pt_keys)
                self.check_pt_flax_outputs(fx_outputs_loaded, pt_outputs, model_class)

    @is_pt_flax_cross_test
    def test_equivalence_flax_to_pt(self):
        # Mirror of the test above: load Flax weights into the PT model and compare outputs,
        # including a from_flax=True save/load round trip.
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        init_shape = (1,) + inputs_dict["input_features"].shape[1:]

        for model_class in self.all_model_classes:
            with self.subTest(model_class.__name__):
                fx_model_class_name = "Flax" + model_class.__name__

                if not hasattr(transformers, fx_model_class_name):
                    # no flax model exists for this class
                    # NOTE(review): same early `return` caveat as in test_equivalence_pt_to_flax
                    return

                # Output all for aggressive testing
                config.output_hidden_states = True
                config.output_attentions = self.has_attentions

                fx_model_class = getattr(transformers, fx_model_class_name)

                # load PyTorch class
                pt_model = model_class(config).eval()
                # Flax models don't use the `use_cache` option and cache is not returned as a default.
                # So we disable `use_cache` here for PyTorch model.
                pt_model.config.use_cache = False

                # load Flax class
                fx_model = fx_model_class(config, input_shape=init_shape, dtype=jnp.float32)

                # make sure only flax inputs are forward that actually exist in function args
                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()

                # prepare inputs
                pt_inputs = self._prepare_for_class(inputs_dict, model_class)

                # remove function args that don't exist in Flax
                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}

                # send pytorch inputs to the correct device
                pt_inputs = {
                    k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items()
                }

                # convert inputs to Flax
                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}

                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)

                # make sure weights are tied in PyTorch
                pt_model.tie_weights()

                # send pytorch model to the correct device
                pt_model.to(torch_device)

                with torch.no_grad():
                    pt_outputs = pt_model(**pt_inputs)
                fx_outputs = fx_model(**fx_inputs)

                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])

                self.assertEqual(fx_keys, pt_keys)
                self.check_pt_flax_outputs(fx_outputs, pt_outputs, model_class)

                # second pass: load the PT model from the saved Flax checkpoint (from_flax=True)
                with tempfile.TemporaryDirectory() as tmpdirname:
                    fx_model.save_pretrained(tmpdirname)
                    pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True)

                # send pytorch model to the correct device
                pt_model_loaded.to(torch_device)
                pt_model_loaded.eval()

                with torch.no_grad():
                    pt_outputs_loaded = pt_model_loaded(**pt_inputs)

                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
                pt_keys = tuple([k for k, v in pt_outputs_loaded.items() if v is not None])

                self.assertEqual(fx_keys, pt_keys)
                self.check_pt_flax_outputs(fx_outputs, pt_outputs_loaded, model_class)
2994
2995
class WhisperStandaloneDecoderModelTester:
    """Builds tiny decoder-only Whisper configs and inputs for `WhisperStandaloneDecoderModelTest`.

    The config is the full `WhisperConfig`, but `is_encoder_decoder` is forced to False and
    the encoder-side inputs are stripped so only `input_ids`/`attention_mask` remain.
    """

    def __init__(
        self,
        parent,
        batch_size=2,
        is_training=True,
        use_labels=False,
        vocab_size=200,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=4,
        input_channels=1,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=20,
        max_source_positions=30,
        max_target_positions=40,
        bos_token_id=98,
        eos_token_id=98,
        pad_token_id=0,
        num_mel_bins=80,
        decoder_start_token_id=85,
        num_conv_layers=1,
        suppress_tokens=None,
        begin_suppress_tokens=None,
    ):
        # `parent` is the unittest.TestCase driving the checks (used for its assert helpers)
        self.parent = parent
        self.batch_size = batch_size
        self.is_training = is_training
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.input_channels = input_channels
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.num_mel_bins = num_mel_bins
        self.max_position_embeddings = max_position_embeddings
        self.max_source_positions = max_source_positions
        self.max_target_positions = max_target_positions
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.decoder_start_token_id = decoder_start_token_id
        self.num_conv_layers = num_conv_layers
        self.suppress_tokens = suppress_tokens
        self.begin_suppress_tokens = begin_suppress_tokens

    def prepare_config_and_inputs(self):
        """Return (config, inputs_dict) reduced to decoder-only inputs.

        The mel features are built only to satisfy `prepare_whisper_inputs_dict`'s
        signature and are popped again, together with all head masks; the decoder
        keys are then renamed to the plain `input_ids`/`attention_mask` names.
        """
        input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)

        # fixed 5-token prompt starting with the decoder start token
        decoder_input_ids = torch.tensor(
            self.batch_size * [[self.decoder_start_token_id, 3, 3, 7, 2]], device=torch_device
        )

        config = self.get_config()
        config.is_encoder_decoder = False
        inputs_dict = prepare_whisper_inputs_dict(
            config,
            attention_mask=None,
            input_features=input_features,
            decoder_input_ids=decoder_input_ids,
        )

        inputs_dict.pop("input_features")
        inputs_dict.pop("head_mask")
        inputs_dict.pop("decoder_head_mask")
        inputs_dict.pop("cross_attn_head_mask")

        inputs_dict["attention_mask"] = inputs_dict.pop("decoder_attention_mask")
        inputs_dict["input_ids"] = inputs_dict.pop("decoder_input_ids")
        return config, inputs_dict

    @property
    def encoder_seq_length(self):
        # matches the fixed 5-token decoder prompt above
        return 5

    @property
    def seq_length(self):
        return 5

    def get_config(self):
        """Return a tiny `WhisperConfig` mirroring this tester's hyper-parameters."""
        return WhisperConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            input_channels=self.input_channels,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            max_source_positions=self.max_source_positions,
            max_target_positions=self.max_target_positions,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            decoder_ffn_dim=self.hidden_size,
            encoder_ffn_dim=self.hidden_size,
            decoder_start_token_id=self.decoder_start_token_id,
            suppress_tokens=self.suppress_tokens,
            begin_suppress_tokens=self.begin_suppress_tokens,
        )

    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()

        # force the last position to be padding so padded inputs are exercised
        inputs_dict["input_ids"][:, -1] = self.pad_token_id

        return config, inputs_dict

    def prepare_config_and_inputs_for_decoder(self):
        # NOTE(review): `self.decoder_seq_length` is never defined on this tester, so this
        # helper would raise AttributeError if called — confirm against callers.
        config, input_features = self.prepare_config_and_inputs()
        input_ids = input_features["input_ids"]
        encoder_hidden_states = floats_tensor([self.batch_size, self.decoder_seq_length, self.hidden_size])

        return (config, input_ids, encoder_hidden_states)

    def create_and_check_decoder_model_past(self, config, input_ids):
        """Check that cached (`past_key_values`) decoding matches full re-decoding.

        Runs the prompt once with `use_cache=True`, feeds only the next token with the
        returned cache, and compares a random slice of the hidden states against the
        no-cache forward over the full extended sequence.
        """
        config.use_cache = True
        model = WhisperDecoder(config=config).to(torch_device).eval()
        # first forward pass
        outputs = model(input_ids, use_cache=True)
        outputs_use_cache_conf = model(input_ids)
        outputs_no_past = model(input_ids, use_cache=False)

        # with config.use_cache=True the cache is returned even without the explicit flag
        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)

        past_key_values = outputs["past_key_values"]

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # append to next input_ids and
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)

        output_from_no_past = model(next_input_ids)["last_hidden_state"]
        output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)

    def create_and_check_decoder_model_attention_mask_past(self, config, input_ids):
        """Same cached-vs-full equivalence check, but with half the prompt masked out."""
        model = WhisperDecoder(config=config).to(torch_device).eval()

        # create attention mask
        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

        # mask out the second half of the prompt
        half_seq_length = input_ids.shape[-1] // 2
        attn_mask[:, half_seq_length:] = 0

        # first forward pass
        past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"]

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids — masked positions must not
        # influence the output, so this change should be invisible to the model
        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens

        # append to next input_ids and attn_mask
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        attn_mask = torch.cat(
            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
            dim=1,
        )

        # get two different outputs
        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
        output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
            "last_hidden_state"
        ]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
3188
3189
@require_torch
class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """Common-test suite for the standalone Whisper decoder (`WhisperDecoder`) and
    `WhisperForCausalLM`, driven by `WhisperStandaloneDecoderModelTester`."""

    all_model_classes = (WhisperDecoder, WhisperForCausalLM) if is_torch_available() else ()
    all_generative_model_classes = (WhisperForCausalLM,) if is_torch_available() else ()
    # Bug fix: this flag was misspelled `fx_comptatible`, so the `fx_compatible`
    # attribute the common tester mixin actually reads was never overridden here.
    fx_compatible = False
    test_pruning = False
    is_encoder_decoder = False
    test_missing_keys = False

    def setUp(self):
        self.model_tester = WhisperStandaloneDecoderModelTester(self, is_training=False)
        self.config_tester = ConfigTester(self, config_class=WhisperConfig)

    def test_config(self):
        # exercise the shared WhisperConfig sanity checks
        self.config_tester.run_common_tests()

    def test_decoder_model_past(self):
        # cached decoding must match full re-decoding
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        config, inputs_dict = config_and_inputs

        self.model_tester.create_and_check_decoder_model_past(config=config, input_ids=inputs_dict["input_ids"])

    def test_decoder_model_attn_mask_past(self):
        # same check with half of the prompt masked out
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        config, inputs_dict = config_and_inputs

        self.model_tester.create_and_check_decoder_model_attention_mask_past(
            config=config, input_ids=inputs_dict["input_ids"]
        )

    @unittest.skip("Generate needs input ids")
    def test_generate_without_input_ids(self):
        # generate only works with input ids for whisper
        pass

    @unittest.skip("Decoder can't keep attention grads")
    def test_retain_grad_hidden_states_attentions(self):
        # decoder cannot keep gradients
        return

    @unittest.skip("The model doesn't support fast init from base")
    def test_save_load_fast_init_from_base(self):
        pass

    @unittest.skip("The model doesn't support left padding")  # and it's not used enough to be worth fixing :)
    def test_left_padding_compatibility(self):
        pass
3237