# transformers (file listing header: 421 lines, 18.7 KB)
# coding=utf-8
# Copyright 2021-2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the SpeechT5 feature extractors."""

import itertools
import random
import unittest

import numpy as np

from transformers import BatchFeature, SpeechT5FeatureExtractor
from transformers.testing_utils import require_torch
from transformers.utils.import_utils import is_torch_available

from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin


# torch is only imported when it is installed; the test classes in this module
# are decorated with @require_torch.
if is_torch_available():
    import torch


# Module-level RNG that floats_list falls back to when no generator is passed.
global_rng = random.Random()


# Adapted from tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Create a nested list of random floats.

    Args:
        shape: Indexable with at least two entries; ``shape[0]`` is the number
            of rows and ``shape[1]`` the number of values per row.
        scale: Multiplier applied to every value drawn from ``rng.random()``.
        rng: Object exposing a ``random()`` method. When ``None`` the
            module-level ``global_rng`` is used.
        name: Unused; accepted so callers that pass it keep working.

    Returns:
        A list of ``shape[0]`` lists, each holding ``shape[1]`` Python floats
        (plain lists, not a tensor or array).
    """
    if rng is None:
        rng = global_rng

    # Rows are filled one after another, so a seeded rng gives a reproducible layout.
    return [[rng.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]


@require_torch
class SpeechT5FeatureExtractionTester(unittest.TestCase):
    """Configuration holder and synthetic-input factory for the SpeechT5 feature extractor tests.

    Stores the keyword arguments handed to the feature extractor and builds
    random waveform-like and spectrogram-like inputs with ``floats_list``.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=1,
        padding_value=0.0,
        sampling_rate=16000,
        do_normalize=True,
        num_mel_bins=80,
        hop_length=16,
        win_length=64,
        win_function="hann_window",
        fmin=80,
        fmax=7600,
        mel_floor=1e-10,
        return_attention_mask=True,
    ):
        """Record the test configuration.

        Args:
            parent: The test case that owns this helper.
            batch_size: Number of items in generated batches.
            min_seq_length: Length of the shortest generated input.
            max_seq_length: Exclusive upper bound for variable-length inputs, and
                the length used for equal-length inputs.
            feature_size, padding_value, sampling_rate, do_normalize, num_mel_bins,
            hop_length, win_length, win_function, fmin, fmax, mel_floor,
            return_attention_mask: Stored as-is and returned by
                ``prepare_feat_extract_dict``.
        """
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # Step between consecutive input lengths. The divisor is clamped to 1 so
        # that a single-item batch does not raise ZeroDivisionError.
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // max(self.batch_size - 1, 1)
        self.feature_size = feature_size
        self.padding_value = padding_value
        self.sampling_rate = sampling_rate
        self.do_normalize = do_normalize
        self.num_mel_bins = num_mel_bins
        self.hop_length = hop_length
        self.win_length = win_length
        self.win_function = win_function
        self.fmin = fmin
        self.fmax = fmax
        self.mel_floor = mel_floor
        self.return_attention_mask = return_attention_mask

    def prepare_feat_extract_dict(self):
        """Return the stored configuration as keyword arguments for the feature extractor."""
        return {
            "feature_size": self.feature_size,
            "padding_value": self.padding_value,
            "sampling_rate": self.sampling_rate,
            "do_normalize": self.do_normalize,
            "num_mel_bins": self.num_mel_bins,
            "hop_length": self.hop_length,
            "win_length": self.win_length,
            "win_function": self.win_function,
            "fmin": self.fmin,
            "fmax": self.fmax,
            "mel_floor": self.mel_floor,
            "return_attention_mask": self.return_attention_mask,
        }

    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        """Build a batch of flat random sequences.

        Args:
            equal_length: When true, return ``batch_size`` sequences of
                ``max_seq_length`` values each; otherwise lengths grow from
                ``min_seq_length`` in steps of ``seq_length_diff``.
            numpify: When true, convert each sequence with ``np.asarray``.

        Returns:
            A list of sequences (lists of floats, or numpy arrays if ``numpify``).
        """
        if equal_length:
            speech_inputs = floats_list((self.batch_size, self.max_seq_length))
        else:
            # Make sure that inputs increase in size. Each item is generated as
            # (length, feature_size) rows and then flattened into one list.
            speech_inputs = [
                list(itertools.chain.from_iterable(floats_list((length, self.feature_size))))
                for length in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]

        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]

        return speech_inputs

    def prepare_inputs_for_target(self, equal_length=False, numpify=False):
        """Build a batch of random 2-D inputs with ``num_mel_bins`` values per row.

        Args:
            equal_length: When true, every item has ``max_seq_length`` rows;
                otherwise row counts grow from ``min_seq_length`` in steps of
                ``seq_length_diff``.
            numpify: When true, convert each item with ``np.asarray``.

        Returns:
            A list of items, each a list of rows (or a numpy array if ``numpify``).
        """
        if equal_length:
            speech_inputs = [floats_list((self.max_seq_length, self.num_mel_bins)) for _ in range(self.batch_size)]
        else:
            # Make sure that inputs increase in size.
            speech_inputs = [
                floats_list((length, self.num_mel_bins))
                for length in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]

        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]

        return speech_inputs


@require_torch
class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for ``SpeechT5FeatureExtractor`` on waveform inputs and ``audio_target`` inputs."""

    feature_extraction_class = SpeechT5FeatureExtractor

    def setUp(self):
        """Create the helper that supplies configuration and synthetic inputs."""
        self.feat_extract_tester = SpeechT5FeatureExtractionTester(self)

    def _check_zero_mean_unit_variance(self, input_vector):
        """Assert that ``input_vector`` has mean ~0 and variance ~1 along axis 0."""
        # abs() on the mean so that a large negative mean is rejected as well.
        self.assertTrue(np.all(np.abs(np.mean(input_vector, axis=0)) < 1e-3))
        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))

    def test_call(self):
        """List inputs and numpy inputs must produce matching ``input_values``."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test not batched input
        encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_zero_mean_unit_variance_normalization_np(self):
        """Normalization check with numpy outputs for each padding strategy."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 1600, None]
        for max_length, padding in zip(max_lengths, paddings):
            processed = feat_extract(speech_inputs, padding=padding, max_length=max_length, return_tensors="np")
            input_values = processed.input_values

            # The real samples must be normalized; anything past the original
            # length of each item must sum to (almost) zero.
            self._check_zero_mean_unit_variance(input_values[0][:800])
            self.assertTrue(abs(input_values[0][800:].sum()) < 1e-6)
            self._check_zero_mean_unit_variance(input_values[1][:1000])
            self.assertTrue(abs(input_values[1][1000:].sum()) < 1e-6)
            self._check_zero_mean_unit_variance(input_values[2][:1200])

    def test_zero_mean_unit_variance_normalization(self):
        """Normalization check without ``return_tensors`` for each padding strategy."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        lengths = range(800, 1400, 200)
        speech_inputs = [floats_list((1, x))[0] for x in lengths]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 1600, None]

        for max_length, padding in zip(max_lengths, paddings):
            processed = feat_extract(speech_inputs, max_length=max_length, padding=padding)
            input_values = processed.input_values

            self._check_zero_mean_unit_variance(input_values[0][:800])
            self._check_zero_mean_unit_variance(input_values[1][:1000])
            self._check_zero_mean_unit_variance(input_values[2][:1200])

    def test_zero_mean_unit_variance_normalization_trunc_np_max_length(self):
        """Normalization check when truncating and padding to ``max_length=1000``."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=1000, padding="max_length", return_tensors="np"
        )
        input_values = processed.input_values

        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1])
        self._check_zero_mean_unit_variance(input_values[2])

    def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):
        """Normalization and output shape when combining truncation with ``padding="longest"``."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=1000, padding="longest", return_tensors="np"
        )
        input_values = processed.input_values

        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1, :1000])
        self._check_zero_mean_unit_variance(input_values[2])

        # make sure that if max_length < longest -> then pad to max_length
        self.assertEqual(input_values.shape, (3, 1000))

        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=2000, padding="longest", return_tensors="np"
        )
        input_values = processed.input_values

        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1, :1000])
        self._check_zero_mean_unit_variance(input_values[2])

        # make sure that if max_length > longest -> then pad to longest
        self.assertEqual(input_values.shape, (3, 1200))

    def test_double_precision_pad(self):
        """``pad`` must return float32 for float64 inputs, for both numpy and torch outputs."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        np_speech_inputs = np.random.rand(100).astype(np.float64)
        py_speech_inputs = np_speech_inputs.tolist()

        for inputs in [py_speech_inputs, np_speech_inputs]:
            np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np")
            self.assertEqual(np_processed.input_values.dtype, np.float32)
            pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt")
            self.assertEqual(pt_processed.input_values.dtype, torch.float32)

    def test_call_target(self):
        """Output rank and feature size for ``audio_target``, plus list/numpy input parity."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test feature size
        input_values = feature_extractor(audio_target=np_speech_inputs, padding=True, return_tensors="np").input_values
        self.assertEqual(input_values.ndim, 3)
        self.assertEqual(input_values.shape[-1], feature_extractor.num_mel_bins)

        # NOTE(review): the calls below pass the audio positionally rather than as
        # ``audio_target`` -- confirm whether the target path was meant to be tested here.

        # Test not batched input
        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_values
        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_values
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

        # Test 2-D numpy arrays are batched.
        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
        np_speech_inputs = np.asarray(speech_inputs)
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_batch_feature_target(self):
        """``BatchFeature`` keeps item lengths and yields a 3-D numpy array for equal-length targets."""
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        # feat_extract_dict is not defined in this class; presumably supplied by the mixin.
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name])))

        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True)
        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np")

        batch_features_input = processed_features[input_name]

        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertEqual(
            batch_features_input.shape,
            (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins),
        )

    @require_torch
    def test_batch_feature_target_pt(self):
        """``BatchFeature`` yields a 3-D torch tensor for equal-length targets."""
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True)
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="pt")

        batch_features_input = processed_features[input_name]

        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertTrue(
            batch_features_input.shape
            == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins)
        )

    @require_torch
    def test_padding_accepts_tensors_target_pt(self):
        """Padding target features must give the same totals for numpy and torch outputs."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
        input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name]

        self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2)

    def test_attention_mask_target(self):
        """The attention mask from ``pad`` must match the padded shape and the true lengths."""
        feat_dict = self.feat_extract_dict
        feat_dict["return_attention_mask"] = True
        feat_extract = self.feature_extraction_class(**feat_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_lengths = [len(x) for x in speech_inputs]
        input_name = feat_extract.model_input_names[0]

        processed = BatchFeature({input_name: speech_inputs})

        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        processed = feat_extract.pad(processed, padding="longest", return_tensors="np")
        self.assertIn("attention_mask", processed)
        self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2]))
        self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lengths)

    def test_attention_mask_with_truncation_target(self):
        """With truncation to the shortest length, every mask row must be fully set."""
        feat_dict = self.feat_extract_dict
        feat_dict["return_attention_mask"] = True
        feat_extract = self.feature_extraction_class(**feat_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_lengths = [len(x) for x in speech_inputs]
        input_name = feat_extract.model_input_names[0]

        processed = BatchFeature({input_name: speech_inputs})
        max_length = min(input_lengths)

        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        processed_pad = feat_extract.pad(
            processed, padding="max_length", max_length=max_length, truncation=True, return_tensors="np"
        )
        self.assertIn("attention_mask", processed_pad)
        self.assertListEqual(
            list(processed_pad.attention_mask.shape), [processed_pad[input_name].shape[0], max_length]
        )
        self.assertListEqual(
            processed_pad.attention_mask[:, :max_length].sum(-1).tolist(), [max_length for x in speech_inputs]
        )

    def _load_datasamples(self, num_samples):
        """Return the ``array`` field of the first ``num_samples`` audio entries, sorted by ``id``."""
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # automatic decoding with librispeech
        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

        return [x["array"] for x in speech_samples]

    def test_integration(self):
        """Compare the first 30 output values for a dataset sample against recorded values."""
        # fmt: off
        EXPECTED_INPUT_VALUES = torch.tensor(
            [2.3804e-03, 2.0752e-03, 1.9836e-03, 2.1057e-03, 1.6174e-03,
             3.0518e-04, 9.1553e-05, 3.3569e-04, 9.7656e-04, 1.8311e-03,
             2.0142e-03, 2.1057e-03, 1.7395e-03, 4.5776e-04, -3.9673e-04,
             4.5776e-04, 1.0071e-03, 9.1553e-05, 4.8828e-04, 1.1597e-03,
             7.3242e-04, 9.4604e-04, 1.8005e-03, 1.8311e-03, 8.8501e-04,
             4.2725e-04, 4.8828e-04, 7.3242e-04, 1.0986e-03, 2.1057e-03]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = SpeechT5FeatureExtractor()
        input_values = feature_extractor(input_speech, return_tensors="pt").input_values
        # assertEqual instead of the assertEquals alias, which Python 3.12 removed.
        self.assertEqual(input_values.shape, (1, 93680))
        self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-6))

    def test_integration_target(self):
        """Compare the first 30 ``audio_target`` output values against recorded values."""
        # fmt: off
        EXPECTED_INPUT_VALUES = torch.tensor(
            [-2.6870, -3.0104, -3.1356, -3.5352, -3.0044, -3.0353, -3.4719, -3.6777,
             -3.1520, -2.9435, -2.6553, -2.8795, -2.9944, -2.5921, -3.0279, -3.0386,
             -3.0864, -3.1291, -3.2353, -2.7444, -2.6831, -2.7287, -3.1761, -3.1571,
             -3.2726, -3.0582, -3.1007, -3.4533, -3.4695, -3.0998]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = SpeechT5FeatureExtractor()
        input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values
        # assertEqual instead of the assertEquals alias, which Python 3.12 removed.
        self.assertEqual(input_values.shape, (1, 366, 80))
        self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))