llama-index
84 строки · 2.5 Кб
1"""Bark TTS module."""
2
3import os
4import tempfile
5from typing import Any, Optional
6
7import numpy as np
8
9from llama_index.legacy.tts.base import BaseTTS
10
# Text is split into chunks of 10 words each
# to avoid hallucination by Bark.
13DEFAULT_CHUNK_SIZE = 10
14
15
class BarkTTS(BaseTTS):
    """Bark TTS.

    Args:
        text_temp: generation temperature (1.0 more diverse,
            0.0 more conservative)
        waveform_temp: generation temperature (1.0 more diverse,
            0.0 more conservative)
        lang_speaker_voice: language speaker voice for audio cloning.
            Used as the history prompt for the first chunk of text.

    """

    def __init__(
        self,
        text_temp: float = 0.7,
        waveform_temp: float = 0.7,
        lang_speaker_voice: Optional[str] = None,
    ) -> None:
        """Init params."""
        super().__init__()

        self.text_temp = text_temp
        self.waveform_temp = waveform_temp
        self.lang_speaker_voice = lang_speaker_voice

    def generate_audio(self, text: str) -> Any:
        """Generate audio from text.

        The text is split on whitespace into chunks of ``DEFAULT_CHUNK_SIZE``
        words. Chunks are generated one after another; from the second chunk
        on, the full generation of the previous chunk is saved as a prompt
        file and passed to bark as the history prompt.

        NOTE: return type is Any, but it should be any object that can be fed
        as `data` into IPython.display.Audio(). This includes numpy array, list,
        unicode, str or bytes

        Args:
            text: text to be turned into audio.

        Returns:
            The per-chunk audio arrays joined with ``np.concatenate``.

        Raises:
            ImportError: if the `bark` package is not installed.
            ValueError: if `text` is empty or contains only whitespace.
        """
        try:
            import bark
        except ImportError as err:
            # Implicit concatenation keeps source indentation out of the message.
            raise ImportError(
                "`bark` package not found, "
                "please run `pip install git+https://github.com/suno-ai/bark.git`"
            ) from err

        words = text.split()
        if not words:
            # Without this guard np.concatenate([]) fails with an opaque
            # "need at least one array to concatenate" error.
            raise ValueError(
                "`text` must contain at least one word to generate audio."
            )

        chunks = [
            " ".join(words[i : i + DEFAULT_CHUNK_SIZE])
            for i in range(0, len(words), DEFAULT_CHUNK_SIZE)
        ]

        full_generation = None
        history_prompt = self.lang_speaker_voice
        audio_chunks = []

        for chunk in chunks:
            # The prompt file only has to exist for the duration of this
            # chunk's generate_audio call, so a per-chunk temp dir suffices.
            with tempfile.TemporaryDirectory() as d:
                if full_generation is not None:
                    # Feed the previous chunk's generation back in as the
                    # history prompt for this chunk.
                    f = os.path.join(d, "history_prompt.npz")
                    bark.save_as_prompt(f, full_generation)
                    history_prompt = f
                full_generation, audio_array = bark.generate_audio(
                    chunk,
                    history_prompt=history_prompt,
                    text_temp=self.text_temp,
                    waveform_temp=self.waveform_temp,
                    output_full=True,
                )
                audio_chunks.append(audio_array)

        return np.concatenate(audio_chunks)
85