llama-index

Форк
0
84 строки · 2.5 Кб
1
"""Bark TTS module."""
2

3
import os
4
import tempfile
5
from typing import Any, Optional
6

7
import numpy as np
8

9
from llama_index.legacy.tts.base import BaseTTS
10

11
# text to be chunked into chunks of 10 words
12
# to avoid hallucination for bark
13
DEFAULT_CHUNK_SIZE = 10  # number of words per chunk handed to bark in one call
14

15

16
class BarkTTS(BaseTTS):
    """Bark TTS.

    Args:
        text_temp: generation temperature (1.0 more diverse,
            0.0 more conservative)
        waveform_temp: generation temperature (1.0 more diverse,
            0.0 more conservative)
        lang_speaker_voice: language speaker voice for audio cloning.
            Used as the history prompt for the first chunk of text.

    """

    def __init__(
        self,
        text_temp: float = 0.7,
        waveform_temp: float = 0.7,
        lang_speaker_voice: Optional[str] = None,
    ) -> None:
        """Init params."""
        super().__init__()

        self.text_temp = text_temp
        self.waveform_temp = waveform_temp
        self.lang_speaker_voice = lang_speaker_voice

    def generate_audio(self, text: str) -> Any:
        """Generate audio from text.

        The text is split on whitespace into chunks of
        ``DEFAULT_CHUNK_SIZE`` words. Each chunk is synthesized separately
        and the resulting arrays are concatenated. From the second chunk on,
        the previous chunk's full generation is saved as a prompt file and
        passed to bark as the history prompt.

        NOTE: return type is Any, but it should be any object that can be fed
        as `data` into IPython.display.Audio(). This includes numpy array, list,
        unicode, str or bytes

        Args:
            text: text to be turned into audio.

        Returns:
            The concatenation of the audio arrays generated for every chunk,
            or an empty array when `text` contains no words.

        Raises:
            ImportError: if the `bark` package is not installed.
        """
        try:
            import bark
        except ImportError as err:
            # Adjacent literals instead of a backslash continuation, which
            # would embed the source indentation in the message.
            raise ImportError(
                "`bark` package not found, "
                "please run `pip install git+https://github.com/suno-ai/bark.git`"
            ) from err

        words = text.split()
        if not words:
            # np.concatenate raises ValueError on an empty list, so empty or
            # whitespace-only text is answered with an empty array instead.
            # NOTE(review): float32 presumably matches bark's output dtype
            # -- confirm.
            return np.array([], dtype=np.float32)

        chunks = [
            " ".join(words[i : i + DEFAULT_CHUNK_SIZE])
            for i in range(0, len(words), DEFAULT_CHUNK_SIZE)
        ]

        full_generation = None
        history_prompt = self.lang_speaker_voice
        audio_chunks = []

        for chunk in chunks:
            # The prompt file only has to exist while bark reads it, so it
            # lives in a temporary directory scoped to this iteration.
            with tempfile.TemporaryDirectory() as tmp_dir:
                if full_generation is not None:
                    prompt_path = os.path.join(tmp_dir, "history_prompt.npz")
                    bark.save_as_prompt(prompt_path, full_generation)
                    history_prompt = prompt_path
                full_generation, audio_array = bark.generate_audio(
                    chunk,
                    history_prompt=history_prompt,
                    text_temp=self.text_temp,
                    waveform_temp=self.waveform_temp,
                    output_full=True,
                )
                audio_chunks.append(audio_array)

        return np.concatenate(audio_chunks)
85

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.