File size: 2,216 Bytes
ddf45bb
f99447f
bc2a0f7
ddf45bb
 
9db8aa5
 
 
 
 
 
 
 
 
 
 
bc2a0f7
37ad554
 
 
bc2a0f7
9db8aa5
 
 
 
 
 
 
 
 
 
 
 
 
37ad554
9db8aa5
 
37ad554
9db8aa5
646ceb9
9db8aa5
ddf45bb
9db8aa5
18c74ed
f99447f
9db8aa5
ddf45bb
f99447f
ddf45bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import io
import time
from typing import Generator, Iterator
import base64
import wave

import numpy as np
from gradio_client import Client
from stream2sentence import generate_sentences

from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES


# Names exported by a star-import of this module; only the streaming entry
# point is listed here.
__all__ = ["stream_text_to_speech"]


def stream_text_to_speech(
    text_stream: Iterator[str],
    client: Client,
    voice: str | None = None,
) -> Generator[tuple[int, np.ndarray], None, None]:
    """
    Stream synthesized speech for an incoming stream of text.

    The text stream is split into sentences with ``generate_sentences``. Each
    sentence is submitted to the ``/stream`` endpoint of ``client`` and every
    audio chunk produced by that job is decoded with
    ``base64_to_audio_array`` and yielded as soon as it is received.

    Args:
        text_stream (Iterator[str]): Iterator of text fragments to speak.
        client (Client): Gradio client used to submit the synthesis jobs.
        voice (str | None): The voice to use for the conversion. Falls back
            to ``"af_heart"`` when ``None`` or empty.

    Yields:
        tuple[int, np.ndarray]: ``(sample_rate, samples)`` for each audio
        chunk, in the order the chunks are received.

    Raises:
        ValueError: If ``voice`` is not one of the values in ``VOICES``.
    """
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")

    # The first character of the voice identifier is used as the key into
    # KOKORO_TO_STD_LANG to obtain the language passed to the sentence splitter.
    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]

    for text in generate_sentences(text_stream, language=standard_lang_code):
        print(f"Streaming audio for text: {text}")
        audio = client.submit(
            text=text, voice=voice, speed=1, use_gpu=True, api_name="/stream"
        )
        print("Job submitted, waiting for audio chunks...")
        # Monotonic clock: elapsed time must not be affected by wall-clock
        # adjustments.
        started = time.perf_counter()
        for audio_chunk in audio:
            # Log BEFORE yielding: code placed after a ``yield`` only runs once
            # the consumer asks for the next item, which would add the
            # consumer's processing time to the reported latency.
            print(f"Received audio chunk: {audio_chunk[:10]} in {time.perf_counter() - started:.2f} seconds")
            yield base64_to_audio_array(audio_chunk)


def base64_to_audio_array(base64_string):
    # Decode base64 to raw WAV bytes
    audio_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(audio_bytes)

    # Read WAV using wave module
    with wave.open(buffer, 'rb') as wf:
        sample_rate = wf.getframerate()
        n_channels = wf.getnchannels()
        n_frames = wf.getnframes()

        audio_data = wf.readframes(n_frames)

    # Convert bytes to NumPy array (assumes int16)
    audio_array = np.frombuffer(audio_data, dtype=np.int16)

    # Reshape for stereo if needed
    if n_channels > 1:
        audio_array = audio_array.reshape(-1, n_channels)

    # Normalize to float32 [-1.0, 1.0]
    audio_array = audio_array.astype(np.float32) / 32767

    return sample_rate, audio_array