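"""Streaming text-to-speech helpers.

Splits an incoming text stream into sentences and synthesizes each one
through a Kokoro TTS endpoint exposed via a Gradio Space, yielding audio
chunks as (sample_rate, samples) tuples.
"""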
import base64
import io
import time
import wave
from typing import Generator, Iterator

import numpy as np
from gradio_client import Client
from stream2sentence import generate_sentences

from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES

__all__ = ["stream_text_to_speech"]


def stream_text_to_speech(
    text_stream: Iterator[str],
    client: Client,
    voice: str | None = None,
) -> Generator[tuple[int, np.ndarray], None, None]:
"""
Convert text to speech using the specified voice.
Args:
text (str): The text to convert to speech.
voice (str): The voice to use for the conversion. Default to af_heart
Returns:
np.ndarray: The audio as a NumPy array.
"""
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")
    # Kokoro voice names encode their language in the first character
    # (e.g. the "a" in "af_heart"), which maps to a standard language code.
    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]
    for sentence in generate_sentences(text_stream, language=standard_lang_code):
        print(f"Streaming audio for text: {sentence}")
        job = client.submit(
            text=sentence, voice=voice, speed=1, use_gpu=True, api_name="/stream"
        )
        print("Job submitted, waiting for audio chunks...")
        t = time.time()
        for audio_chunk in job:
            yield base64_to_audio_array(audio_chunk)
            print(f"Received audio chunk {audio_chunk[:10]}... in {time.time() - t:.2f} seconds")


def base64_to_audio_array(base64_string: str) -> tuple[int, np.ndarray]:
    # Decode base64 to raw WAV bytes
    audio_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(audio_bytes)

    # Read the WAV header and frames with the stdlib wave module
    with wave.open(buffer, "rb") as wf:
        sample_rate = wf.getframerate()
        n_channels = wf.getnchannels()
        n_frames = wf.getnframes()
        audio_data = wf.readframes(n_frames)

    # Convert bytes to a NumPy array (assumes 16-bit PCM samples)
    audio_array = np.frombuffer(audio_data, dtype=np.int16)

    # Reshape to (frames, channels) for multi-channel audio
    if n_channels > 1:
        audio_array = audio_array.reshape(-1, n_channels)

    # Normalize int16 samples to float32 in [-1.0, 1.0]
    audio_array = audio_array.astype(np.float32) / 32767

    return sample_rate, audio_array
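

# A minimal usage sketch, assuming a Kokoro TTS Space that exposes the
# "/stream" endpoint used above; the Space name and the sample text below
# are illustrative placeholders, not part of this module.
if __name__ == "__main__":
    client = Client("hexgrad/Kokoro-TTS")  # hypothetical Space name
    text_chunks = iter(["Hello there. ", "This is a short streaming demo."])
    for sample_rate, audio in stream_text_to_speech(text_chunks, client, voice="af_heart"):
        print(f"Got {audio.shape[0]} samples at {sample_rate} Hz")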