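"""Streaming text-to-speech helpers.

Splits an incoming text stream into sentences and synthesizes each one
through a Kokoro TTS endpoint exposed via a Gradio Space, yielding audio
chunks as (sample_rate, samples) tuples.
"""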
import base64
import io
import time
import wave
from typing import Generator, Iterator

import numpy as np
from gradio_client import Client
from stream2sentence import generate_sentences

from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES

__all__ = ["stream_text_to_speech"]


def stream_text_to_speech(
    text_stream: Iterator[str],
    client: Client,
    voice: str | None = None,
) -> Generator[tuple[int, np.ndarray], None, None]:
"""
Convert text to speech using the specified voice.
Args:
text (str): The text to convert to speech.
voice (str): The voice to use for the conversion. Default to af_heart
Returns:
np.ndarray: The audio as a NumPy array.
"""
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")
    # Kokoro voice names encode their language in the first character
    # (e.g. the "a" in "af_heart"), which maps to a standard language code.
    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]
    for sentence in generate_sentences(text_stream, language=standard_lang_code):
        print(f"Streaming audio for text: {sentence}")
        job = client.submit(
            text=sentence, voice=voice, speed=1, use_gpu=True, api_name="/stream"
        )
        print("Job submitted, waiting for audio chunks...")
        t = time.time()
        for audio_chunk in job:
            yield base64_to_audio_array(audio_chunk)
            print(f"Received audio chunk {audio_chunk[:10]}... in {time.time() - t:.2f} seconds")


def base64_to_audio_array(base64_string: str) -> tuple[int, np.ndarray]:
    # Decode base64 to raw WAV bytes
    audio_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(audio_bytes)

    # Read the WAV header and frames with the stdlib wave module
    with wave.open(buffer, "rb") as wf:
        sample_rate = wf.getframerate()
        n_channels = wf.getnchannels()
        n_frames = wf.getnframes()
        audio_data = wf.readframes(n_frames)

    # Convert bytes to a NumPy array (assumes 16-bit PCM samples)
    audio_array = np.frombuffer(audio_data, dtype=np.int16)

    # Reshape to (frames, channels) for multi-channel audio
    if n_channels > 1:
        audio_array = audio_array.reshape(-1, n_channels)

    # Normalize int16 samples to float32 in [-1.0, 1.0]
    audio_array = audio_array.astype(np.float32) / 32767

    return sample_rate, audio_array
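

# A minimal usage sketch, assuming a Kokoro TTS Space that exposes the
# "/stream" endpoint used above; the Space name and the sample text below
# are illustrative placeholders, not part of this module.
if __name__ == "__main__":
    client = Client("hexgrad/Kokoro-TTS")  # hypothetical Space name
    text_chunks = iter(["Hello there. ", "This is a short streaming demo."])
    for sample_rate, audio in stream_text_to_speech(text_chunks, client, voice="af_heart"):
        print(f"Got {audio.shape[0]} samples at {sample_rate} Hz")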