Spaces:
Running
Running
| import io | |
| import time | |
| from typing import Generator, Iterator | |
| import base64 | |
| import wave | |
| import numpy as np | |
| from gradio_client import Client | |
| from stream2sentence import generate_sentences | |
| from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES | |
# Public API of this module: only the streaming TTS entry point.
__all__ = ["stream_text_to_speech"]
def stream_text_to_speech(
    text_stream: Iterator[str],
    client: Client,
    voice: str | None = None,
) -> Generator[tuple[int, np.ndarray], None, None]:
    """Stream TTS audio for incrementally arriving text.

    Complete sentences are extracted from *text_stream* as soon as they are
    available and each one is submitted to the Kokoro TTS endpoint, so audio
    playback can begin before the full text has been produced.

    Args:
        text_stream: Iterator of text fragments (e.g. LLM tokens); fragments
            are grouped into sentences before synthesis.
        client: Gradio client connected to the Kokoro TTS space.
        voice: Kokoro voice id. Defaults to "af_heart".

    Yields:
        ``(sample_rate, audio)`` tuples where ``audio`` is a float32 NumPy
        array scaled to [-1.0, 1.0] (1-D for mono, ``(frames, channels)``
        for multi-channel).

    Raises:
        ValueError: If *voice* is not one of the known Kokoro voices.
    """
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")
    # Kokoro voice ids start with a single language letter; map it to a
    # standard language code so sentence segmentation uses the right rules.
    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]
    for text in generate_sentences(text_stream, language=standard_lang_code):
        print(f"Streaming audio for text: {text}")
        audio = client.submit(
            text=text, voice=voice, speed=1, use_gpu=True, api_name="/stream"
        )
        print("Job submitted, waiting for audio chunks...")
        t = time.time()
        for audio_chunk in audio:
            yield base64_to_audio_array(audio_chunk)
            print(f"Received audio chunk: {audio_chunk[:10]} in {time.time() - t:.2f} seconds")
            # Restart the clock so the log reports per-chunk latency rather
            # than cumulative time since the job was submitted.
            t = time.time()
def base64_to_audio_array(base64_string: str) -> tuple[int, np.ndarray]:
    """Decode a base64-encoded WAV payload into a normalized audio array.

    Args:
        base64_string: Base64 text containing a complete WAV file whose
            samples are 16-bit signed PCM (sample width 2 is assumed).

    Returns:
        A ``(sample_rate, audio)`` tuple. ``audio`` is float32 scaled to
        [-1.0, 1.0]; mono audio is 1-D, multi-channel audio has shape
        ``(frames, channels)``.
    """
    # Decode base64 to raw WAV bytes and wrap them for the wave module.
    audio_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(audio_bytes)
    with wave.open(buffer, 'rb') as wf:
        sample_rate = wf.getframerate()
        n_channels = wf.getnchannels()
        n_frames = wf.getnframes()
        audio_data = wf.readframes(n_frames)
    # Interpret the PCM payload as int16 samples (assumes 16-bit PCM).
    audio_array = np.frombuffer(audio_data, dtype=np.int16)
    # WAV stores channels interleaved; reshape to (frames, channels).
    if n_channels > 1:
        audio_array = audio_array.reshape(-1, n_channels)
    # Normalize by 32768 (= -INT16_MIN) so the full int16 range maps into
    # [-1.0, 1.0]; dividing by 32767 would push -32768 slightly below -1.0.
    audio_array = audio_array.astype(np.float32) / 32768.0
    return sample_rate, audio_array