"""Gradio demo: CPU-only Urdu speech recognition with a quantized Whisper model."""

import os
import warnings

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    logging,
    pipeline,
)

warnings.simplefilter("ignore", FutureWarning)

# —— CPU performance tweaks ——
# NOTE(review): these variables are assigned after numpy/torch have already
# been imported; confirm the native thread pools still honor them. The
# torch.set_num_threads call below sets PyTorch's thread count explicitly.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"
torch.set_num_threads(4)
logging.set_verbosity_error()

# —— Model setup ——
model_id = "kingabzpro/whisper-base-urdu-full"

# Load and quantize to int8 (dynamic quantization of the Linear layers only)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
)
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

processor = AutoProcessor.from_pretrained(model_id)

# Build a CPU-based pipeline with chunking
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # CPU
    chunk_length_s=30,
    stride_length_s=(5, 5),
)


def _ensure_urdu_full_stop(text):
    """Return *text* without trailing whitespace and ending in the Urdu full stop.

    A trailing ASCII "." is replaced by "۔"; text that already ends in "۔" is
    left alone; anything else gets "۔" appended. Empty or whitespace-only
    input yields "" so the UI never shows a lone full stop.
    """
    text = text.rstrip()
    if not text:
        return ""
    if text.endswith("۔"):
        return text
    if text.endswith("."):
        return text[:-1] + "۔"
    return text + "۔"


def transcribe(audio):
    """Transcribe a Gradio audio input to Urdu text.

    Args:
        audio: ``None`` when nothing was recorded or uploaded, otherwise the
            ``(sample_rate, samples)`` pair produced by ``gr.Audio`` with
            ``type="numpy"``.

    Returns:
        The transcription terminated by "۔", an empty string when the model
        produced no text, or a user-facing message when the input is missing,
        empty, or silent.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio

    # A zero-length recording would make np.max raise ValueError below.
    if y.size == 0:
        return "No audio provided. Please record or upload an audio file."

    # mono & normalize
    if y.ndim > 1:
        y = y.mean(axis=1)
    # astype returns a new array, so the in-place division below does not
    # modify the array handed over by Gradio.
    y = y.astype(np.float32)

    peak = np.max(np.abs(y))
    if not peak > 0:
        return "Audio appears to be silent. Please try again."
    y /= peak

    # Inference under no_grad
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})

    # Add Urdu full stop if not present
    return _ensure_urdu_full_stop(result.get("text") or "")


# —— Gradio UI ——
description = """
Record or upload audio in Urdu and get the transcribed text using the Whisper Base Urdu model.
"""

examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Record or Upload Audio (Urdu)",
    ),
    outputs=gr.Textbox(
        label="Transcribed Text (Urdu)",
        placeholder="Transcribed Urdu text will appear here...",
    ),
    title="⚡Fast Urdu Speech Recognition",
    description=description,
    examples=examples,
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    demo.launch()