import os
import warnings

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    logging,
    pipeline,
)

# Hide FutureWarning noise raised while the libraries below are in use.
warnings.simplefilter("ignore", category=FutureWarning)

# —— CPU threading limits ——
# Request four threads from the OpenMP / MKL environment knobs and from
# PyTorch's own intra-op thread pool.
# NOTE(review): the environment variables are assigned after torch and numpy
# have already been imported — confirm the native runtimes still honour them.
os.environ.update({"OMP_NUM_THREADS": "4", "MKL_NUM_THREADS": "4"})
torch.set_num_threads(4)

# `logging` here is the transformers logging helper; keep only error output.
logging.set_verbosity_error()

# —— Model setup ——
model_id = "kingabzpro/whisper-base-urdu-full"

# Load the checkpoint from its safetensors weights and immediately replace
# every torch.nn.Linear layer with a dynamically quantized int8 version.
model = torch.quantization.quantize_dynamic(
    AutoModelForSpeechSeq2Seq.from_pretrained(model_id, use_safetensors=True),
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

# The processor supplies the tokenizer and feature extractor for the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline pinned to the CPU (device=-1), configured with
# chunk_length_s=30 and stride_length_s=(5, 5) for chunked inference.
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,
    chunk_length_s=30,
    stride_length_s=(5, 5),
)


# Sentence-final marks after which no additional Urdu full stop is appended.
_SENTENCE_ENDINGS = ("۔", "؟", "?", "!")


def _to_normalized_mono(y):
    """Return *y* as a peak-normalized float32 mono signal.

    Multi-dimensional input is averaged over axis 1 (treated as the channel
    axis). Returns ``None`` when there is nothing usable to transcribe: the
    array is empty, or its peak amplitude is zero or NaN.
    """
    samples = np.asarray(y)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    # Convert before taking abs() so integer PCM cannot overflow on its
    # most negative value.
    samples = samples.astype(np.float32)
    if samples.size == 0:
        # np.max raises ValueError on an empty array; treat it as silence.
        return None
    peak = np.max(np.abs(samples))
    if not peak > 0:  # also rejects a NaN peak
        return None
    return samples / peak


def _ensure_urdu_full_stop(text):
    """Return *text* right-stripped and terminated with sentence punctuation.

    A trailing Latin period is replaced by the Urdu full stop "۔". Text that
    already ends with a sentence-final mark is left alone; otherwise "۔" is
    appended. Empty or whitespace-only text yields an empty string rather
    than a lone full stop.
    """
    text = (text or "").rstrip()
    if not text:
        return ""
    if text.endswith("."):
        return text[:-1] + "۔"
    if text.endswith(_SENTENCE_ENDINGS):
        return text
    return text + "۔"


def transcribe(audio):
    """Transcribe an Urdu recording and return the punctuated text.

    Args:
        audio: ``None`` or a ``(sampling_rate, samples)`` pair, where
            ``samples`` is an array of audio samples (one- or
            multi-dimensional).

    Returns:
        The transcription ending in sentence-final punctuation, an empty
        string if the model produced no text, or a user-facing message when
        no audio was supplied or the audio is empty/silent.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio
    waveform = _to_normalized_mono(y)
    if waveform is None:
        return "Audio appears to be silent. Please try again."

    # Gradients are never needed for inference.
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": waveform})
    return _ensure_urdu_full_stop(result.get("text", ""))


# —— Gradio UI ——
# Centered HTML blurb passed to the interface as its description.
description = (
    "\n"
    "<p style='text-align: center'>\n"
    "Record or upload audio in Urdu and get the transcribed text using the Whisper Base Urdu model.\n"
    "</p>\n"
)

# One single-input example row per bundled sample clip (audio1..audio3).
examples = [[f"samples/audio{index}.mp3"] for index in range(1, 4)]

# Input widget: microphone recording or file upload, delivered as numpy data.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="Record or Upload Audio (Urdu)",
)

# Output widget: text box that receives the transcription.
_text_output = gr.Textbox(
    label="Transcribed Text (Urdu)",
    placeholder="Transcribed Urdu text will appear here...",
)

# Wire the transcribe function to the widgets above; flagging is disabled.
demo = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs=_text_output,
    title="⚡Fast Urdu Speech Recognition",
    description=description,
    examples=examples,
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()