import gradio as gr
import librosa
import torch
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

# Model name
MODEL_NAME = "Qwen/Qwen2-Audio-7B"
# Pick a device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the audio-conditioned generation model
# (Qwen2ForCausalLM is the text-only class and cannot consume audio features)
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = Qwen2AudioForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(DEVICE)
model.eval()

# Speech-to-text function
def transcribe(audio_path):
    if audio_path is None:
        return "Please upload an audio file first."
    try:
        # Load and resample the audio to the rate the feature extractor expects (16 kHz)
        speech, rate = librosa.load(
            audio_path, sr=processor.feature_extractor.sampling_rate
        )

        # The base Qwen2-Audio model expects the audio placeholder tokens in the
        # text prompt; the trailing instruction steers it toward transcription
        prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Detect the language and recognize the speech:"

        # Build model inputs - Qwen2-Audio takes text plus the audios argument
        inputs = processor(
            text=prompt, audios=speech, sampling_rate=rate, return_tensors="pt"
        ).to(DEVICE)

        # Run generation
        with torch.inference_mode():
            generated_ids = model.generate(**inputs, max_new_tokens=256)

        # Drop the prompt tokens and decode only the newly generated text
        generated_ids = generated_ids[:, inputs.input_ids.size(1):]
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
        return transcription[0]
    except Exception as e:
        return f"Error: {e}"

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## Qwen2-Audio 7B Speech-to-Text Demo")
    with gr.Row():
        audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload an audio file")
        output_text = gr.Textbox(label="Transcription")
    transcribe_btn = gr.Button("Transcribe")
    transcribe_btn.click(transcribe, inputs=audio_input, outputs=output_text)

# Launch
demo.launch()
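
# --- Usage sketch (assumptions) ---
# A rough way to install dependencies and run this script; the package list is
# inferred from the imports above, the transformers version requirement is
# approximate (Qwen2-Audio support needs a recent release), and "app.py" is a
# hypothetical filename for this file:
#
#   pip install gradio torch librosa "transformers>=4.45"
#   python app.py
#
# Gradio then serves the demo locally, by default at http://127.0.0.1:7860.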