zhuohany1206 commited on
Commit
b18d8c0
·
1 Parent(s): 8943040

提交信息

Browse files
Files changed (2) hide show
  1. app.py +8 -7
  2. requirements.txt +2 -2
app.py CHANGED
@@ -1,17 +1,18 @@
1
  import gradio as gr
2
  import torch
3
  import soundfile as sf
4
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
5
 
6
  # 模型名称
7
- MODEL_NAME = "openai/whisper-large-v3"
8
 
9
  # 选择设备
10
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
  # 加载处理器和模型
13
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
14
- model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
15
  model.to(DEVICE)
16
  model.eval()
17
 
@@ -20,10 +21,10 @@ def transcribe(audio_file):
20
  try:
21
  # 读取音频
22
  speech, rate = sf.read(audio_file.name)
23
- # 处理输入
24
- inputs = processor(audio=speech, sampling_rate=rate, return_tensors="pt").to(DEVICE)
25
  # 推理生成
26
- generated_ids = model.generate(**inputs)
27
  # 解码
28
  transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
29
  return transcription[0]
@@ -32,7 +33,7 @@ def transcribe(audio_file):
32
 
33
  # Gradio 界面
34
  with gr.Blocks() as demo:
35
- gr.Markdown("## Whisper Large V3 音频转文字 Demo")
36
  with gr.Row():
37
  audio_input = gr.Audio(source="upload", type="file", label="上传音频文件")
38
  output_text = gr.Textbox(label="识别结果")
 
1
  import gradio as gr
2
  import torch
3
  import soundfile as sf
4
+ from transformers import AutoProcessor
5
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
6
 
7
  # 模型名称
8
+ MODEL_NAME = "Qwen/Qwen2-Audio-7B"
9
 
10
  # 选择设备
11
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
  # 加载处理器和模型
14
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
15
+ model = Qwen2ForCausalLM.from_pretrained(MODEL_NAME)
16
  model.to(DEVICE)
17
  model.eval()
18
 
 
21
  try:
22
  # 读取音频
23
  speech, rate = sf.read(audio_file.name)
24
+ # 处理输入 - Qwen2-Audio使用audios参数
25
+ inputs = processor(audios=speech, sampling_rate=rate, return_tensors="pt").to(DEVICE)
26
  # 推理生成
27
+ generated_ids = model.generate(**inputs, max_length=512)
28
  # 解码
29
  transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
30
  return transcription[0]
 
33
 
34
  # Gradio 界面
35
  with gr.Blocks() as demo:
36
+ gr.Markdown("## Qwen2-Audio 7B 音频转文字 Demo")
37
  with gr.Row():
38
  audio_input = gr.Audio(source="upload", type="file", label="上传音频文件")
39
  output_text = gr.Textbox(label="识别结果")
requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
- # 核心依赖 - Space兼容版本
2
  gradio==4.44.0
3
  torch>=2.0.0
4
  torchaudio>=2.0.0
5
- transformers>=4.35.0
6
  accelerate>=0.20.0
7
  soundfile>=0.12.0
8
  librosa>=0.10.0
 
1
+ # 核心依赖 - Qwen2-Audio兼容版本
2
  gradio==4.44.0
3
  torch>=2.0.0
4
  torchaudio>=2.0.0
5
+ transformers>=4.45.0  # Qwen2-Audio support was added in transformers 4.45.0; 4.40 cannot load this model
6
  accelerate>=0.20.0
7
  soundfile>=0.12.0
8
  librosa>=0.10.0