zhuohany1206 commited on
Commit
b18d8c0
·
1 Parent(s): 8943040

提交信息

Browse files
Files changed (2) hide show
  1. app.py +8 -7
  2. requirements.txt +2 -2
app.py CHANGED
@@ -1,17 +1,18 @@
1
  import gradio as gr
2
  import torch
3
  import soundfile as sf
4
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
5
 
6
  # 模型名称
7
- MODEL_NAME = "openai/whisper-large-v3"
8
 
9
  # 选择设备
10
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
  # 加载处理器和模型
13
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
14
- model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
15
  model.to(DEVICE)
16
  model.eval()
17
 
@@ -20,10 +21,10 @@ def transcribe(audio_file):
20
  try:
21
  # 读取音频
22
  speech, rate = sf.read(audio_file.name)
23
- # 处理输入
24
- inputs = processor(audio=speech, sampling_rate=rate, return_tensors="pt").to(DEVICE)
25
  # 推理生成
26
- generated_ids = model.generate(**inputs)
27
  # 解码
28
  transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
29
  return transcription[0]
@@ -32,7 +33,7 @@ def transcribe(audio_file):
32
 
33
  # Gradio 界面
34
  with gr.Blocks() as demo:
35
- gr.Markdown("## Whisper Large V3 音频转文字 Demo")
36
  with gr.Row():
37
  audio_input = gr.Audio(source="upload", type="file", label="上传音频文件")
38
  output_text = gr.Textbox(label="识别结果")
 
1
  import gradio as gr
2
  import torch
3
  import soundfile as sf
4
+ from transformers import AutoProcessor
5
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
6
 
7
  # 模型名称
8
+ MODEL_NAME = "Qwen/Qwen2-Audio-7B"
9
 
10
  # 选择设备
11
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
  # 加载处理器和模型
14
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
15
+ model = Qwen2ForCausalLM.from_pretrained(MODEL_NAME)
16
  model.to(DEVICE)
17
  model.eval()
18
 
 
21
  try:
22
  # 读取音频
23
  speech, rate = sf.read(audio_file.name)
24
+ # 处理输入 - Qwen2-Audio使用audios参数
25
+ inputs = processor(audios=speech, sampling_rate=rate, return_tensors="pt").to(DEVICE)
26
  # 推理生成
27
+ generated_ids = model.generate(**inputs, max_length=512)
28
  # 解码
29
  transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
30
  return transcription[0]
 
33
 
34
  # Gradio 界面
35
  with gr.Blocks() as demo:
36
+ gr.Markdown("## Qwen2-Audio 7B 音频转文字 Demo")
37
  with gr.Row():
38
  audio_input = gr.Audio(source="upload", type="file", label="上传音频文件")
39
  output_text = gr.Textbox(label="识别结果")
requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
- # 核心依赖 - Space兼容版本
2
  gradio==4.44.0
3
  torch>=2.0.0
4
  torchaudio>=2.0.0
5
- transformers>=4.35.0
6
  accelerate>=0.20.0
7
  soundfile>=0.12.0
8
  librosa>=0.10.0
 
1
+ # 核心依赖 - Qwen2-Audio兼容版本
2
  gradio==4.44.0
3
  torch>=2.0.0
4
  torchaudio>=2.0.0
5
+ transformers>=4.45.0  # Qwen2-Audio support was added in transformers 4.45.0; 4.40 cannot load this model
6
  accelerate>=0.20.0
7
  soundfile>=0.12.0
8
  librosa>=0.10.0