Spaces:
Sleeping
Sleeping
| import torch | |
| import gradio as gr | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| # 加载模型和处理器 | |
def load_model(model_path, use_gpu=True, use_flash_attention_2=False, use_bettertransformer=False):
    """Load the speech-to-text processor and model and move the model to a device.

    Args:
        model_path: Model identifier or local path handed to ``from_pretrained``.
        use_gpu: Run on ``cuda:0`` when CUDA is available; otherwise fall back to CPU.
        use_flash_attention_2: Request the Flash Attention 2 attention implementation.
        use_bettertransformer: Convert the model with ``to_bettertransformer()``.
            Ignored when ``use_flash_attention_2`` is set.

    Returns:
        A tuple ``(processor, model, device, torch_dtype)``.
    """
    # Decide once whether we really run on the GPU; both the device string
    # and the dtype depend on the same condition.
    on_gpu = use_gpu and torch.cuda.is_available()
    device = "cuda:0" if on_gpu else "cpu"
    torch_dtype = torch.float16 if on_gpu else torch.float32

    processor = AutoProcessor.from_pretrained(model_path)

    load_kwargs = {
        "torch_dtype": torch_dtype,
        "low_cpu_mem_usage": True,
        "use_safetensors": True,
    }
    # `use_flash_attention_2=` is deprecated in Transformers; select the
    # implementation through `attn_implementation` and only when asked for,
    # so the default path passes no attention-related argument at all.
    if use_flash_attention_2:
        load_kwargs["attn_implementation"] = "flash_attention_2"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path, **load_kwargs)

    if use_bettertransformer and not use_flash_attention_2:
        model = model.to_bettertransformer()
    model.to(device)
    return processor, model, device, torch_dtype
| # 初始化模型 | |
# Load the fine-tuned Teochew Whisper checkpoint once at start-up; the GPU is
# requested here and load_model decides which device is actually used.
processor, model, device, torch_dtype = load_model(
    r"panlr/whisper-finetune-teochew",
    use_bettertransformer=False,
    use_flash_attention_2=False,
    use_gpu=True,
)
| # 创建推理管道 | |
# Shared speech-recognition pipeline built from the already loaded model and
# the tokenizer / feature extractor of its processor.
infer_pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=128,
)
| # 定义推理函数 | |
def transcribe_audio(audio_path, num_beams=1):
    """Transcribe an audio file with the module-level Whisper pipeline.

    Args:
        audio_path: Path of the audio file to transcribe. ``None`` (no audio
            supplied) yields an empty string instead of a pipeline error.
        num_beams: Beam width forwarded to generation.

    Returns:
        The recognised text, i.e. the ``"text"`` entry of the pipeline result.
    """
    if audio_path is None:
        return ""

    # Look the prompt tokens up by name. `tokenizer.encode(tok)[0]` cannot be
    # used for this: encode() prepends the tokenizer's prefix tokens, so its
    # first element is <|startoftranscript|> no matter which token is passed.
    prompt_ids = processor.tokenizer.convert_tokens_to_ids(
        ["<|zh|>", "<|transcribe|>", "<|notimestamps|>"]
    )
    # Decoder position 0 is the start token itself, so the forced prompt
    # (language, task, no-timestamps) begins at position 1.
    model.generation_config.forced_decoder_ids = list(enumerate(prompt_ids, start=1))

    result = infer_pipe(
        audio_path,
        return_timestamps=False,
        generate_kwargs={"num_beams": num_beams},
    )
    return result["text"]
| # Gradio 界面 | |
def gradio_interface(audio):
    """Gradio callback: forward the submitted audio to transcribe_audio and return its text."""
    transcription = transcribe_audio(audio)
    return transcription
| # 创建 Gradio 界面 | |
# Build the page top to bottom: title, audio input, result box, usage notes,
# submit button. Components appear in the order they are created here.
with gr.Blocks() as interface:
    gr.Markdown(value="## Whisper 潮汕话-正字 语音转录")

    audio_input = gr.Audio(
        label="输入音频",
        type="filepath",
        sources=["microphone", "upload"],
        value="./example.wav",  # default audio file preloaded into the widget
    )
    output_text = gr.Textbox(label="转录结果")

    # Usage notes displayed underneath the input and output widgets.
    gr.Markdown(value="""
📢 **使用说明**
- 本demo部署在CPU上,所以推理速度较慢。对于比较书面的话语,识别效果还不错,对土话、俗话还需要更多的数据。
- 你可以 **上传音频文件** 或 **使用麦克风** 向模型输入。
- 音频文件最好发音清晰、标准。
- 默认提供一个示例音频,你可以直接点击“提交”查看转录效果。
- 示例音频的对应文本: 【状元 林大钦,兵部尚(siên7)书 翁万达,了佮 工部 左侍郎(se6 neng5) 陈一松,拢是 嘉靖 年间 介 进士】
""")

    submit_btn = gr.Button(value="提交")
    submit_btn.click(fn=gradio_interface, inputs=audio_input, outputs=output_text)

# Start the Gradio app.
interface.launch()