1. First, download and load the models
```python
import torch
import numpy as np
import webrtcvad
import pyaudio
import queue
import threading
from datetime import datetime
from faster_whisper import WhisperModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16

save_directory = "./faster-distil-whiper-large-v3-local"  # replace with your local save path
# en_zh_directory = "./opus-mt-en-zh-local"  # replace with your local save path
en_zh_directory = "./t5-translate-en-ru-zh-base-200-sent-local"  # replace with your local save path

whisperModel = WhisperModel(save_directory, device="cuda", compute_type="float32")
model = T5ForConditionalGeneration.from_pretrained(en_zh_directory)
model.eval()
model.to(device)
tokenizer = T5Tokenizer.from_pretrained(en_zh_directory)

vad = webrtcvad.Vad(3)  # VAD aggressiveness (0-3, 3 is the most aggressive)
prefix = 'translate to zh: '
```
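The loading code above assumes both models already exist at the local paths. If you still need to fetch them, here is a minimal download sketch using `huggingface_hub`; the repo IDs are assumptions inferred from the directory names and should be verified on the Hugging Face Hub before use.

```python
# Minimal download sketch -- the repo IDs below are assumptions, verify them on the Hub
from huggingface_hub import snapshot_download

# CTranslate2 conversion of distil-whisper large-v3, loadable by faster-whisper
snapshot_download(repo_id="Systran/faster-distil-whisper-large-v3",
                  local_dir="./faster-distil-whiper-large-v3-local")

# en/ru/zh T5 translation model (assumed repo ID)
snapshot_download(repo_id="utrobinmv/t5_translate_en_ru_zh_base_200",
                  local_dir="./t5-translate-en-ru-zh-base-200-sent-local")
```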
2. Configure the microphone
```python
# Initialize PyAudio
p = pyaudio.PyAudio()

# Audio stream parameters
FORMAT = pyaudio.paInt16  # 16-bit audio format
CHANNELS = 1              # mono
RATE = 16000              # sample rate (Whisper expects 16 kHz)
FRAME_DURATION = 20       # frame length in ms
CHUNK = int(RATE * FRAME_DURATION / 1000)  # samples per frame
MIN_SILENCE_DURATION = 0.2  # minimum silence duration in seconds
```
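These values are constrained by webrtcvad, which only accepts 10, 20, or 30 ms frames at 8, 16, 32, or 48 kHz; one 20 ms frame here is therefore 320 samples, or 640 bytes of 16-bit PCM. A quick sanity check on a silent frame:

```python
# Sanity check: a 20 ms frame at 16 kHz is CHUNK = 320 int16 samples = 640 bytes
silent_frame = b"\x00" * (CHUNK * 2)      # all-zero PCM, i.e. pure silence
print(vad.is_speech(silent_frame, RATE))  # expected: False
```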
3. Build the shared queue and recording state
```python
# Shared queue for passing audio between the recording and inference threads
audio_queue = queue.Queue()
silence_frames = 0
silence_frames_lock = threading.Lock()
```
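`audio_queue` is already thread-safe, but `silence_frames` is a plain integer that the recording thread increments and the inference thread reads and resets, so every access must go through the lock. Shown in isolation, the access pattern both threads follow is:

```python
# Pattern used by both threads: never touch the shared counter without the lock
def bump_silence():
    global silence_frames
    with silence_frames_lock:
        silence_frames += 1  # read-modify-write must be atomic across threads

def read_and_reset_silence():
    global silence_frames
    with silence_frames_lock:
        value = silence_frames
        silence_frames = 0
    return value
```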
4. Build the recording function
```python
# Recording thread
def record_audio():
    global silence_frames
    stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )
    print("Recording started... press Ctrl+C to stop")
    try:
        while True:
            # Read one frame of audio from the microphone
            data = stream.read(CHUNK)
            audio_data = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
            # Use VAD to detect voice activity
            if vad.is_speech(data, RATE):
                audio_queue.put(audio_data)
                with silence_frames_lock:
                    silence_frames = 0  # reset the silence counter
            else:
                with silence_frames_lock:
                    silence_frames += 1  # increment the silence counter
    except KeyboardInterrupt:
        print("Recording stopped")
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
```
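If `p.open` fails or captures from the wrong microphone, PyAudio can enumerate the available input devices. A small helper for this is sketched below; `list_input_devices` is our own name, not part of the tutorial.

```python
# Hypothetical helper (not from the original post): list available input devices
def list_input_devices(pa: pyaudio.PyAudio) -> None:
    for i in range(pa.get_device_count()):
        info = pa.get_device_info_by_index(i)
        if info.get("maxInputChannels", 0) > 0:
            print(f"[{i}] {info['name']} ({int(info['defaultSampleRate'])} Hz)")

# list_input_devices(p)
# To record from a specific device, pass input_device_index=<i> to p.open(...)
```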
5. Build the transcription and translation function
```python
def process_audio():
    global silence_frames
    audio_buffer = np.array([], dtype=np.float32)
    silence_frames = 0
    while True:
        try:
            # Pull audio from the queue (1-second timeout)
            audio_data = audio_queue.get(timeout=1)
            audio_buffer = np.concatenate((audio_buffer, audio_data))
        except queue.Empty:
            pass

        # Read the silence counter under the lock
        with silence_frames_lock:
            current_silence_frames = silence_frames

        # If silence exceeds the threshold, or the buffer reaches ~4 s of audio,
        # process the accumulated audio
        if (current_silence_frames > MIN_SILENCE_DURATION * (RATE / CHUNK)) or len(audio_buffer) > 320 * 200:
            if len(audio_buffer) > 0:
                segments, _ = whisperModel.transcribe(
                    audio_buffer,
                    vad_filter=True,
                    vad_parameters=dict(min_silence_duration_ms=200),
                    language="en",
                    condition_on_previous_text=True,
                )
                for segment in segments:
                    text = segment.text.strip()
                    if text == "":
                        continue
                    elif text == "Thank you.":
                        # A common Whisper hallucination on near-silence; print without translating
                        print("[%s] %s (%s)" % (str(datetime.now()), "感谢", text))
                    else:
                        src_text = prefix + text
                        input_ids = tokenizer(src_text, return_tensors="pt")
                        generated_tokens = model.generate(**input_ids.to(device))
                        result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
                        print("[%s] %s (%s)" % (str(datetime.now()), result[0], text))

                        # Alternative: use a transformers pipeline instead of calling generate directly
                        # result = pipeline(text)
                        # print("[%s] %s (%s)" % (str(datetime.now()), result[0]['translation_text'], text))

            audio_buffer = np.array([], dtype=np.float32)
            with silence_frames_lock:
                silence_frames = 0
```
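Before wiring up the microphone, the transcribe-then-translate path can be smoke-tested offline, since faster-whisper's `transcribe` also accepts a file path. A minimal sketch, where `sample.wav` is a placeholder name for any 16 kHz English recording:

```python
# Offline smoke test of the same pipeline (sample.wav is a placeholder path)
segments, _ = whisperModel.transcribe("sample.wav", language="en")
for segment in segments:
    inputs = tokenizer(prefix + segment.text.strip(), return_tensors="pt").to(device)
    tokens = model.generate(**inputs)
    print(tokenizer.batch_decode(tokens, skip_special_tokens=True)[0], "|", segment.text)
```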
6. Start the threads
```python
# Start the recording and inference threads
record_thread = threading.Thread(target=record_audio)
process_thread = threading.Thread(target=process_audio)
record_thread.start()
process_thread.start()
```
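Note that with this setup, Ctrl+C raises `KeyboardInterrupt` in the main thread, not inside `record_audio`, so the handler there may never fire. One common variant (a sketch, not the author's code) is to mark the threads as daemons and join them, so that Ctrl+C in the main thread stops the whole program:

```python
# Variant sketch: keep the main thread alive so Ctrl+C stops everything
record_thread = threading.Thread(target=record_audio, daemon=True)
process_thread = threading.Thread(target=process_audio, daemon=True)
record_thread.start()
process_thread.start()
try:
    record_thread.join()
    process_thread.join()
except KeyboardInterrupt:
    print("Exiting")
```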