It works:

Just replace the keys inside and the whole thing runs end to end.

How to find the keys: see the CSDN post 【保姆级教程】如何在azure里快速找到openai的key和demo (a step-by-step guide to locating the OpenAI key and demo in Azure).
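To keep the keys out of the source files, one option is to read them from environment variables before constructing the clients. A minimal sketch, using variable names of my own choosing (azure_openai_client.py below only actually reads AZURE_OPENAI_API_KEY; the other names are purely illustrative):

import os

# Assumed variable names -- only AZURE_OPENAI_API_KEY is read by
# azure_openai_client.py; the speech scripts below hardcode their key instead.
openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]          # e.g. https://<resource>.openai.azure.com/
openai_api_key = os.environ["AZURE_OPENAI_API_KEY"]
speech_key = os.environ["AZURE_SPEECH_KEY"]
speech_region = os.environ.get("AZURE_SPEECH_REGION", "eastus")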
Code structure:
azure_openai_client.py  -- wrapper around the Azure OpenAI chat completions API
main.py                 -- menu-driven entry point (speech-to-text, text-to-speech, AI voice chat)
prompts_config.py       -- system prompt configuration
speech_utils.py         -- Azure Speech SDK helpers (the SpeechService class)
stt01.py                -- standalone speech-to-text demo
tts01.py                -- standalone text-to-speech demo
azure_openai_client.py
import os
import base64
from openai import AzureOpenAI
from typing import List, Dict, Optional, Union


class AzureOpenAIClient:
    def __init__(
        self,
        endpoint: str = "replace with your endpoint",
        deployment: str = "your deployment name",
        api_key: Optional[str] = None,
        system_prompt: str = None
    ):
        """
        Initialize the Azure OpenAI client.

        Args:
            endpoint: Azure OpenAI service endpoint
            deployment: deployment name
            api_key: API key; if None, it is read from the environment variable
            system_prompt: system prompt; if None, a default prompt is used
        """
        self.endpoint = endpoint
        self.deployment = deployment
        self.api_key = api_key or os.getenv(
            "AZURE_OPENAI_API_KEY",
            "replace with your key"
        )

        self.client = AzureOpenAI(
            azure_endpoint=self.endpoint,
            api_key=self.api_key,
            api_version="2024-05-01-preview"
        )

        # Use the supplied system prompt, or fall back to the default one
        default_prompt_text = "你是一个帮助用户查找信息的 AI 助手。"
        if system_prompt:
            default_prompt_text = system_prompt

        self.default_chat_prompt = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": default_prompt_text
                    }
                ]
            }
        ]

    def encode_image(self, image_path: str) -> str:
        """
        Encode an image file as a base64 string.

        Args:
            image_path: path to the image

        Returns:
            base64-encoded image string
        """
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('ascii')

    def chat_completion(
        self,
        messages: Optional[List[Dict]] = None,
        max_tokens: int = 200,
        temperature: float = 0.7,
        top_p: float = 0.95,
        frequency_penalty: float = 0,
        presence_penalty: float = 0,
        stop: Optional[Union[str, List[str]]] = None,
        stream: bool = False
    ):
        """
        Create a chat completion.

        Args:
            messages: list of chat messages; if None, the default prompt is used
            max_tokens: maximum number of tokens to generate
            temperature: sampling temperature
            top_p: nucleus sampling probability
            frequency_penalty: frequency penalty
            presence_penalty: presence penalty
            stop: stop sequence(s)
            stream: whether to stream the response

        Returns:
            the chat completion response
        """
        if messages is None:
            messages = self.default_chat_prompt

        completion = self.client.chat.completions.create(
            model=self.deployment,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            stop=stop,
            stream=stream
        )

        return completion
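For reference, a minimal usage sketch of this client; the endpoint and deployment values are placeholders, and the question is only an example:

from azure_openai_client import AzureOpenAIClient

client = AzureOpenAIClient(
    endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
    deployment="<your-deployment-name>"                    # placeholder
)

messages = [
    {"role": "system", "content": [{"type": "text", "text": "你是一个帮助用户查找信息的 AI 助手。"}]},
    {"role": "user", "content": [{"type": "text", "text": "你好,请简单介绍一下自己。"}]},
]

response = client.chat_completion(messages=messages, max_tokens=100)
print(response.choices[0].message.content)

If no messages are passed, chat_completion() falls back to default_chat_prompt, which contains only the system message.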
main.py
import time

from speech_utils import SpeechService
from azure_openai_client import AzureOpenAIClient
from prompts_config import get_system_prompt


def main():
    # Create the SpeechService instance
    speech_service = SpeechService(
        speech_key="replace with your speech key",
        service_region="your resource region, e.g. eastus"
    )

    while True:
        print("\n=== 功能菜单 ===")
        print("1. 语音转文字")
        print("2. 文字转语音")
        print("3. AI 语音对话")
        print("0. 退出")

        choice = input("请选择功能 (0-3): ")

        if choice == "0":
            print("感谢使用,再见!")
            break
        elif choice == "1":
            print("\n=== 语音转文字 ===")
            print("支持的语言:中文、英语、日语")
            print("请说话...")

            # Record the start time
            start_time = time.time()

            success, result = speech_service.speech_to_text(
                languages=["zh-CN", "en-US", "ja-JP"]
            )

            # Report how long recognition took
            elapsed_time = time.time() - start_time
            print(f"\n语音识别耗时: {elapsed_time:.2f}秒")

            if success:
                print(f"识别结果: {result['text']}")
                if result['detected_language']:
                    print(f"检测到的语言: {result['detected_language']}")

                if input("\n是否要将识别的文字转换为语音?(y/n): ").lower() == 'y':
                    # Record the text-to-speech start time
                    tts_start_time = time.time()

                    success, message = speech_service.text_to_speech(result['text'])

                    # Report how long synthesis took
                    tts_elapsed_time = time.time() - tts_start_time
                    print(f"文字转语音耗时: {tts_elapsed_time:.2f}秒")
                    print(message)
            else:
                print(f"错误: {result}")

        elif choice == "2":
            print("\n=== 文字转语音 ===")
            print("可选择的语音:")
            print("1. 中文女声 (zh-CN-XiaoxiaoNeural)")
            print("2. 中文男声 (zh-CN-YunxiNeural)")
            print("3. 英文女声 (en-US-AriaNeural)")

            voice_choice = input("请选择语音 (1-3,默认1): ").strip()
            voice_map = {
                "1": "zh-CN-XiaoxiaoNeural",
                "2": "zh-CN-YunxiNeural",
                "3": "en-US-AriaNeural"
            }
            voice_name = voice_map.get(voice_choice, "zh-CN-XiaoxiaoNeural")

            text = input("\n请输入要转换为语音的文字: ")

            # Record the start time
            start_time = time.time()

            success, message = speech_service.text_to_speech(text, voice_name=voice_name)

            # Report how long synthesis took
            elapsed_time = time.time() - start_time
            print(f"文字转语音耗时: {elapsed_time:.2f}秒")
            print(message)

        elif choice == "3":
            voice_chat()
        else:
            print("\n无效的选择,请重试。")

        time.sleep(1)


def voice_chat():
    # Initialize the speech service
    speech_service = SpeechService(
        speech_key="replace with your speech key",
        service_region="your resource region, e.g. eastus"
    )

    # Choose the conversation language
    print("\n请选择对话语言:")
    print("1. 中文")
    print("2. English")
    lang_choice = input("请选择 (1/2): ")

    language = "zh-CN" if lang_choice == "1" else "en-US"

    # Get the system prompt configured for that language
    system_prompt = get_system_prompt(language)

    # Create the AI client with the system prompt
    ai_client = AzureOpenAIClient(system_prompt=system_prompt)

    # Seed the message history with the system prompt
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        }
    ]

    print("\n=== AI 语音对话开始 ===")
    print("输入 's' 开始对话,输入 'q' 结束对话")

    while True:
        command = input("\n请输入命令 (s: 开始对话, q: 退出): ")
        if command.lower() == 'q':
            break
        elif command.lower() == 's':
            print("\n开始对话模式...")
            print("系统会在AI回复完成后才开始检测您的语音")
            print("说 '再见' 或 'goodbye' 结束对话")

            continue_dialog = True
            while continue_dialog:
                # Prompt the user to speak
                print("\n请开始说话...")

                # Run one round of speech recognition
                success, result = speech_service.speech_to_text(languages=[language])

                if success and result['text']:
                    user_text = result['text']
                    print(f"\n您说: {user_text}")

                    # Check whether the user wants to end the conversation
                    if (language == "zh-CN" and "再见" in user_text.lower()) or \
                       (language == "en-US" and "goodbye" in user_text.lower()):
                        print("对话结束")
                        continue_dialog = False
                        break

                    # Append the user message to the history
                    messages.append({
                        "role": "user",
                        "content": [{"type": "text", "text": user_text}]
                    })

                    # Ask the AI for a response
                    print("AI思考中...")
                    response = ai_client.chat_completion(messages=messages)
                    ai_text = response.choices[0].message.content
                    print(f"AI 响应: {ai_text}")

                    # Append the AI response to the history
                    messages.append({
                        "role": "assistant",
                        "content": [{"type": "text", "text": ai_text}]
                    })

                    # Text-to-speech: wait for synthesis to finish
                    print("正在生成语音...")
                    voice_name = "zh-CN-XiaoxiaoNeural" if language == "zh-CN" else "en-US-AriaNeural"
                    success, message = speech_service.text_to_speech(ai_text, voice_name=voice_name)
                    if not success:
                        print(f"语音合成失败: {message}")

                    print("AI语音播放完成,准备下一轮对话")
                else:
                    print("未能识别您的语音,请重试")
        else:
            print("无效的命令,请重试")


if __name__ == "__main__":
    main()
prompts_config.py
# System prompt configuration

# Main system prompt (written in Chinese)
MAIN_SYSTEM_PROMPT = """
你是一个智能AI助手,专注于提供有用、准确的信息。请遵循以下准则:
1. 保持回答简洁明了,避免冗长解释
2. 使用礼貌友好的语气
3. 如果不确定答案,坦诚表示不知道
4. 避免有害或不适当的内容
5. 提供准确、最新的信息
6. 尊重用户隐私,不要要求个人信息
7. 只能输出自然语言,禁止输出md格式的内容。
"""

# Language-specific supplementary prompts
LANGUAGE_PROMPTS = {
    "zh-CN": "请用中文简短回答。",
    "en-US": "Please respond in English concisely.",
    "ja-JP": "簡潔に日本語で回答してください。",
    # More languages can be added here
}


def get_system_prompt(language_code="zh-CN"):
    """Return the full system prompt for the given language."""
    language_prompt = LANGUAGE_PROMPTS.get(language_code, LANGUAGE_PROMPTS["zh-CN"])
    return f"{MAIN_SYSTEM_PROMPT}\n{language_prompt}"
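A short sketch of how this ties into the client above, mirroring what voice_chat() in main.py does:

from prompts_config import get_system_prompt
from azure_openai_client import AzureOpenAIClient

system_prompt = get_system_prompt("en-US")   # unknown codes fall back to zh-CN
ai_client = AzureOpenAIClient(system_prompt=system_prompt)

# Seed the conversation history with the same prompt
messages = [{"role": "system", "content": [{"type": "text", "text": system_prompt}]}]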
speech_utils.py
import azure.cognitiveservices.speech as speechsdk
import time


class SpeechService:
    def __init__(self, speech_key, service_region):
        self.speech_key = speech_key
        self.service_region = service_region
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=service_region
        )

    def text_to_speech(self, text, voice_name="zh-CN-XiaoxiaoNeural"):
        """
        Convert text to speech.
        :param text: the text to synthesize
        :param voice_name: voice name; defaults to a Chinese female voice
        :return: (success, message) tuple
        """
        try:
            self.speech_config.speech_synthesis_voice_name = voice_name
            speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config)

            # Flag used to track when synthesis has completed
            synthesis_completed = False

            def synthesis_completed_cb(evt):
                nonlocal synthesis_completed
                synthesis_completed = True

            # Register the completion event
            speech_synthesizer.synthesis_completed.connect(synthesis_completed_cb)

            result = speech_synthesizer.speak_text_async(text).get()
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                # Wait for the completion event
                while not synthesis_completed:
                    time.sleep(0.1)
                return True, "语音合成成功"
            elif result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                return False, f"语音合成取消: {cancellation_details.reason}"
            # Any other outcome is treated as a failure so callers always get a tuple
            return False, f"语音合成失败: {result.reason}"
        except Exception as e:
            return False, f"发生错误: {str(e)}"

    def speech_to_text(self, languages=None, continuous=False):
        """
        Convert speech to text.
        :param languages: list of candidate languages, e.g. ["zh-CN", "en-US", "ja-JP"]
        :param continuous: whether to use continuous recognition
        :return: (success, result) tuple
        """
        try:
            if languages:
                # Multi-language auto-detection
                auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=languages
                )
                speech_recognizer = speechsdk.SpeechRecognizer(
                    speech_config=self.speech_config,
                    auto_detect_source_language_config=auto_detect_source_language_config
                )
            else:
                # Default to Chinese
                self.speech_config.speech_recognition_language = "zh-CN"
                speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)

            if continuous:
                # Continuous recognition mode
                done = False

                def handle_result(evt):
                    # Note: return values from SDK event callbacks are discarded,
                    # so this branch only drains audio until the session stops.
                    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                        return True, {
                            "text": evt.result.text,
                            "detected_language": evt.result.properties.get(
                                speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                            )
                        }
                    return False, "无识别结果"

                def stop_cb(evt):
                    nonlocal done
                    done = True

                # Wire up the events
                speech_recognizer.recognized.connect(handle_result)
                speech_recognizer.session_stopped.connect(stop_cb)
                speech_recognizer.canceled.connect(stop_cb)

                # Start continuous recognition and wait until the session stops
                speech_recognizer.start_continuous_recognition()
                while not done:
                    time.sleep(0.5)
                speech_recognizer.stop_continuous_recognition()

                return True, {"text": "", "detected_language": None}
            else:
                # Single-shot recognition
                result = speech_recognizer.recognize_once()
                if result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    detected_language = None
                    if hasattr(result, 'properties') and result.properties.get(
                        speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                    ):
                        detected_language = result.properties[
                            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                        ]
                    return True, {
                        "text": result.text,
                        "detected_language": detected_language
                    }
                elif result.reason == speechsdk.ResultReason.NoMatch:
                    return False, f"无法识别语音: {result.no_match_details}"
                elif result.reason == speechsdk.ResultReason.Canceled:
                    return False, f"语音识别取消: {result.cancellation_details.reason}"
        except Exception as e:
            return False, f"发生错误: {str(e)}"

    def start_continuous_recognition(self, languages=None, callback=None):
        """
        Start continuous speech recognition.
        :param languages: list of candidate languages, e.g. ["zh-CN", "en-US", "ja-JP"]
        :param callback: callback invoked with (text, detected_language) for each result
        :return: the speech_recognizer object, for later control
        """
        try:
            if languages:
                # Multi-language auto-detection
                auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=languages
                )
                speech_recognizer = speechsdk.SpeechRecognizer(
                    speech_config=self.speech_config,
                    auto_detect_source_language_config=auto_detect_source_language_config
                )
            else:
                # Default to Chinese
                self.speech_config.speech_recognition_language = "zh-CN"
                speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)

            # Handle recognized results
            def handle_result(evt):
                if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    text = evt.result.text
                    detected_language = evt.result.properties.get(
                        speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                    )

                    if callback:
                        should_continue = callback(text, detected_language)
                        if should_continue is False:
                            # Stop recognition when the callback returns False
                            speech_recognizer.stop_continuous_recognition_async()

            # Handle cancellation / errors
            def handle_canceled(evt):
                if evt.reason == speechsdk.CancellationReason.Error:
                    print(f"语音识别错误: {evt.error_details}")

            # Wire up the event handlers
            speech_recognizer.recognized.connect(handle_result)
            speech_recognizer.canceled.connect(handle_canceled)

            # Start continuous recognition
            speech_recognizer.start_continuous_recognition_async()

            return speech_recognizer

        except Exception as e:
            print(f"启动连续识别时发生错误: {str(e)}")
            raise


def text_to_speech(text: str, language: str = "zh-CN") -> None:
    """
    Convert text to speech (module-level helper with hardcoded credentials).

    Args:
        text: the text to synthesize
        language: language code, defaults to Chinese
    """
    # Hardcoded key and region; replace with your own values
    speech_key = "your speech key"
    service_region = "your resource region, e.g. eastus"

    # Create the speech config
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region
    )

    # Pick a voice based on the language
    if language == "zh-CN":
        speech_config.speech_synthesis_voice_name = "zh-CN-XiaoxiaoNeural"
    else:
        speech_config.speech_synthesis_voice_name = "en-US-AriaNeural"

    # Create the synthesizer
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    # Run synthesis
    result = speech_synthesizer.speak_text_async(text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("语音合成完成")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"语音合成取消: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"错误详情: {cancellation_details.error_details}")


def speech_to_text(language: str = "zh-CN") -> str:
    """
    Convert speech to text (module-level helper with hardcoded credentials).

    Args:
        language: language code, defaults to Chinese

    Returns:
        the recognized text, or an empty string on failure
    """
    # Hardcoded key and region; replace with your own values
    speech_key = "your speech key"
    service_region = "your resource region"

    # Create the speech config
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region
    )

    # Set the recognition language
    speech_config.speech_recognition_language = language

    # Use the default microphone
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

    # Create the recognizer
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    print("开始说话...")

    # Run single-shot recognition
    result = speech_recognizer.recognize_once_async().get()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print(f"无法识别语音: {result.no_match_details}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"语音识别取消: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"错误详情: {cancellation_details.error_details}")

    return ""
stt01.py
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

from speech_utils import SpeechService


def main():
    # Create the SpeechService instance
    speech_service = SpeechService(
        speech_key="your speech key",
        service_region="eastus"
    )

    print("请说话...")
    success, result = speech_service.speech_to_text(languages=["zh-CN", "en-US", "ja-JP"])

    if success:
        print(f"识别结果: {result['text']}")
        if result['detected_language']:
            print(f"检测到的语言: {result['detected_language']}")
    else:
        print(f"错误: {result}")


if __name__ == "__main__":
    main()
tts01.py
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

from speech_utils import SpeechService


def main():
    # Create the SpeechService instance
    speech_service = SpeechService(
        speech_key="your speech key",
        service_region="eastus"
    )

    print("请输入要转换为语音的文字...")
    text = input()

    success, message = speech_service.text_to_speech(text)
    print(message)


if __name__ == "__main__":
    main()