资源获取
- OpenAI官网
- Azure OpenAI
我这里用的是第二种,从Azure上获取的模型资源,想要从这获取得先注册Azure,并添加OpenAI资源,而且部署OpenAI 4O RealTime模型,部署后可以获得终结点和密钥,类似如下格式:
终结点
- https://openaitest.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=gpt-4o-realtime-preview
复制代码 API KEY
- abcdefghijklmnopqrstuvwxyz123456789
复制代码 后端实现
参考OpenAI官方文档:https://platform.openai.com/docs/guides/realtime-model-capabilities
OpenAI RealTime不能用Http请求,只能用WebSocket或者WebRTC形式,本文只展示WebSocket方式对接。
本文使用的语言是C#。
调用接口代码示例:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.WebSockets;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Azure;
using Microsoft.AspNetCore.Http;
using Microsoft.IdentityModel.Tokens;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
namespace RealTime.Modules.Common.Api
{
    /// <summary>
    /// Minimal WebSocket client for the Azure OpenAI GPT-4o Realtime API.
    /// Creates a session with an instruction prompt, then sends text or base64-WAV
    /// questions over the same connection and collects the streamed answer.
    /// </summary>
    public class RealTimeApi
    {
        // Live connections keyed by the id returned from CreatedConnect.
        // Kept as a public static List for backward compatibility; all access inside
        // this class is serialized through _gate because a static List is not thread-safe.
        public static List<RealTimeConnect> RealTimeConnectList = new List<RealTimeConnect>();

        // Guards RealTimeConnectList (never lock on the list itself or `this`).
        private static readonly object _gate = new object();

        // NOTE: "clinet" is a typo for "client" but is part of the public record shape,
        // so it is preserved to avoid breaking existing callers.
        public record RealTimeConnect(string id, ClientWebSocket clinet, string conversationItemID, CancellationTokenSource cts);

        /// <summary>
        /// 创建连接 — opens the WebSocket, pushes the system prompt via session.update,
        /// and sends one priming user message so later questions can reference it.
        /// </summary>
        /// <param name="prompt">Instruction prompt; a default translator prompt is used when empty.</param>
        /// <returns>连接ID,用于下次带session的问答</returns>
        /// <exception cref="Exception">Thrown when the priming conversation item is never acknowledged.</exception>
        public async Task<string> CreatedConnect(string prompt = "")
        {
            if (string.IsNullOrEmpty(prompt))
                prompt = "Your answer can only be a translation of what I said";

            string API_KEY = "YOUR API KEY";
            string ENDPOINT = "wss://YOUR ENDPOINT/openai/realtime?api-version=2024-10-01-preview&deployment=gpt-4o-realtime-preview";

            var ws = new ClientWebSocket();
            // Azure OpenAI authenticates the realtime socket with the "api-key" header.
            ws.Options.SetRequestHeader("api-key", $"{API_KEY}");
            var cts = new CancellationTokenSource();
            await ws.ConnectAsync(new Uri(ENDPOINT), cts.Token);

            string id = Guid.NewGuid().ToString("N");

            // Install the instruction prompt for the whole session.
            await SendAsync(ws, cts, new
            {
                type = "session.update",
                session = new
                {
                    instructions = prompt
                }
            });
            Console.WriteLine("提示词发送成功");

            // Priming user message; the server echoes it back as conversation.item.created
            // and the returned item id is referenced by every later question.
            await SendAsync(ws, cts, new
            {
                type = "conversation.item.create",
                item = new
                {
                    type = "message",
                    role = "user",
                    content = new[]
                    {
                        new {
                            type = "input_text",
                            text = prompt
                        }
                    }
                }
            });
            Console.WriteLine("初始化问题发送成功");

            var conversationItemID = string.Empty;
            while (ws.State == WebSocketState.Open)
            {
                var (messageType, payload) = await ReceiveFullMessageAsync(ws, cts);
                if (messageType == WebSocketMessageType.Close)
                    break;
                if (messageType != WebSocketMessageType.Text)
                    continue;

                var msg = JObject.Parse(Encoding.UTF8.GetString(payload));
                // (Fix: the original switch re-tested msg["type"] inside its own case — dead check removed.)
                if (msg["type"].Value<string>() == "conversation.item.created")
                {
                    conversationItemID = msg["item"]["id"].Value<string>();
                    string conversationContent = msg["item"]["content"].Children().First()["text"].Value<string>();
                    Console.WriteLine($"获取到前置对话ID:{conversationItemID},Content:{conversationContent}");
                    break;
                }
            }

            if (string.IsNullOrEmpty(conversationItemID))
                throw new Exception("前置提示词发送失败");

            lock (_gate)
                RealTimeConnectList.Add(new RealTimeConnect(id, ws, conversationItemID, cts));
            return id;
        }

        /// <summary>
        /// 问答 — sends one question on an existing connection and blocks until
        /// response.done (or an error event), accumulating the streamed text deltas.
        /// </summary>
        /// <param name="id">CreatedConnect 所创建的连接ID</param>
        /// <param name="question">问题或者WAV文件的Base64字符串</param>
        /// <param name="isAudio">是否输入为音频</param>
        /// <returns>回答</returns>
        /// <exception cref="NullReferenceException">连接不存在 (kept for backward compat; InvalidOperationException would be more idiomatic)</exception>
        public async Task<string> SendQuestion(string id, string question, bool isAudio = true)
        {
            RealTimeConnect connect;
            lock (_gate)
                connect = RealTimeConnectList.FirstOrDefault(x => x.id == id);
            if (connect == null)
                throw new NullReferenceException("连接不存在");

            var audioChunks = new List<byte>();
            var response = string.Empty;
            var ws = connect.clinet;
            var cts = connect.cts;

            // `content` is shared by reference with `input`, so adding to it below
            // also populates the message entry of `input`.
            var content = new List<dynamic>();
            var input = new List<dynamic>
            {
                new
                {
                    type = "item_reference",
                    id = string.IsNullOrEmpty(connect.conversationItemID) ? "" : connect.conversationItemID
                },
                new
                {
                    type = "message",
                    role = "user",
                    content = content
                }
            };
            if (isAudio)
            {
                content.Add(new
                {
                    type = "input_audio", // base64 WAV input uses type input_audio
                    audio = question
                });
            }
            else
            {
                content.Add(new
                {
                    type = "input_text", // plain-text question uses type input_text
                    text = question
                });
            }

            // Ask for an out-of-band response ("none") so it is not appended to the
            // default conversation; add "audio" to modalities to also get spoken audio.
            await SendAsync(ws, cts, new
            {
                type = "response.create",
                response = new
                {
                    conversation = "none",
                    metadata = new { topic = "translate" },
                    modalities = new[] { /*"audio", */"text" },
                    input = input
                }
            });

            bool done = false;
            while (!done && ws.State == WebSocketState.Open)
            {
                var (messageType, payload) = await ReceiveFullMessageAsync(ws, cts);
                if (messageType == WebSocketMessageType.Close)
                    break;
                if (messageType != WebSocketMessageType.Text)
                    continue;

                var msg = JObject.Parse(Encoding.UTF8.GetString(payload));
                switch (msg["type"].Value<string>())
                {
                    case "response.done": // 回答结束
                        Console.WriteLine($"提问结束:{question}");
                        done = true;
                        break;
                    case "error":
                        Console.WriteLine($"错误: {msg["error"]}");
                        done = true;
                        break;
                    case "response.audio_transcript.delta":
                    case "response.text.delta":
                        // Both event types carry an incremental text fragment in "delta".
                        Console.WriteLine($"识别文本: {msg["delta"]}");
                        response += msg["delta"].Value<string>();
                        break;
                    case "response.audio.delta": // only sent when modalities includes "audio"
                        var audioData = Convert.FromBase64String(msg["delta"].Value<string>());
                        Console.WriteLine($"收到音频数据: {audioData.Length}字节");
                        audioChunks.AddRange(audioData);
                        break;
                }
            }

            // Save audio output, if any (only present when modalities includes "audio").
            if (audioChunks.Count > 0)
            {
                var totalAudio = audioChunks.ToArray();
                // Fix: the realtime API streams pcm16 at 24 kHz mono; the header was
                // previously written with 16 kHz, which made the saved WAV play slowed down.
                var header = CreateWavHeader(
                    sampleRate: 24000,
                    bitsPerSample: 16,
                    channels: 1,
                    dataSize: totalAudio.Length
                );
                // NOTE(review): hard-coded output path from the original sample — make configurable.
                File.WriteAllBytes(@"D:\test\output.wav", CombineBytes(header, totalAudio));
                Console.WriteLine("音频文件已保存为 output.wav");
            }
            else
            {
                Console.WriteLine("未接收到音频数据");
            }
            Console.WriteLine("回答:" + response);
            return response;
        }

        /// <summary>
        /// Closes a connection and (fix) removes it from the static list and disposes
        /// the socket/token source — previously every closed connection leaked.
        /// </summary>
        /// <exception cref="NullReferenceException">连接不存在 (kept for backward compat)</exception>
        public async Task CloseConnect(string id)
        {
            RealTimeConnect connect;
            lock (_gate)
                connect = RealTimeConnectList.FirstOrDefault(x => x.id == id);
            if (connect == null)
                throw new NullReferenceException("连接不存在");

            await connect.clinet.CloseAsync(WebSocketCloseStatus.NormalClosure, "", connect.cts.Token);
            lock (_gate)
                RealTimeConnectList.Remove(connect);
            connect.cts.Cancel();
            connect.clinet.Dispose();
            connect.cts.Dispose();
        }

        /// <summary>
        /// Reads one complete WebSocket message. The server splits payloads larger than
        /// the 4 KB buffer across frames, so frames are accumulated until EndOfMessage.
        /// A Close frame is acknowledged and reported as WebSocketMessageType.Close.
        /// </summary>
        private static async Task<(WebSocketMessageType MessageType, byte[] Payload)> ReceiveFullMessageAsync(
            ClientWebSocket ws, CancellationTokenSource cts)
        {
            var buffer = new byte[4096];
            using (var ms = new MemoryStream())
            {
                WebSocketReceiveResult result;
                do
                {
                    result = await ws.ReceiveAsync(new ArraySegment<byte>(buffer), cts.Token);
                    if (result.MessageType == WebSocketMessageType.Close)
                    {
                        await ws.CloseAsync(WebSocketCloseStatus.NormalClosure, "", cts.Token);
                        return (WebSocketMessageType.Close, Array.Empty<byte>());
                    }
                    ms.Write(buffer, 0, result.Count);
                }
                while (!result.EndOfMessage);
                return (result.MessageType, ms.ToArray());
            }
        }

        /// <summary>文件转Base64字符串 (path overload); returns null on failure.</summary>
        public static string ConvertFileToBase64(string filePath)
        {
            try
            {
                byte[] fileBytes = File.ReadAllBytes(filePath);
                return Convert.ToBase64String(fileBytes);
            }
            catch (Exception ex)
            {
                // Best-effort: callers treat null as "conversion failed".
                Console.WriteLine($"转换失败: {ex.Message}");
                return null;
            }
        }

        /// <summary>文件转Base64字符串 (uploaded-file overload).</summary>
        /// <exception cref="ArgumentException">文件不能为空</exception>
        public static async Task<string> ConvertToBase64Async(IFormFile file)
        {
            if (file == null || file.Length == 0)
                throw new ArgumentException("文件不能为空");
            using (var memoryStream = new MemoryStream())
            {
                await file.CopyToAsync(memoryStream);
                return Convert.ToBase64String(memoryStream.ToArray());
            }
        }

        /// <summary>Serializes an event object to JSON and sends it as one text frame.</summary>
        public async Task SendAsync(ClientWebSocket ws, CancellationTokenSource cts, object obj)
        {
            var json = JsonConvert.SerializeObject(obj);
            await ws.SendAsync(
                Encoding.UTF8.GetBytes(json),
                WebSocketMessageType.Text,
                true, // endOfMessage: each event is a single frame
                cts.Token);
        }

        /// <summary>Concatenates byte arrays in order (used to prepend the WAV header).</summary>
        public static byte[] CombineBytes(params byte[][] arrays)
        {
            var output = new MemoryStream();
            foreach (var arr in arrays)
            {
                output.Write(arr, 0, arr.Length);
            }
            return output.ToArray();
        }

        /// <summary>
        /// Builds a 44-byte canonical RIFF/WAVE header for raw PCM data.
        /// </summary>
        /// <param name="sampleRate">Samples per second (e.g. 24000).</param>
        /// <param name="bitsPerSample">Bits per sample (e.g. 16).</param>
        /// <param name="channels">Channel count (1 = mono).</param>
        /// <param name="dataSize">Length of the PCM payload in bytes.</param>
        public static byte[] CreateWavHeader(int sampleRate, int bitsPerSample, int channels, int dataSize)
        {
            using (var ms = new MemoryStream())
            using (var writer = new BinaryWriter(ms))
            {
                // RIFF chunk
                writer.Write(Encoding.ASCII.GetBytes("RIFF"));
                writer.Write(dataSize + 36); // file size minus the 8-byte RIFF header
                writer.Write(Encoding.ASCII.GetBytes("WAVE"));
                // fmt sub-chunk
                writer.Write(Encoding.ASCII.GetBytes("fmt "));
                writer.Write(16);            // fmt chunk length for PCM
                writer.Write((short)1);      // audio format 1 = PCM
                writer.Write((short)channels);
                writer.Write(sampleRate);
                writer.Write(sampleRate * channels * bitsPerSample / 8); // byte rate
                writer.Write((short)(channels * bitsPerSample / 8));     // block align
                writer.Write((short)bitsPerSample);
                // data sub-chunk
                writer.Write(Encoding.ASCII.GetBytes("data"));
                writer.Write(dataSize);
                return ms.ToArray();
            }
        }
    }
}
复制代码 RealTime Api接收的语音文件可以为wav格式,但当文件的声道数量和比特率不对时,会导致识别有问题,所以如果传入文件的比特率大于256,需要降低比特率再传入

降低后

ffmpeg处理音频文件
这里用ffmpeg来处理:
这里只提供c#的处理思路,java python应该更方便吧~
首先上ffmpeg官网https://ffmpeg.org/download.html
找到

进入后找最新的下载就行

解压后是这样的

后面要想部署到linux docker的话,就把linux的一并下载(部署在windows iis的可以忽略)


windows我们只用到了bin文件夹的三个exe

linux只用到这两个文件

把这些文件全放入项目里,我这里放在ff/bin 下

转换代码
- using Xabe.FFmpeg;
/// <summary>
/// wav文件比特率转换 — re-encodes the audio to mono, 16 kHz, 16-bit PCM via ffmpeg
/// so the Realtime API can recognize it reliably.
/// </summary>
/// <param name="filePath">文件路径 (source audio file)</param>
/// <returns>Tuple of (base64 of converted file, path of converted file)</returns>
public static async Task<(string, string)> ConvertVideoAsync(string filePath)
{
    // Directory containing the bundled ffmpeg/ffprobe executables.
    FFmpeg.SetExecutablesPath(@"ff/bin");
    // Automatic download fallback (unused — too slow):
    //await FFmpegDownloader.GetLatestVersion(FFmpegVersion.Official);

    var mediaInfo = await FFmpeg.GetMediaInfo(filePath);

    string saveDirectory = @"UploadFile\Convert";
    if (!Directory.Exists(saveDirectory))
        Directory.CreateDirectory(saveDirectory);

    string extension = Path.GetExtension(filePath);
    string outputFileName = $"{Path.GetFileNameWithoutExtension(filePath)}_Convert{extension}";
    string outputFilePath = Path.Combine(saveDirectory, outputFileName);

    // -ac 1: mono; -ar 16000: 16 kHz; -acodec pcm_s16le: signed 16-bit little-endian PCM.
    // (Fix: the original added "-acodec pcm_s16le" twice.)
    var conversion = FFmpeg.Conversions.New()
        .AddStream(mediaInfo.Streams)
        .AddParameter("-ac 1")
        .AddParameter("-ar 16000 -acodec pcm_s16le")
        .SetOutput(outputFilePath);
    await conversion.Start();

    // Read the converted file back and hand it out as base64 alongside its path.
    byte[] fileBytes = await File.ReadAllBytesAsync(outputFilePath);
    return (Convert.ToBase64String(fileBytes), outputFilePath);
}
复制代码 调用实例
RealTimeApi realTimeApi; // injected via DI
/// <summary>
/// 问答 — sends either an uploaded audio file or a text question to the realtime session.
/// </summary>
/// <param name="formFile">文件 (WAV upload, used when <paramref name="question"/> is empty)</param>
/// <param name="question">问题 (text question; takes precedence when non-empty)</param>
/// <param name="connectID">连接ID 为空时自动新增</param>
/// <param name="prompt">提示词 (used only when a new connection is created)</param>
/// <returns>回答 list (single element)</returns>
public async Task<List<string>> SendAudioQuestionAsync(IFormFile formFile, string question, string connectID, string prompt)
{
    // (Fix: removed a try/catch that only rethrew; doc comment for the 4th
    // parameter previously duplicated "connectID" instead of "prompt".)
    if (string.IsNullOrEmpty(connectID))
        connectID = await realTimeApi.CreatedConnect(prompt);

    List<string> responses = new List<string>();
    if (string.IsNullOrEmpty(question))
    {
        // Audio path: persist the upload, downsample it, then send the base64 payload.
        string filePath = await SaveToLocalAsync(formFile, @"UploadFile\Org");
        (string base64, string convertFilePath) = await AudioProcessor.ConvertVideoAsync(filePath);
        // Temporary files are no longer needed once the base64 string is in memory.
        File.Delete(convertFilePath);
        File.Delete(filePath);
        if (base64 == null)
            base64 = await RealTimeApi.ConvertToBase64Async(formFile); // fallback: raw upload
        string response = await realTimeApi.SendQuestion(connectID, base64, true);
        responses.Add(response);
    }
    else
    {
        // Text path: forward the question as-is.
        responses.Add(await realTimeApi.SendQuestion(connectID, question, false));
    }
    return responses;
}
-
/// <summary>
/// Saves an uploaded file to a local directory, creating the directory if needed.
/// When no custom name is given, a timestamp plus a random suffix is appended to
/// reduce the chance of name collisions between concurrent uploads.
/// </summary>
/// <param name="file">Uploaded file; must be non-null and non-empty.</param>
/// <param name="saveDirectory">Target directory (created when missing).</param>
/// <param name="customFileName">Optional explicit file name; overrides the generated one.</param>
/// <returns>Full path of the saved file.</returns>
/// <exception cref="ArgumentException">文件不能为空 / 保存目录不能为空</exception>
public static async Task<string> SaveToLocalAsync(
    IFormFile file,
    string saveDirectory,
    string? customFileName = null)
{
    if (file == null || file.Length == 0)
        throw new ArgumentException("文件不能为空");
    if (string.IsNullOrEmpty(saveDirectory))
        throw new ArgumentException("保存目录不能为空");

    if (!Directory.Exists(saveDirectory))
        Directory.CreateDirectory(saveDirectory);

    string extension = Path.GetExtension(file.FileName);
    // Fix: use the thread-safe shared instance instead of allocating a time-seeded
    // Random per call (two calls in the same tick produced identical suffixes).
    string fileName = customFileName ??
        $"{Path.GetFileNameWithoutExtension(file.FileName)}-{DateTime.Now:MMdd_HHmmss}_{Random.Shared.Next(100, 999)}{extension}";

    string filePath = Path.Combine(saveDirectory, fileName);

    using (var fileStream = new FileStream(filePath, FileMode.Create))
    {
        await file.CopyToAsync(fileStream);
    }
    return filePath;
}
复制代码 后续用Controller调用一下就行
前端实现
vite+vue
引入录音包
index.vue
<template>
  <div>
    <AudioRecorder />
  </div>
</template>

<script>
import AudioRecorder from '../components/AudioRecorder.vue'

// Thin page wrapper whose only job is to host the recorder component.
export default {
  components: { AudioRecorder }
}
</script>
复制代码 AudioRecorder.vue
<template>
  <div class="audio-recorder">
    <div>
      <p>提示词:</p>
      <textarea v-model="state.prompt" style="width: 306px; height: 213px;"></textarea>
    </div>
    <button
      @click="toggleRecording"
      :class="{ 'recording': state.isRecording }"
    >
      {{ state.isRecording ? '录音中...' : '开始录音' }}
    </button>
    <p v-if="state.recordingTime">已录制: {{ state.formattedTime }}</p>
    <div v-if="state.error" class="error-message">{{ state.error }}</div>
  </div>
  <!-- Fix: v-for now has a :key so Vue can track list items -->
  <div v-for="(item, index) in state.responseMessage" :key="index">
    <p>{{ index + 1 }}:{{ item }}</p>
  </div>
</template>

<script>
import { reactive, onBeforeUnmount } from 'vue'
import Recorder from 'recorder-js' // install via npm

export default {
  name: 'AudioRecorder',
  setup() {
    const state = reactive({
      recorder: null,
      audioContext: null,
      mediaStream: null,
      isRecording: false,
      isProcessing: false,
      error: null,
      startTime: 0,
      responseMessage: [],
      // Fix: connectId is now declared in the reactive state (it was previously
      // assigned in uploadChunk without being declared) and is sent back to the
      // server so follow-up recordings reuse the same realtime connection
      // instead of opening a new one on every upload.
      connectId: '',
      prompt: 'Your answer can only be a translation of what I said.',
    })

    // Set up the AudioContext, microphone stream and recorder instance.
    const initRecorder = async () => {
      try {
        // Tear down any previous instance before re-initializing.
        if (state.recorder) {
          // NOTE(review): recorder-js may not expose destroy(); verify against the
          // installed package version.
          state.recorder.destroy()
          state.audioContext.close()
        }
        state.audioContext = new (window.AudioContext || window.webkitAudioContext)()
        state.mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })

        state.recorder = new Recorder(state.audioContext, {
          numChannels: 1, // mono — matches what the backend converts to
        })

        await state.recorder.init(state.mediaStream)
      } catch (err) {
        handleError(err)
      }
    }

    // Start capturing audio.
    const startRecording = async () => {
      try {
        if (!state.recorder) {
          await initRecorder()
        }

        await state.recorder.start()
        state.isRecording = true
        state.startTime = Date.now()
        state.error = null
      } catch (err) {
        handleError(err)
      }
    }

    // Stop capturing, package the WAV blob and upload it.
    const stopRecording = async () => {
      if (!state.isRecording) return

      state.isRecording = false
      try {
        state.isProcessing = true

        // Wait for the recorder to flush and hand back the data.
        const { blob, buffer } = await state.recorder.stop()

        console.log('获取到音频Blob:', blob)
        console.log('音频Buffer:', buffer)

        // Local download for debugging:
        //Recorder.download(blob, 'recording')

        const formData = new FormData()
        formData.append('files', blob, `recording_${Date.now()}.wav`)
        formData.append('IsAudio', true)
        formData.append('prompt', state.prompt)
        // Fix: reuse the realtime connection created by the first request.
        if (state.connectId) formData.append('connectID', state.connectId)
        await uploadChunk(formData)

      } catch (err) {
        handleError(err)
      } finally {
        // Release microphone and audio resources regardless of outcome.
        state.mediaStream?.getTracks().forEach(track => track.stop())
        state.audioContext?.close()
        state.recorder = null
        state.isRecording = false
        state.isProcessing = false
      }
    }

    // POST the recording to the backend and append the translated answer.
    const uploadChunk = async (formData) => {
      try {
        const response = await fetch('http://localhost:12132/api/RealTime/AudioTranslate', {
          method: 'POST',
          body: formData
        })

        if (!response.ok) throw new Error(`上传失败: ${response.status}`)
        console.log(response)
        const jsonRes = await response.json()
        // Remember the connection id so the next upload reuses the session.
        state.connectId = jsonRes.data.connectId
        const currentTime = new Date().toLocaleTimeString()
        state.responseMessage.push(currentTime + ':' + jsonRes.data.responses[0])
        return jsonRes
      } catch (err) {
        handleError(err)
      }
    }

    // Surface the error to the UI and make sure recording is stopped.
    const handleError = (error) => {
      console.error('录音错误:', error)
      state.error = error.message || '录音功能异常'
      stopRecording()
    }

    // Toggle between start and stop.
    const toggleRecording = () => {
      state.isRecording ? stopRecording() : startRecording()
    }

    // Clean up if the component unmounts mid-recording.
    onBeforeUnmount(() => {
      if (state.isRecording) stopRecording()
    })

    return {
      state,
      toggleRecording
    }
  }
}
</script>

<style scoped>
.audio-recorder {
  max-width: 900px;
  margin: 20px auto;
  padding: 20px;
  border: 1px solid #eee;
  border-radius: 8px;
}

button {
  padding: 10px 20px;
  background: #42b983;
  color: white;
  border: none;
  border-radius: 4px;
  cursor: pointer;
  transition: background 0.3s;
}

button:disabled {
  background: #ccc;
  cursor: not-allowed;
}

button.recording {
  background: #ff4757;
  animation: pulse 1s infinite;
}

@keyframes pulse {
  0% { opacity: 1; }
  50% { opacity: 0.5; }
  100% { opacity: 1; }
}

.error-message {
  color: #ff4757;
  margin-top: 10px;
}
</style>
复制代码 结果
其实这个模型还是不太灵敏,需要一个字一个字的说才能正常识别
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。 |