first commit

This commit is contained in:
lingxiao865
2025-09-18 17:50:03 +08:00
parent 4c3c83092c
commit f5c4158fc1
16 changed files with 31819 additions and 0 deletions

8
.idea/.gitignore generated vendored Normal file
View File

@@ -0,0 +1,8 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

8
.idea/PythonProject.iml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="C:\Users\Administrator\miniconda3" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@@ -0,0 +1,12 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N802" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

6
.idea/misc.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="C:\Users\Administrator\miniconda3" />
</component>
</project>

8
.idea/modules.xml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/PythonProject.iml" filepath="$PROJECT_DIR$/.idea/PythonProject.iml" />
</modules>
</component>
</project>

332
1.html Normal file
View File

@@ -0,0 +1,332 @@
<!--<!DOCTYPE html>-->
<!--<html lang="zh">-->
<!--<head>-->
<!-- <meta charset="UTF-8">-->
<!-- <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">-->
<!-- <title>语音识别系统</title>-->
<!-- <style>-->
<!-- body {-->
<!-- font-family: Arial, sans-serif;-->
<!-- max-width: 800px;-->
<!-- margin: 0 auto;-->
<!-- padding: 20px;-->
<!-- background-color: #f5f5f5;-->
<!-- }-->
<!-- .container {-->
<!-- background-color: white;-->
<!-- padding: 30px;-->
<!-- border-radius: 10px;-->
<!-- box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);-->
<!-- }-->
<!-- h1 {-->
<!-- color: #333;-->
<!-- text-align: center;-->
<!-- }-->
<!-- .btn {-->
<!-- background-color: #4CAF50;-->
<!-- color: white;-->
<!-- padding: 12px 24px;-->
<!-- border: none;-->
<!-- border-radius: 5px;-->
<!-- cursor: pointer;-->
<!-- font-size: 16px;-->
<!-- margin: 10px;-->
<!-- touch-action: manipulation;-->
<!-- }-->
<!-- .btn:hover {-->
<!-- background-color: #45a049;-->
<!-- }-->
<!-- .btn:disabled {-->
<!-- background-color: #cccccc;-->
<!-- cursor: not-allowed;-->
<!-- }-->
<!-- .record-btn {-->
<!-- background-color: #f44336;-->
<!-- padding: 15px 30px;-->
<!-- font-size: 18px;-->
<!-- }-->
<!-- .record-btn:hover {-->
<!-- background-color: #d32f2f;-->
<!-- }-->
<!-- .record-btn.recording {-->
<!-- background-color: #ff9800;-->
<!-- animation: pulse 1s infinite;-->
<!-- }-->
<!-- @keyframes pulse {-->
<!-- 0% {-->
<!-- transform: scale(1);-->
<!-- }-->
<!-- 50% {-->
<!-- transform: scale(1.05);-->
<!-- }-->
<!-- 100% {-->
<!-- transform: scale(1);-->
<!-- }-->
<!-- }-->
<!-- .stream-result {-->
<!-- margin-top: 20px;-->
<!-- padding: 15px;-->
<!-- border-radius: 5px;-->
<!-- background-color: #e3f2fd;-->
<!-- min-height: 50px;-->
<!-- }-->
<!-- .recording-indicator {-->
<!-- text-align: center;-->
<!-- margin: 10px 0;-->
<!-- font-weight: bold;-->
<!-- color: #ff9800;-->
<!-- display: none;-->
<!-- }-->
<!-- .error-message {-->
<!-- color: #f44336;-->
<!-- background-color: #ffebee;-->
<!-- padding: 10px;-->
<!-- border-radius: 5px;-->
<!-- margin: 10px 0;-->
<!-- display: none;-->
<!-- }-->
<!-- /* 移动端适配 */-->
<!-- @media screen and (max-width: 768px) {-->
<!-- body {-->
<!-- padding: 10px;-->
<!-- }-->
<!-- .container {-->
<!-- padding: 15px;-->
<!-- }-->
<!-- .btn {-->
<!-- width: 100%;-->
<!-- margin: 10px 0;-->
<!-- padding: 15px;-->
<!-- font-size: 18px;-->
<!-- }-->
<!-- .record-btn {-->
<!-- padding: 20px;-->
<!-- font-size: 20px;-->
<!-- }-->
<!-- }-->
<!-- @media screen and (max-width: 480px) {-->
<!-- h1 {-->
<!-- font-size: 24px;-->
<!-- }-->
<!-- }-->
<!-- </style>-->
<!--</head>-->
<!--<body>-->
<!--<div class="container">-->
<!-- <h1>实时语音识别系统</h1>-->
<!-- <div>-->
<!-- <p>按住下方按钮开始录音,识别结果将实时显示</p>-->
<!-- <button class="btn record-btn" id="recordBtn">按住录音-->
<!-- </button>-->
<!-- <div class="error-message" id="errorMessage"></div>-->
<!-- </div>-->
<!-- <div class="recording-indicator" id="recordingIndicator">正在录音和识别...</div>-->
<!-- <div class="stream-result" id="recordStreamResult" style="display: none;"></div>-->
<!--</div>-->
<!--<script>-->
<!-- let websocket;-->
<!-- const recordButton = document.getElementById('recordBtn');-->
<!-- const status = document.getElementById('recordingIndicator');-->
<!-- const recordStreamResult = document.getElementById('recordStreamResult');-->
<!-- let mediaRecorder;-->
<!-- let audioChunks = [];-->
<!-- let isRecording = false;-->
<!-- let isRecorderInitialized = false;-->
<!-- // connectWebSocket();-->
<!-- // 初始化录音功能-->
<!-- async function initRecorder() {-->
<!-- if (isRecorderInitialized) return; // 避免重复初始化-->
<!-- try {-->
<!-- const stream = await navigator.mediaDevices.getUserMedia({audio: true});-->
<!-- mediaRecorder = new MediaRecorder(stream);-->
<!-- isRecorderInitialized = true;-->
<!-- // 监听录制的数据-->
<!-- mediaRecorder.addEventListener('dataavailable', event => {-->
<!-- audioChunks.push(event.data);-->
<!-- });-->
<!-- // 录音停止后处理数据-->
<!-- mediaRecorder.addEventListener('stop', () => {-->
<!-- status.textContent = '正在上传...';-->
<!-- // 将音频数据组合成 Blob-->
<!-- const audioBlob = new Blob(audioChunks, {type: 'audio/wav'}); // 可根据需要改为 audio/wav 等-->
<!-- audioChunks = []; // 清空缓存-->
<!-- // websocket.send(audioBlob)-->
<!-- // 发送到后端-->
<!-- uploadAudioToBackend(audioBlob);-->
<!-- });-->
<!-- } catch (err) {-->
<!-- console.error('无法访问麦克风:', err);-->
<!-- status.textContent = '麦克风访问被拒绝或不可用';-->
<!-- showError('无法访问麦克风,请检查权限设置');-->
<!-- }-->
<!-- }-->
<!-- // 开始录音-->
<!-- function startRecording() {-->
<!-- // 如果录音器尚未初始化,则初始化-->
<!-- if (!isRecorderInitialized) {-->
<!-- initRecorder().then(() => {-->
<!-- // 初始化成功后开始录音-->
<!-- if (isRecorderInitialized && mediaRecorder && mediaRecorder.state !== 'recording') {-->
<!-- audioChunks = [];-->
<!-- mediaRecorder.start();-->
<!-- isRecording = true;-->
<!-- recordButton.textContent = '正在录音...';-->
<!-- status.textContent = '录音中...';-->
<!-- status.style.display = 'block';-->
<!-- recordStreamResult.style.display = 'block';-->
<!-- recordStreamResult.innerHTML = '<h3>实时识别结果:</h3><p>正在识别...</p>';-->
<!-- }-->
<!-- }).catch(err => {-->
<!-- console.error('初始化失败:', err);-->
<!-- status.textContent = '初始化录音失败';-->
<!-- });-->
<!-- } else if (mediaRecorder && mediaRecorder.state !== 'recording') {-->
<!-- audioChunks = [];-->
<!-- mediaRecorder.start();-->
<!-- isRecording = true;-->
<!-- recordButton.textContent = '正在录音...';-->
<!-- status.textContent = '录音中...';-->
<!-- status.style.display = 'block';-->
<!-- recordStreamResult.style.display = 'block';-->
<!-- recordStreamResult.innerHTML = '<h3>实时识别结果:</h3><p>正在识别...</p>';-->
<!-- }-->
<!-- }-->
<!-- // 停止录音-->
<!-- function stopRecording() {-->
<!-- if (mediaRecorder && mediaRecorder.state === 'recording') {-->
<!-- mediaRecorder.stop();-->
<!-- isRecording = false;-->
<!-- recordButton.textContent = '按住说话';-->
<!-- status.textContent = '已停止';-->
<!-- }-->
<!-- }-->
<!-- // 显示错误信息-->
<!-- function showError(message) {-->
<!-- const errorElement = document.getElementById('errorMessage');-->
<!-- errorElement.textContent = message;-->
<!-- errorElement.style.display = 'block';-->
<!-- setTimeout(() => {-->
<!-- errorElement.style.display = 'none';-->
<!-- }, 5000); // 5秒后隐藏错误信息-->
<!-- }-->
<!-- // 上传音频到后端-->
<!-- async function uploadAudioToBackend(blob) {-->
<!-- const formData = new FormData();-->
<!-- formData.append('file', blob, 'recording.wav'); // 文件名可自定义-->
<!-- try {-->
<!-- const response = await fetch('http://192.168.1.2:5000/transcribe', {-->
<!-- method: 'POST',-->
<!-- body: formData-->
<!-- });-->
<!-- if (response.ok) {-->
<!-- const result = await response.json();-->
<!-- console.log('上传成功:', result);-->
<!-- status.textContent = '上传成功!';-->
<!-- recordStreamResult.innerHTML += `<p>${result.result}</p>`;-->
<!-- } else {-->
<!-- status.textContent = '上传失败';-->
<!-- }-->
<!-- } catch (error) {-->
<!-- console.error('上传出错:', error);-->
<!-- status.textContent = '上传失败:网络错误';-->
<!-- }-->
<!-- }-->
<!-- // 初始化事件监听-->
<!-- recordButton.addEventListener('mousedown', () => {-->
<!-- if (!isRecording) startRecording();-->
<!-- });-->
<!-- recordButton.addEventListener('mouseup', () => {-->
<!-- if (isRecording) stopRecording();-->
<!-- });-->
<!-- // 支持移动端touch事件-->
<!-- recordButton.addEventListener('touchstart', (e) => {-->
<!-- e.preventDefault(); // 防止默认行为-->
<!-- if (!isRecording) startRecording();-->
<!-- });-->
<!-- recordButton.addEventListener('touchend', (e) => {-->
<!-- e.preventDefault();-->
<!-- if (isRecording) stopRecording();-->
<!-- });-->
<!-- // 建立WebSocket连接-->
<!-- function connectWebSocket() {-->
<!-- // 处理不同协议的情况-->
<!-- let wsUrl = "ws://localhost:5000/ws/stream_transcribe";-->
<!-- if (window.location.protocol === "https:") {-->
<!-- wsUrl = "wss://localhost:5000/ws/stream_transcribe";-->
<!-- }-->
<!-- websocket = new WebSocket(wsUrl);-->
<!-- websocket.onopen = function (event) {-->
<!-- console.log("WebSocket连接已建立");-->
<!-- };-->
<!-- websocket.onmessage = function (event) {-->
<!-- // 更新识别结果-->
<!-- const currentContent = recordStreamResult.innerHTML;-->
<!-- // 如果是第一次收到结果,替换"正在识别..."提示-->
<!-- if (currentContent.includes("正在识别...")) {-->
<!-- recordStreamResult.innerHTML = '<h3>实时识别结果:</h3><p>' + event.data + '</p>';-->
<!-- } else {-->
<!-- // 否则追加结果-->
<!-- const newContent = currentContent.replace('</p>', '') + event.data + '</p>';-->
<!-- recordStreamResult.innerHTML = newContent;-->
<!-- }-->
<!-- };-->
<!-- websocket.onerror = function (error) {-->
<!-- console.log("WebSocket错误:", error);-->
<!-- showError('连接服务器失败,请检查服务器是否运行');-->
<!-- recordStreamResult.innerHTML = '<h3>实时识别结果:</h3><p>识别出错,请查看控制台了解详情</p>';-->
<!-- };-->
<!-- websocket.onclose = function (event) {-->
<!-- console.log("WebSocket连接已关闭");-->
<!-- if (event.wasClean) {-->
<!-- console.log(`连接关闭,代码=${event.code},原因=${event.reason}`);-->
<!-- } else {-->
<!-- console.log('连接意外中断');-->
<!-- }-->
<!-- };-->
<!-- }-->
<!--</script>-->
<!--</body>-->
<!--</html>-->

137
2.html Normal file
View File

@@ -0,0 +1,137 @@
<!--&lt;!&ndash; index.html &ndash;&gt;-->
<!--<!DOCTYPE html>-->
<!--<html lang="zh">-->
<!--<head>-->
<!-- <meta charset="UTF-8"/>-->
<!-- <title>SenseVoice + VAD 实时识别</title>-->
<!-- <style>-->
<!-- body {-->
<!-- font-family: Arial, sans-serif;-->
<!-- padding: 40px;-->
<!-- }-->
<!-- button {-->
<!-- padding: 12px 24px;-->
<!-- font-size: 16px;-->
<!-- margin: 10px 5px;-->
<!-- }-->
<!-- #result {-->
<!-- margin: 20px 0;-->
<!-- padding: 15px;-->
<!-- background: #f0f0f0;-->
<!-- border-radius: 6px;-->
<!-- }-->
<!-- </style>-->
<!--</head>-->
<!--<body>-->
<!--<h1>🎙️ SenseVoiceSmall 实时语音识别</h1>-->
<!--<button id="startBtn">开始监听</button>-->
<!--<button id="stopBtn" disabled>停止监听</button>-->
<!--<div id="status">状态:初始化中...</div>-->
<!--<div id="result">识别结果将显示在这里...</div>-->
<!--&lt;!&ndash; ONNX Runtime Web &ndash;&gt;-->
<!--<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script>-->
<!--&lt;!&ndash; vad-web &ndash;&gt;-->
<!--<script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.26/dist/bundle.min.js"></script>-->
<!--<script>-->
<!-- let myvad = null;-->
<!-- let websocket = null;-->
<!-- const statusDiv = document.getElementById('status');-->
<!-- const resultDiv = document.getElementById('result');-->
<!-- const startBtn = document.getElementById('startBtn');-->
<!-- const stopBtn = document.getElementById('stopBtn');-->
<!-- let audioBuffer = [];-->
<!-- const BUFFER_THRESHOLD = 1024 * 5; // 缓冲数据达到 5KB 后发送-->
<!-- // 初始化 VAD-->
<!-- async function initVAD() {-->
<!-- try {-->
<!-- myvad = await vad.MicVAD.new({-->
<!-- onSpeechStart: () => {-->
<!-- statusDiv.innerText = "🟢 检测到语音,正在发送...";-->
<!-- },-->
<!-- onSpeechEnd: (audio) => {-->
<!-- if (websocket && websocket.readyState === WebSocket.OPEN) {-->
<!-- const pcm16 = floatTo16BitPCM(audio);-->
<!-- websocket.send(pcm16);-->
<!-- statusDiv.innerText = `📤 发送语音段 (${audio.length} 个采样点)`;-->
<!-- }-->
<!-- },-->
<!-- // onFrameProcessed: (probs, frame) => {-->
<!-- // // 可以调整阈值,比如 > 0.3 就认为可能是语音-->
<!-- // const isLikelySpeech = probs.isSpeech > 0.3;-->
<!-- // audioBuffer.push(frame); // 缓存每一段音频数据-->
<!-- //-->
<!-- // if (isLikelySpeech && websocket?.readyState === WebSocket.OPEN) {-->
<!-- // const pcm16 = floatTo16BitPCM(frame);-->
<!-- // websocket.send(pcm16);-->
<!-- // }-->
<!-- // }-->
<!-- });-->
<!-- statusDiv.innerText = "✅ VAD 初始化完成,点击【开始监听】";-->
<!-- } catch (err) {-->
<!-- statusDiv.innerText = `❌ VAD 加载失败: ${err.message}`;-->
<!-- }-->
<!-- }-->
<!-- // 连接 WebSocket-->
<!-- function connectWebSocket() {-->
<!-- websocket = new WebSocket('ws://localhost:8000/ws/asr');-->
<!-- websocket.onopen = () => {-->
<!-- statusDiv.innerText = "🟢 WebSocket 已连接";-->
<!-- };-->
<!-- websocket.onmessage = (e) => {-->
<!-- try {-->
<!-- const data = JSON.parse(e.data);-->
<!-- resultDiv.innerText = data.text || "";-->
<!-- } catch {-->
<!-- resultDiv.innerText = e.data;-->
<!-- }-->
<!-- };-->
<!-- websocket.onerror = (e) => {-->
<!-- statusDiv.innerText = "❌ WebSocket 错误";-->
<!-- };-->
<!-- websocket.onclose = () => {-->
<!-- statusDiv.innerText = "🔌 已断开连接,正在重连...";-->
<!-- setTimeout(connectWebSocket, 3000); // 重连-->
<!-- };-->
<!-- }-->
<!-- // Float32 → 16-bit PCM-->
<!-- function floatTo16BitPCM(float32Array) {-->
<!-- const buffer = new ArrayBuffer(float32Array.length * 2);-->
<!-- const view = new DataView(buffer);-->
<!-- for (let i = 0; i < float32Array.length; i++) {-->
<!-- const s = Math.max(-1, Math.min(1, float32Array[i]));-->
<!-- const val = s < 0 ? s * 0x8000 : s * 0x7FFF;-->
<!-- view.setInt16(i * 2, val, true);-->
<!-- }-->
<!-- return buffer;-->
<!-- }-->
<!-- // 控制按钮-->
<!-- startBtn.addEventListener('click', () => {-->
<!-- if (myvad) myvad.start();-->
<!-- startBtn.disabled = true;-->
<!-- stopBtn.disabled = false;-->
<!-- });-->
<!-- stopBtn.addEventListener('click', () => {-->
<!-- if (myvad) myvad.stop();-->
<!-- startBtn.disabled = false;-->
<!-- stopBtn.disabled = true;-->
<!-- statusDiv.innerText = "⏸️ 已停止监听";-->
<!-- });-->
<!-- // 启动-->
<!-- window.onload = () => {-->
<!-- initVAD();-->
<!-- connectWebSocket();-->
<!-- };-->
<!--</script>-->
<!--</body>-->
<!--</html>-->

34
2.py Normal file
View File

@@ -0,0 +1,34 @@
from modelscope import AutoModelForCausalLM, AutoTokenizer


class QwenChatbot:
    """Minimal multi-turn chat wrapper around a Qwen3 causal LM.

    The bot is instructed (via the system prompt) to translate every user
    input into Chinese, and keeps the running conversation in ``self.history``.
    """

    # Fixed system instruction: translate the input into Chinese.
    SYSTEM_PROMPT = {"role": "system", "content": "你是一个翻译模型,请将输入翻译成中文。"}

    def __init__(self, model_name="Qwen/Qwen3-0.6B"):
        # Tokenizer and weights are fetched from ModelScope on first use.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.history = []  # list of {"role": ..., "content": ...} turns

    def generate_response(self, user_input):
        """Translate ``user_input`` and return the model's reply.

        Side effect: appends the user/assistant turns to ``self.history`` so
        subsequent calls see the whole conversation.
        """
        # BUG FIX: the system message must come FIRST. Previously it was
        # inserted after the accumulated history on every call, which violates
        # the chat-template convention and weakens the instruction over turns.
        messages = (
            [self.SYSTEM_PROMPT]
            + self.history
            + [{"role": "user", "content": user_input}]
        )
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,  # disable Qwen3 "thinking" mode
        )
        inputs = self.tokenizer(text, return_tensors="pt")
        # Strip the prompt tokens so only the newly generated ids are decoded.
        response_ids = self.model.generate(**inputs, max_new_tokens=32768)[0][
            len(inputs.input_ids[0]):
        ].tolist()
        response = self.tokenizer.decode(response_ids, skip_special_tokens=True)
        # Record both sides of the exchange for the next turn.
        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": response})
        return response


# Example usage
if __name__ == "__main__":
    chatbot = QwenChatbot()
    print(chatbot.generate_response("Hello"))

1136
3.html Normal file

File diff suppressed because it is too large Load Diff

9
3.py Normal file
View File

@@ -0,0 +1,9 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build a speech-recognition pipeline backed by Whisper-large-v3.
asr_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='iic/Whisper-large-v3',
    model_revision="v2.0.5",
)

# Transcribe a hosted sample clip; language=None lets the model auto-detect.
result = asr_pipeline(
    input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',
    language=None,
)
print(result)

113
asr_api.py Normal file
View File

@@ -0,0 +1,113 @@
# server.py
import numpy as np
import torch
from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
# Initialise the FastAPI application.
app = FastAPI(title="ASR API", description="语音识别API", version="1.0.0")

# Allow cross-origin requests from any host (the demo pages may be opened
# from file:// or a different port).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

model_dir = "iic/SenseVoiceSmall"

# Prefer a CUDA device when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load SenseVoiceSmall once at startup, with FSMN VAD segmentation.
model = AutoModel(
    model=model_dir,
    trust_remote_code=True,
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},  # max single segment: 30 s
    device=device,  # chosen above
    language="auto",  # auto-detect language
    use_itn=True,  # enable inverse text normalisation
)
def render_template(template_name: str, context: dict = None):
    """Tiny stand-in for a template engine (avoids a Jinja2 dependency).

    Only knows about "3.html", which is returned verbatim from disk; any
    other name yields a placeholder response. ``context`` is accepted for
    API compatibility but is not interpolated into the page.
    """
    context = context or {}
    if template_name != "3.html":
        return HTMLResponse(content="Template not found")
    # Serve the static page exactly as stored on disk.
    with open("3.html", "r", encoding="utf-8") as f:
        return HTMLResponse(content=f.read())
def bytes_to_float_array(audio_bytes: bytes) -> np.ndarray:
    """Decode little-endian 16-bit PCM bytes into float32 samples in [-1, 1]."""
    samples = np.frombuffer(audio_bytes, dtype=np.int16)
    # Scale by 2**15 so the int16 range maps onto [-1.0, ~1.0).
    return samples.astype(np.float32) / 32768.0
def convert_audio_data(audio_data: bytes) -> np.ndarray:
    """Best-effort conversion of raw bytes to float32 PCM samples.

    Returns an empty float32 array when the payload is empty, has an odd
    byte length (not valid 16-bit PCM), or fails to decode.
    """
    try:
        # 16-bit samples occupy two bytes each; reject odd/empty payloads.
        if audio_data and len(audio_data) % 2 == 0:
            return bytes_to_float_array(audio_data).astype(np.float32)
    except Exception as e:
        print(f"⚠️ 16-bit PCM处理失败: {e}")
    return np.array([], dtype=np.float32)
@app.websocket("/ws/asr")
async def websocket_asr(websocket: WebSocket):
    """Streaming ASR endpoint: receive 16-bit PCM chunks, reply with text.

    Each binary message is decoded, transcribed with SenseVoice, and the
    post-processed text is sent back on the same socket.
    """
    # Accept unconditionally (also sidesteps 403s from origin checks).
    await websocket.accept()
    try:
        while True:
            # Receive one audio payload from the client.
            audio_data = await websocket.receive_bytes()
            audio_array = convert_audio_data(audio_data)
            if audio_array.dtype != np.float32:
                audio_array = audio_array.astype(np.float32)
            # Skip empty/undecodable payloads instead of feeding the model a
            # zero-length array (which would raise inside generate()).
            if audio_array.size == 0:
                continue
            # Run SenseVoice inference on the decoded samples.
            result = model.generate(
                audio_array,
                sampling_rate=16000,
                language="auto",  # auto-detect language
                use_itn=True,  # inverse text normalisation
                batch_size_s=60,
            )
            text = rich_transcription_postprocess(result[0]["text"])
            print(f"🔍 Raw result type: {result}, content: {text}")
            await websocket.send_text(text)
    except WebSocketDisconnect:
        # Client went away: leave the receive loop quietly instead of
        # surfacing a stack trace for every disconnect.
        pass
@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Home page: serves the Web UI (static 3.html)."""
    context = {"request": request}
    return render_template("3.html", context)
if __name__ == '__main__':
    # Run a development server on all interfaces, port 8000.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")

29512
js-libraries/ort.js Normal file

File diff suppressed because one or more lines are too long

1
js-libraries/vad.bundle.min.js vendored Normal file

File diff suppressed because one or more lines are too long

9
requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
fastapi>=0.116.0,<0.116.1
uvicorn~=0.15.0
torch>=1.10.0
torchaudio>=0.10.0
python-multipart>=0.0.5
modelscope~=1.29.2
numpy~=2.3.2
soundfile~=0.13.1
funasr~=1.2.7

488
speaker.html Normal file
View File

@@ -0,0 +1,488 @@
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>扬声器识别系统</title>
<style>
body {
font-family: Arial, sans-serif;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 100vh;
margin: 0;
color: white;
}
.container {
max-width: 1000px;
margin: 0 auto;
background: rgba(255, 255, 255, 0.1);
padding: 20px;
border-radius: 15px;
backdrop-filter: blur(10px);
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.2);
}
h1 {
text-align: center;
margin-bottom: 20px;
font-size: 2.5em;
text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3);
}
.controls {
display: flex;
justify-content: center;
gap: 20px;
margin-bottom: 20px;
flex-wrap: wrap;
}
button {
padding: 15px 30px;
font-size: 18px;
border: none;
border-radius: 50px;
cursor: pointer;
transition: all 0.3s ease;
font-weight: bold;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
}
#speakerBtn {
background: linear-gradient(45deg, #9C27B0, #E91E63);
color: white;
}
#stopSpeakerBtn {
background: linear-gradient(45deg, #f44336, #e91e63);
color: white;
}
#translateBtn {
background: linear-gradient(45deg, #FF9800, #FF5722);
color: white;
}
button:hover:not(:disabled) {
transform: translateY(-3px);
box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3);
}
button:disabled {
opacity: 0.6;
cursor: not-allowed;
}
.status-container {
text-align: center;
margin: 20px 0;
padding: 15px;
background: rgba(255, 255, 255, 0.15);
border-radius: 10px;
font-size: 16px;
}
#speakerStatus {
font-weight: bold;
margin-bottom: 10px;
}
.result-container {
margin: 20px 0;
}
.result-box {
min-height: 120px;
padding: 20px;
background: rgba(0, 0, 0, 0.2);
border-radius: 10px;
font-size: 18px;
line-height: 1.6;
white-space: pre-wrap;
overflow-y: auto;
max-height: 200px;
margin-bottom: 15px;
}
.section-title {
font-size: 1.5em;
margin-bottom: 15px;
text-align: center;
color: #FFD700;
}
.recording-indicator {
display: inline-block;
width: 12px;
height: 12px;
background-color: #f44336;
border-radius: 50%;
margin-right: 10px;
animation: blink 1s infinite;
}
@keyframes blink {
0%, 100% {
opacity: 1;
}
50% {
opacity: 0.3;
}
}
</style>
</head>
<body>
<div class="container">
<h1>🎙️ 扬声器识别系统</h1>
<div class="controls">
<button id="speakerBtn">开始录制扬声器</button>
<button id="stopSpeakerBtn" disabled>停止录制</button>
<button id="translateBtn" disabled>翻译结果</button>
</div>
<div class="status-container">
<div id="speakerStatus">状态:等待录制...</div>
</div>
<div class="result-container">
<h2 class="section-title">识别结果</h2>
<div id="speakerResult" class="result-box">识别结果将显示在这里...</div>
<h2 class="section-title">翻译结果</h2>
<div id="translatedResult" class="result-box">翻译结果将显示在这里...</div>
</div>
</div>
<script>
// Recording state shared by all the handlers below.
let speakerAudioContext = null;
let speakerMediaRecorder = null;
let speakerChunks = [];
let websocket = null;
// DOM elements
const speakerBtn = document.getElementById('speakerBtn');
const stopSpeakerBtn = document.getElementById('stopSpeakerBtn');
const translateBtn = document.getElementById('translateBtn');
const speakerStatusDiv = document.getElementById('speakerStatus');
const speakerResultDiv = document.getElementById('speakerResult');
const translatedResultDiv = document.getElementById('translatedResult');
// Convert a Float32 sample buffer (values in [-1, 1]) into little-endian
// 16-bit PCM. Out-of-range samples are clamped before scaling.
function floatTo16BitPCM(float32Array) {
    const out = new ArrayBuffer(float32Array.length * 2);
    const view = new DataView(out);
    float32Array.forEach((sample, i) => {
        const clamped = Math.max(-1, Math.min(1, sample));
        // Negative samples scale by 2^15, positive by 2^15 - 1.
        view.setInt16(i * 2, clamped < 0 ? clamped * 0x8000 : clamped * 0x7FFF, true);
    });
    return out;
}
// 连接 WebSocket
function connectWebSocket() {
try {
// 如果已有连接且处于开启状态,则不重新连接
if (websocket && websocket.readyState === WebSocket.OPEN) {
console.log('WebSocket已连接无需重新连接');
speakerStatusDiv.innerText = "🟢 WebSocket 已连接";
return;
}
// 如果连接正在建立中,也不重新连接
if (websocket && websocket.readyState === WebSocket.CONNECTING) {
console.log('WebSocket正在连接中无需重新连接');
return;
}
speakerStatusDiv.innerText = "🔄 正在连接 WebSocket...";
websocket = new WebSocket('ws://localhost:8000/ws/asr');
// 添加连接超时处理
const connectionTimeout = setTimeout(() => {
if (websocket && websocket.readyState === WebSocket.CONNECTING) {
speakerStatusDiv.innerText = "❌ WebSocket 连接超时";
}
}, 10000); // 10秒超时
websocket.onopen = () => {
clearTimeout(connectionTimeout);
speakerStatusDiv.innerText = "🟢 WebSocket 已连接";
};
websocket.onmessage = (e) => {
try {
const data = e.data;
console.log('WebSocket 收到消息:', data);
if (data) {
speakerResultDiv.innerText = data;
translateBtn.disabled = false;
}
} catch (err) {
console.error('处理WebSocket消息时出错:', err);
speakerResultDiv.innerText = e.data;
}
};
websocket.onclose = () => {
clearTimeout(connectionTimeout);
speakerStatusDiv.innerText = "🔌 已断开连接,正在重连...";
setTimeout(connectWebSocket, 3000); // 重连
};
websocket.onerror = (e) => {
clearTimeout(connectionTimeout);
speakerStatusDiv.innerText = "❌ WebSocket 错误";
console.error('WebSocket 错误:', e);
};
} catch (error) {
speakerStatusDiv.innerText = `❌ WebSocket 连接失败: ${error.message}`;
console.error('WebSocket 连接失败:', error);
}
}
// Speaker (system-audio) capture: the button toggles recording on/off.
speakerBtn.addEventListener('click', async () => {
    if (!speakerMediaRecorder || speakerMediaRecorder.state === 'inactive') {
        try {
            speakerStatusDiv.innerHTML = "🔄 正在请求桌面媒体流...";
            // Capture desktop audio via getDisplayMedia.
            // NOTE(review): video IS requested here even though only audio is
            // wanted — most browsers refuse audio-only getDisplayMedia; the
            // video tracks are stopped immediately after the stream arrives.
            // (The original comment claimed "audio only", which was wrong.)
            const constraints = {
                video: true,
                audio: {
                    echoCancellation: false,
                    noiseSuppression: false,
                    autoGainControl: false
                }
            };
            speakerStatusDiv.innerHTML = "🔄 正在请求音频流...";
            const stream = await navigator.mediaDevices.getDisplayMedia(constraints);
            // Stop all video tracks; only the audio is needed.
            const videoTracks = stream.getVideoTracks();
            videoTracks.forEach(track => track.stop());
            speakerStatusDiv.innerHTML = "🔄 正在设置音频处理...";
            // Make sure the stream actually contains an audio track.
            const audioTracks = stream.getAudioTracks();
            console.log('音频轨道数量:', audioTracks.length);
            console.log('音频轨道详情:', audioTracks);
            if (audioTracks.length === 0) {
                speakerStatusDiv.innerText = "⚠️ 音频流不包含音频轨道";
                stream.getTracks().forEach(track => track.stop());
                return;
            }
            // Reset UI/state when the user ends the share from browser chrome.
            audioTracks.forEach(track => {
                console.log('音频轨道设置:', track);
                track.addEventListener('ended', () => {
                    console.log('音频轨道结束');
                    if (speakerAudioContext) {
                        speakerAudioContext.close();
                        speakerAudioContext = null;
                    }
                    speakerBtn.textContent = '开始录制扬声器';
                    speakerBtn.style.background = 'linear-gradient(45deg, #9C27B0, #E91E63)';
                    speakerStatusDiv.innerText = "⏹️ 录制已停止";
                    stopSpeakerBtn.disabled = true;
                    speakerChunks = [];
                });
            });
            // Close any previous audio context before creating a new one.
            if (speakerAudioContext) {
                speakerAudioContext.close();
                speakerAudioContext = null;
            }
            // New audio context at 16 kHz to match the ASR backend's expected
            // rate. (The original comment said 48000 Hz, contradicting the code.)
            speakerAudioContext = new (window.AudioContext || window.webkitAudioContext)({
                sampleRate: 16000
            });
            console.log('创建音频上下文:', speakerAudioContext);
            const sourceNode = speakerAudioContext.createMediaStreamSource(stream);
            console.log('创建媒体流源:', sourceNode);
            // ScriptProcessor collects raw PCM frames (deprecated API, but
            // widely supported).
            const processor = speakerAudioContext.createScriptProcessor(4096, 1, 1);
            console.log('创建处理器:', processor);
            sourceNode.connect(processor);
            processor.connect(speakerAudioContext.destination);
            speakerChunks = [];
            processor.onaudioprocess = (e) => {
                try {
                    const inputData = e.inputBuffer.getChannelData(0);
                    // Copy the frame: the underlying buffer is reused by the browser.
                    speakerChunks.push(new Float32Array(inputData));
                    // Periodically surface buffer growth in the status line.
                    if (speakerChunks.length % 10 === 0) {
                        speakerStatusDiv.innerHTML = `<span class='recording-indicator'></span>🎤 正在录制扬声器... (缓冲区: ${speakerChunks.length})`;
                    }
                } catch (err) {
                    console.error('处理音频数据时出错:', err);
                }
            };
            speakerBtn.textContent = '停止录制';
            speakerBtn.style.background = 'linear-gradient(45deg, #f44336, #e91e63)';
            stopSpeakerBtn.disabled = false;
            speakerStatusDiv.innerHTML = "<span class='recording-indicator'></span>🎤 正在录制扬声器...";
            // Keep a global reference so the stream can be stopped later.
            window.speakerStream = stream;
        } catch (err) {
            console.error('获取扬声器完整错误:', err);
            // Map the common getDisplayMedia failure modes to user guidance.
            if (err.name === 'NotSupportedError') {
                speakerStatusDiv.innerHTML = "❌ 系统不支持扬声器录制功能<br/>" +
                    "请尝试以下解决方案:<br/>" +
                    "1. 确保在屏幕共享时选择包含音频的选项<br/>" +
                    "2. 检查系统音频设置和权限<br/>" +
                    "3. 在 Windows 系统中,确保已启用立体声混音设备";
            } else if (err.name === 'NotAllowedError' || err.name === 'PermissionDeniedError') {
                speakerStatusDiv.innerText = "❌ 用户拒绝了屏幕共享或音频访问权限";
            } else if (err.name === 'NotFoundError' || err.name === 'OverconstrainedError') {
                speakerStatusDiv.innerText = "❌ 未找到可用的音频输入设备";
            } else if (err.message && err.message.includes('Error starting capture')) {
                speakerStatusDiv.innerHTML = "❌ 启动捕获失败<br/>" +
                    "请尝试以下解决方案:<br/>" +
                    "1. 重新点击按钮再次尝试<br/>" +
                    "2. 重启应用程序<br/>" +
                    "3. 检查系统音频驱动程序<br/>" +
                    "4. 确保没有其他应用程序正在使用音频设备";
            } else {
                speakerStatusDiv.innerHTML = `❌ 获取扬声器失败: ${err.message || err.name}<br/>` +
                    "请尝试以下解决方案:<br/>" +
                    "1. 确保使用最新版本的浏览器<br/>" +
                    "2. 检查应用权限设置<br/>" +
                    "3. 重启应用程序";
            }
        }
    } else {
        // Stop recording: tear down the audio context and all media tracks.
        if (speakerAudioContext) {
            speakerAudioContext.close();
            speakerAudioContext = null;
        }
        if (window.speakerStream) {
            window.speakerStream.getTracks().forEach(track => {
                track.stop();
            });
            window.speakerStream = null;
        }
        speakerBtn.textContent = '开始录制扬声器';
        speakerBtn.style.background = 'linear-gradient(45deg, #9C27B0, #E91E63)';
        speakerStatusDiv.innerText = "⏹️ 录制已停止";
        stopSpeakerBtn.disabled = true;
        speakerChunks = [];
    }
});
// "Stop recording" button: ship the captured audio, then tear everything down.
stopSpeakerBtn.addEventListener('click', () => {
    // Send the buffered audio if anything was captured and the socket is open.
    if (speakerChunks && speakerChunks.length > 0 && websocket && websocket.readyState === WebSocket.OPEN) {
        // Merge all captured frames into one contiguous buffer.
        const totalLength = speakerChunks.reduce((acc, chunk) => acc + chunk.length, 0);
        const fullAudio = new Float32Array(totalLength);
        let offset = 0;
        for (const chunk of speakerChunks) {
            fullAudio.set(chunk, offset);
            offset += chunk.length;
        }
        // Convert to 16-bit PCM and send as one binary message.
        const pcm16 = floatTo16BitPCM(fullAudio);
        websocket.send(pcm16);
        speakerStatusDiv.innerText = `📤 发送扬声器数据 (${fullAudio.length} 个采样点)`;
    } else if (!speakerChunks || speakerChunks.length === 0) {
        speakerStatusDiv.innerText = "⚠️ 没有录制到音频数据";
    } else {
        speakerStatusDiv.innerText = "⚠️ 没有录制数据可发送或WebSocket未连接";
    }
    // Actually stop capturing: close the audio context and media tracks.
    if (speakerAudioContext) {
        speakerAudioContext.close();
        speakerAudioContext = null;
    }
    if (window.speakerStream) {
        window.speakerStream.getTracks().forEach(track => {
            track.stop();
        });
        window.speakerStream = null;
    }
    // Reset the buttons and status UI.
    speakerBtn.textContent = '开始录制扬声器';
    speakerBtn.style.background = 'linear-gradient(45deg, #9C27B0, #E91E63)';
    speakerStatusDiv.innerText = "⏹️ 录制已停止";
    stopSpeakerBtn.disabled = true;
    speakerChunks = [];
});
// "Translate" button: POST the recognised text to the backend translator.
translateBtn.addEventListener('click', async () => {
    // Require a real recognition result first (not the placeholder text).
    if (!speakerResultDiv.innerText || speakerResultDiv.innerText === "识别结果将显示在这里...") {
        alert("请先进行语音识别");
        return;
    }
    try {
        speakerStatusDiv.innerText = "🔄 正在翻译...";
        translateBtn.disabled = true;
        // Send the translation request to the backend.
        // NOTE(review): no /translate route is visible in asr_api.py —
        // confirm the backend actually exposes this endpoint.
        const response = await fetch('http://localhost:8000/translate', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({
                text: speakerResultDiv.innerText
            })
        });
        if (response.ok) {
            const result = await response.json();
            translatedResultDiv.innerText = result.translated_text || "翻译失败";
            speakerStatusDiv.innerText = "✅ 翻译完成";
        } else {
            throw new Error(`翻译服务返回错误: ${response.status}`);
        }
    } catch (error) {
        console.error('翻译失败:', error);
        speakerStatusDiv.innerText = `❌ 翻译失败: ${error.message}`;
        translatedResultDiv.innerText = "翻译失败,请查看控制台了解详情";
    } finally {
        // Always re-enable the button, success or failure.
        translateBtn.disabled = false;
    }
});
// Bootstrap: open the WebSocket as soon as the page loads.
window.onload = () => {
    connectWebSocket();
};
</script>
</body>
</html>