Files
PythonProject/js-libraries/vad.bundle.min.js
lingxiao865 f5c4158fc1 first commit
2025-09-18 17:50:03 +08:00

1 line
19 KiB
JavaScript

!function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t(require("onnxruntime-web")):"function"==typeof define&&define.amd?define(["onnxruntime-web"],t):"object"==typeof exports?exports.vad=t(require("onnxruntime-web")):e.vad=t(e.ort)}(self,(e=>(()=>{"use strict";var t={485:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.baseAssetPath=void 0;const s="undefined"!=typeof window&&void 0!==window.document?window.document.currentScript:null;let o="/";s&&(o=s.src.replace(/#.*$/,"").replace(/\?.*$/,"").replace(/\/[^\/]+$/,"/")),t.baseAssetPath=o},973:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.defaultModelFetcher=void 0,t.defaultModelFetcher=e=>fetch(e).then((e=>e.arrayBuffer()))},362:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.FrameProcessor=t.validateOptions=t.defaultV5FrameProcessorOptions=t.defaultLegacyFrameProcessorOptions=void 0;const o=s(710),r=s(954),i=[512,1024,1536];t.defaultLegacyFrameProcessorOptions={positiveSpeechThreshold:.5,negativeSpeechThreshold:.35,preSpeechPadFrames:1,redemptionFrames:8,frameSamples:1536,minSpeechFrames:3,submitUserSpeechOnPause:!1},t.defaultV5FrameProcessorOptions={positiveSpeechThreshold:.5,negativeSpeechThreshold:.35,preSpeechPadFrames:3,redemptionFrames:24,frameSamples:512,minSpeechFrames:9,submitUserSpeechOnPause:!1},t.validateOptions=function(e){i.includes(e.frameSamples)||o.log.warn("You are using an unusual frame size"),(e.positiveSpeechThreshold<0||e.positiveSpeechThreshold>1)&&o.log.error("positiveSpeechThreshold should be a number between 0 and 1"),(e.negativeSpeechThreshold<0||e.negativeSpeechThreshold>e.positiveSpeechThreshold)&&o.log.error("negativeSpeechThreshold should be between 0 and positiveSpeechThreshold"),e.preSpeechPadFrames<0&&o.log.error("preSpeechPadFrames should be positive"),e.redemptionFrames<0&&o.log.error("redemptionFrames should be positive")};const n=e=>{const t=e.reduce(((e,t)=>(e.push(e.at(-1)+t.length),e)),[0]),s=new Float32Array(t.at(-1));return e.forEach(((e,o)=>{const r=t[o];s.set(e,r)})),s};t.FrameProcessor=class{constructor(e,t,s){this.modelProcessFunc=e,this.modelResetFunc=t,this.options=s,this.speaking=!1,this.redemptionCounter=0,this.active=!1,this.reset=()=>{this.speaking=!1,this.audioBuffer=[],this.modelResetFunc(),this.redemptionCounter=0},this.pause=()=>(this.active=!1,this.options.submitUserSpeechOnPause?this.endSegment():(this.reset(),{})),this.resume=()=>{this.active=!0},this.endSegment=()=>{const e=this.audioBuffer;this.audioBuffer=[];const t=this.speaking;this.reset();const s=e.reduce(((e,t)=>e+ +t.isSpeech),0);if(t){if(s>=this.options.minSpeechFrames){const t=n(e.map((e=>e.frame)));return{msg:r.Message.SpeechEnd,audio:t}}return{msg:r.Message.VADMisfire}}return{}},this.process=async e=>{if(!this.active)return{};const t=await this.modelProcessFunc(e);if(this.audioBuffer.push({frame:e,isSpeech:t.isSpeech>=this.options.positiveSpeechThreshold}),t.isSpeech>=this.options.positiveSpeechThreshold&&this.redemptionCounter&&(this.redemptionCounter=0),t.isSpeech>=this.options.positiveSpeechThreshold&&!this.speaking)return this.speaking=!0,{probs:t,msg:r.Message.SpeechStart,frame:e};if(t.isSpeech<this.options.negativeSpeechThreshold&&this.speaking&&++this.redemptionCounter>=this.options.redemptionFrames){this.redemptionCounter=0,this.speaking=!1;const s=this.audioBuffer;if(this.audioBuffer=[],s.reduce(((e,t)=>e+ +t.isSpeech),0)>=this.options.minSpeechFrames){const o=n(s.map((e=>e.frame)));return{probs:t,msg:r.Message.SpeechEnd,audio:o,frame:e}}return{probs:t,msg:r.Message.VADMisfire,frame:e}}if(!this.speaking)for(;this.audioBuffer.length>this.options.preSpeechPadFrames;)this.audioBuffer.shift();return{probs:t,frame:e}},this.audioBuffer=[],this.reset()}}},590:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 0===o&&(o=s),e[o]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),i=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&o(t,e,s);return r(t,e),t};Object.defineProperty(t,"__esModule",{value:!0}),t.NonRealTimeVAD=t.Message=t.FrameProcessor=t.getDefaultRealTimeVADOptions=t.MicVAD=t.DEFAULT_MODEL=t.AudioNodeVAD=t.utils=t.defaultNonRealTimeVADOptions=void 0;const n=i(s(656)),a=s(485),c=s(973),h=s(362);Object.defineProperty(t,"FrameProcessor",{enumerable:!0,get:function(){return h.FrameProcessor}});const d=s(954);Object.defineProperty(t,"Message",{enumerable:!0,get:function(){return d.Message}});const u=s(202),l=s(787);t.defaultNonRealTimeVADOptions={modelURL:a.baseAssetPath+"silero_vad_legacy.onnx",modelFetcher:c.defaultModelFetcher};class p extends u.PlatformAgnosticNonRealTimeVAD{static async new(e={}){const{modelURL:s,modelFetcher:o}={...t.defaultNonRealTimeVADOptions,...e};return await this._new((()=>o(s)),n,e)}}t.NonRealTimeVAD=p,t.utils={audioFileToArray:l.audioFileToArray,minFramesForTargetMS:l.minFramesForTargetMS,arrayBufferToBase64:l.arrayBufferToBase64,encodeWAV:l.encodeWAV};var f=s(746);Object.defineProperty(t,"AudioNodeVAD",{enumerable:!0,get:function(){return f.AudioNodeVAD}}),Object.defineProperty(t,"DEFAULT_MODEL",{enumerable:!0,get:function(){return f.DEFAULT_MODEL}}),Object.defineProperty(t,"MicVAD",{enumerable:!0,get:function(){return f.MicVAD}}),Object.defineProperty(t,"getDefaultRealTimeVADOptions",{enumerable:!0,get:function(){return f.getDefaultRealTimeVADOptions}})},710:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.log=t.LOG_PREFIX=void 0,t.LOG_PREFIX="[VAD]";const s=["error","debug","warn"].reduce(((e,s)=>(e[s]=function(e){return(...s)=>{console[e](t.LOG_PREFIX,...s)}}(s),e)),{});t.log=s},954:(e,t)=>{var s;Object.defineProperty(t,"__esModule",{value:!0}),t.Message=void 0,function(e){e.AudioFrame="AUDIO_FRAME",e.SpeechStart="SPEECH_START",e.VADMisfire="VAD_MISFIRE",e.SpeechEnd="SPEECH_END",e.SpeechStop="SPEECH_STOP"}(s||(t.Message=s={}))},650:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0})},559:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 0===o&&(o=s),e[o]=t[s]}),r=this&&this.__exportStar||function(e,t){for(var s in e)"default"===s||Object.prototype.hasOwnProperty.call(t,s)||o(t,e,s)};Object.defineProperty(t,"__esModule",{value:!0}),t.SileroV5=t.SileroLegacy=void 0,r(s(650),t);var i=s(143);Object.defineProperty(t,"SileroLegacy",{enumerable:!0,get:function(){return i.SileroLegacy}});var n=s(508);Object.defineProperty(t,"SileroV5",{enumerable:!0,get:function(){return n.SileroV5}})},143:(e,t,s)=>{var o;Object.defineProperty(t,"__esModule",{value:!0}),t.SileroLegacy=void 0;const r=s(710);class i{constructor(e,t,s,o,r){this.ortInstance=e,this._session=t,this._h=s,this._c=o,this._sr=r,this.reset_state=()=>{const e=Array(128).fill(0);this._h=new this.ortInstance.Tensor("float32",e,[2,1,64]),this._c=new this.ortInstance.Tensor("float32",e,[2,1,64])},this.process=async e=>{const t={input:new this.ortInstance.Tensor("float32",e,[1,e.length]),h:this._h,c:this._c,sr:this._sr},s=await this._session.run(t);this._h=s.hn,this._c=s.cn;const[o]=s.output?.data;return{notSpeech:1-o,isSpeech:o}}}}t.SileroLegacy=i,o=i,i.new=async(e,t)=>{r.log.debug("initializing vad");const s=await t(),i=await e.InferenceSession.create(s),n=new e.Tensor("int64",[16000n]),a=Array(128).fill(0),c=new e.Tensor("float32",a,[2,1,64]),h=new e.Tensor("float32",a,[2,1,64]);return r.log.debug("vad is initialized"),new o(e,i,c,h,n)}},508:(e,t,s)=>{var o;Object.defineProperty(t,"__esModule",{value:!0}),t.SileroV5=void 0;const r=s(710);function i(e){const t=Array(256).fill(0);return new e.Tensor("float32",t,[2,1,128])}class n{constructor(e,t,s,o){this._session=e,this._state=t,this._sr=s,this.ortInstance=o,this.reset_state=()=>{this._state=i(this.ortInstance)},this.process=async e=>{const t={input:new this.ortInstance.Tensor("float32",e,[1,e.length]),state:this._state,sr:this._sr},s=await this._session.run(t);this._state=s.stateN;const[o]=s.output?.data;return{notSpeech:1-o,isSpeech:o}}}}t.SileroV5=n,o=n,n.new=async(e,t)=>{r.log.debug("Loading VAD...");const s=await t(),n=await e.InferenceSession.create(s),a=new e.Tensor("int64",[16000n]),c=i(e);return r.log.debug("...finished loading VAD"),new o(n,c,a,e)}},202:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.PlatformAgnosticNonRealTimeVAD=t.defaultNonRealTimeVADOptions=void 0;const o=s(362),r=s(954),i=s(559),n=s(825);t.defaultNonRealTimeVADOptions={...o.defaultLegacyFrameProcessorOptions,ortConfig:void 0},t.PlatformAgnosticNonRealTimeVAD=class{static async _new(e,s,o={}){const r={...t.defaultNonRealTimeVADOptions,...o};void 0!==r.ortConfig&&r.ortConfig(s);const i=new this(e,s,r);return await i.init(),i}constructor(e,t,s){this.modelFetcher=e,this.ort=t,this.options=s,this.init=async()=>{const e=await i.SileroLegacy.new(this.ort,this.modelFetcher);this.frameProcessor=new o.FrameProcessor(e.process,e.reset_state,{frameSamples:this.options.frameSamples,positiveSpeechThreshold:this.options.positiveSpeechThreshold,negativeSpeechThreshold:this.options.negativeSpeechThreshold,redemptionFrames:this.options.redemptionFrames,preSpeechPadFrames:this.options.preSpeechPadFrames,minSpeechFrames:this.options.minSpeechFrames,submitUserSpeechOnPause:this.options.submitUserSpeechOnPause}),this.frameProcessor.resume()},this.run=async function*(e,t){const s={nativeSampleRate:t,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples},o=new n.Resampler(s);let i=0,a=0,c=0;for await(const t of o.stream(e)){const{msg:e,audio:s}=await this.frameProcessor.process(t);switch(e){case r.Message.SpeechStart:i=c*this.options.frameSamples/16;break;case r.Message.SpeechEnd:a=(c+1)*this.options.frameSamples/16,yield{audio:s,start:i,end:a}}c++}const{msg:h,audio:d}=this.frameProcessor.endSegment();h==r.Message.SpeechEnd&&(yield{audio:d,start:i,end:c*this.options.frameSamples/16})},(0,o.validateOptions)(s)}}},746:function(e,t,s){var o=this&&this.__createBinding||(Object.create?function(e,t,s,o){void 0===o&&(o=s);var r=Object.getOwnPropertyDescriptor(t,s);r&&!("get"in r?!t.__esModule:r.writable||r.configurable)||(r={enumerable:!0,get:function(){return t[s]}}),Object.defineProperty(e,o,r)}:function(e,t,s,o){void 0===o&&(o=s),e[o]=t[s]}),r=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),i=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var s in e)"default"!==s&&Object.prototype.hasOwnProperty.call(e,s)&&o(t,e,s);return r(t,e),t};Object.defineProperty(t,"__esModule",{value:!0}),t.AudioNodeVAD=t.MicVAD=t.getDefaultRealTimeVADOptions=t.ort=t.DEFAULT_MODEL=void 0;const n=i(s(656)),a=s(973),c=s(362),h=s(710),d=s(954),u=s(559),l=s(825);t.DEFAULT_MODEL="legacy",t.ort=n,t.getDefaultRealTimeVADOptions=e=>({..."v5"===e?c.defaultV5FrameProcessorOptions:c.defaultLegacyFrameProcessorOptions,onFrameProcessed:e=>{},onVADMisfire:()=>{h.log.debug("VAD misfire")},onSpeechStart:()=>{h.log.debug("Detected speech start")},onSpeechEnd:()=>{h.log.debug("Detected speech end")},baseAssetPath:"https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/",onnxWASMBasePath:"https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/",stream:void 0,ortConfig:void 0,model:t.DEFAULT_MODEL,workletOptions:{}});class p{static async new(e={}){const s={...(0,t.getDefaultRealTimeVADOptions)(e.model??t.DEFAULT_MODEL),...e};let o;(0,c.validateOptions)(s),o=void 0===s.stream?await navigator.mediaDevices.getUserMedia({audio:{...s.additionalAudioConstraints,channelCount:1,echoCancellation:!0,autoGainControl:!0,noiseSuppression:!0}}):s.stream;const r=new AudioContext,i=new MediaStreamAudioSourceNode(r,{mediaStream:o}),n=await f.new(r,s);return n.receive(i),new p(s,r,o,n,i)}constructor(e,t,s,o,r,i=!1){this.options=e,this.audioContext=t,this.stream=s,this.audioNodeVAD=o,this.sourceNode=r,this.listening=i,this.pause=()=>{this.audioNodeVAD.pause(),this.listening=!1},this.start=()=>{this.audioNodeVAD.start(),this.listening=!0},this.destroy=()=>{this.listening&&this.pause(),void 0===this.options.stream&&this.stream.getTracks().forEach((e=>e.stop())),this.sourceNode.disconnect(),this.audioNodeVAD.destroy(),this.audioContext.close()}}}t.MicVAD=p;class f{static async new(e,s={}){const o={...(0,t.getDefaultRealTimeVADOptions)(s.model??t.DEFAULT_MODEL),...s};(0,c.validateOptions)(o),t.ort.env.wasm.wasmPaths=o.onnxWASMBasePath,void 0!==o.ortConfig&&o.ortConfig(t.ort);const r="v5"===o.model?"silero_vad_v5.onnx":"silero_vad_legacy.onnx",i=o.baseAssetPath+r,n="v5"===o.model?u.SileroV5.new:u.SileroLegacy.new;let h;try{h=await n(t.ort,(()=>(0,a.defaultModelFetcher)(i)))}catch(e){throw console.error(`Encountered an error while loading model file ${i}`),e}const d=new c.FrameProcessor(h.process,h.reset_state,{frameSamples:o.frameSamples,positiveSpeechThreshold:o.positiveSpeechThreshold,negativeSpeechThreshold:o.negativeSpeechThreshold,redemptionFrames:o.redemptionFrames,preSpeechPadFrames:o.preSpeechPadFrames,minSpeechFrames:o.minSpeechFrames,submitUserSpeechOnPause:o.submitUserSpeechOnPause}),l=new f(e,o,d);return await l.setupAudioNode(),l}constructor(e,t,s){this.ctx=e,this.options=t,this.bufferIndex=0,this.pause=()=>{const e=this.frameProcessor.pause();this.handleFrameProcessorEvent(e)},this.start=()=>{this.frameProcessor.resume()},this.receive=e=>{e.connect(this.audioNode)},this.processFrame=async e=>{const t=await this.frameProcessor.process(e);this.handleFrameProcessorEvent(t)},this.handleFrameProcessorEvent=e=>{switch(void 0!==e.probs&&this.options.onFrameProcessed(e.probs,e.frame),e.msg){case d.Message.SpeechStart:this.options.onSpeechStart();break;case d.Message.VADMisfire:this.options.onVADMisfire();break;case d.Message.SpeechEnd:this.options.onSpeechEnd(e.audio)}},this.destroy=()=>{this.audioNode instanceof AudioWorkletNode&&this.audioNode.port.postMessage({message:d.Message.SpeechStop}),this.audioNode.disconnect(),this.gainNode?.disconnect()},this.frameProcessor=s}async setupAudioNode(){if("audioWorklet"in this.ctx&&"function"==typeof AudioWorkletNode)try{const e=this.options.baseAssetPath+"vad.worklet.bundle.min.js";await this.ctx.audioWorklet.addModule(e);const t=this.options.workletOptions??{};return t.processorOptions={...t.processorOptions??{},frameSamples:this.options.frameSamples},this.audioNode=new AudioWorkletNode(this.ctx,"vad-helper-worklet",t),void(this.audioNode.port.onmessage=async e=>{if(e.data?.message===d.Message.AudioFrame){let t=e.data.data;t instanceof ArrayBuffer||(t=new ArrayBuffer(e.data.data.byteLength),new Uint8Array(t).set(new Uint8Array(e.data.data)));const s=new Float32Array(t);await this.processFrame(s)}})}catch(e){console.log("AudioWorklet setup failed, falling back to ScriptProcessor",e)}this.resampler=new l.Resampler({nativeSampleRate:this.ctx.sampleRate,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples??480}),this.audioNode=this.ctx.createScriptProcessor(4096,1,1),this.gainNode=this.ctx.createGain(),this.gainNode.gain.value=0;let e=!1;this.audioNode.onaudioprocess=async t=>{if(!e){e=!0;try{const e=t.inputBuffer.getChannelData(0);if(t.outputBuffer.getChannelData(0).fill(0),this.resampler){const t=this.resampler.process(e);for(const e of t)await this.processFrame(e)}}catch(e){console.error("Error processing audio:",e)}finally{e=!1}}},this.audioNode.connect(this.gainNode),this.gainNode.connect(this.ctx.destination)}}t.AudioNodeVAD=f},825:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Resampler=void 0;const o=s(710);t.Resampler=class{constructor(e){this.options=e,this.process=e=>{const t=[];for(const s of e)for(this.inputBuffer.push(s);this.hasEnoughDataForFrame();){const e=this.generateOutputFrame();t.push(e)}return t},this.stream=async function*(e){for(const t of e)for(this.inputBuffer.push(t);this.hasEnoughDataForFrame();){const e=this.generateOutputFrame();yield e}},e.nativeSampleRate<16e3&&o.log.error("nativeSampleRate is too low. Should have 16000 = targetSampleRate <= nativeSampleRate"),this.inputBuffer=[]}hasEnoughDataForFrame(){return this.inputBuffer.length*this.options.targetSampleRate/this.options.nativeSampleRate>=this.options.targetFrameSize}generateOutputFrame(){const e=new Float32Array(this.options.targetFrameSize);let t=0,s=0;for(;t<this.options.targetFrameSize;){let o=0,r=0;for(;s<Math.min(this.inputBuffer.length,(t+1)*this.options.nativeSampleRate/this.options.targetSampleRate);){const e=this.inputBuffer[s];void 0!==e&&(o+=e,r++),s++}e[t]=o/r,t++}return this.inputBuffer=this.inputBuffer.slice(s),e}}},787:(e,t)=>{function s(e,t,s){for(var o=0;o<s.length;o++)e.setUint8(t+o,s.charCodeAt(o))}Object.defineProperty(t,"__esModule",{value:!0}),t.audioFileToArray=t.encodeWAV=t.arrayBufferToBase64=t.minFramesForTargetMS=void 0,t.minFramesForTargetMS=function(e,t,s=16e3){return Math.ceil(e*s/1e3/t)},t.arrayBufferToBase64=function(e){const t=new Uint8Array(e),s=t.byteLength,o=new Array(s);for(var r=0;r<s;r++){const e=t[r];if(void 0===e)break;o[r]=String.fromCharCode(e)}return btoa(o.join(""))},t.encodeWAV=function(e,t=3,o=16e3,r=1,i=32){var n=i/8,a=r*n,c=new ArrayBuffer(44+e.length*n),h=new DataView(c);return s(h,0,"RIFF"),h.setUint32(4,36+e.length*n,!0),s(h,8,"WAVE"),s(h,12,"fmt "),h.setUint32(16,16,!0),h.setUint16(20,t,!0),h.setUint16(22,r,!0),h.setUint32(24,o,!0),h.setUint32(28,o*a,!0),h.setUint16(32,a,!0),h.setUint16(34,i,!0),s(h,36,"data"),h.setUint32(40,e.length*n,!0),1===t?function(e,t,s){for(var o=0;o<s.length;o++,t+=2){var r=Math.max(-1,Math.min(1,s[o]));e.setInt16(t,r<0?32768*r:32767*r,!0)}}(h,44,e):function(e,t,s){for(var o=0;o<s.length;o++,t+=4)e.setFloat32(t,s[o],!0)}(h,44,e),c},t.audioFileToArray=async function(e){const t=new OfflineAudioContext(1,1,44100),s=new FileReader;let o=null;if(await new Promise((r=>{s.addEventListener("loadend",(e=>{const i=s.result;t.decodeAudioData(i,(e=>{o=e,t.startRendering().then((e=>{console.log("Rendering completed successfully"),r()})).catch((e=>{console.error(`Rendering failed: ${e}`)}))}),(e=>{console.log(`Error with decoding audio data: ${e}`)}))})),s.readAsArrayBuffer(e)})),null===o)throw Error("some shit");let r=o,i=new Float32Array(r.length);for(let e=0;e<r.length;e++)for(let t=0;t<r.numberOfChannels;t++)i[e]+=r.getChannelData(t)[e];return{audio:i,sampleRate:r.sampleRate}}},656:t=>{t.exports=e}},s={};return function e(o){var r=s[o];if(void 0!==r)return r.exports;var i=s[o]={exports:{}};return t[o].call(i.exports,i,i.exports,e),i.exports}(590)})()));