| |
| |
| |
| |
|
|
| let vadSession = null; |
| const VAD_SAMPLE_RATE = 16000; |
| const VAD_CHUNK_SIZE = 512; |
|
|
| const MODEL_CACHE_NAME = 'granite-speech-local-models'; |
|
|
| |
/**
 * Fetch a URL, serving from (and populating) a local Cache API bucket.
 *
 * Fix: cache reads/writes are now best-effort. Previously a failing
 * `cache.put()` (e.g. storage quota exceeded) rejected the whole call even
 * though a valid network response was already in hand.
 *
 * @param {string} url - URL to fetch (model files, same-origin in practice).
 * @returns {Promise<Response>} cached response if present, else the network
 *   response (which is also stored when `response.ok`).
 */
async function cachedFetch(url) {
  let cache = null;
  try {
    cache = await caches.open(MODEL_CACHE_NAME);
    const cached = await cache.match(url);
    if (cached) return cached;
  } catch (err) {
    // Cache unavailable (private browsing, quota, etc.) — fall through to network.
    console.warn('cachedFetch: cache lookup failed, falling back to network', err);
  }

  const response = await fetch(url);
  if (response.ok && cache) {
    try {
      // Clone before caching so the body can still be consumed by the caller.
      await cache.put(url, response.clone());
    } catch (err) {
      // Best-effort: a failed write must not discard the good response.
      console.warn('cachedFetch: failed to cache response', err);
    }
  }
  return response;
}
|
|
| |
/**
 * Load the Silero VAD ONNX model into `vadSession` (idempotent).
 *
 * Fix: concurrent callers are now deduplicated. Previously two overlapping
 * `loadVAD()` calls both passed the `vadSession` null-check, fetched the
 * model twice, and created two inference sessions.
 *
 * @returns {Promise<void>} resolves once `vadSession` is ready.
 */
async function loadVAD() {
  if (vadSession) return;
  // Share one in-flight load between concurrent callers.
  if (loadVAD._pending) return loadVAD._pending;

  loadVAD._pending = (async () => {
    try {
      console.log('Loading VAD model...');
      const response = await cachedFetch('./silero_vad.onnx');
      const buffer = await response.arrayBuffer();
      vadSession = await ort.InferenceSession.create(buffer, {
        executionProviders: ['wasm'],
      });
      console.log('VAD model loaded');
    } finally {
      // Clear so a failed load can be retried on the next call.
      loadVAD._pending = null;
    }
  })();
  return loadVAD._pending;
}
|
|
| |
| |
/**
 * Run the Silero VAD model over raw audio and return speech regions.
 *
 * Audio is consumed in VAD_CHUNK_SIZE-sample frames (the final frame is
 * zero-padded); the model's recurrent state is threaded between frames.
 * Assumes `audioData` is 16 kHz mono samples — TODO confirm with callers.
 *
 * @param {Float32Array} audioData - mono audio samples.
 * @param {number} [threshold=0.5] - speech-probability cutoff per frame.
 * @returns {Promise<Array<{start: number, end: number}>>} segments in
 *   sample indices, half-open-ish [start, end).
 */
async function getSpeechTimestamps(audioData, threshold = 0.5) {
  await loadVAD();

  const srValue = BigInt(VAD_SAMPLE_RATE);
  // Recurrent state for the model: shape [2, 1, 128], zero-initialized.
  let recurrentState = new Float32Array(2 * 1 * 128);
  const probabilities = [];

  for (let offset = 0; offset < audioData.length; offset += VAD_CHUNK_SIZE) {
    // Fixed-size frame; any tail past the end of the audio stays zero.
    const frame = new Float32Array(VAD_CHUNK_SIZE);
    const copyCount = Math.min(VAD_CHUNK_SIZE, audioData.length - offset);
    for (let k = 0; k < copyCount; k++) {
      frame[k] = audioData[offset + k];
    }

    const feeds = {
      input: new ort.Tensor('float32', frame, [1, VAD_CHUNK_SIZE]),
      state: new ort.Tensor('float32', recurrentState, [2, 1, 128]),
      sr: new ort.Tensor('int64', BigInt64Array.from([srValue]), []),
    };
    const outputs = await vadSession.run(feeds);

    probabilities.push(outputs.output.data[0]);
    recurrentState = new Float32Array(outputs.stateN.data);
  }

  // Threshold the per-frame probabilities into contiguous sample spans.
  const segments = [];
  let activeStart = -1; // -1 means "not currently inside speech"

  probabilities.forEach((prob, frameIdx) => {
    const voiced = prob >= threshold;
    if (voiced && activeStart < 0) {
      activeStart = frameIdx * VAD_CHUNK_SIZE;
    } else if (!voiced && activeStart >= 0) {
      segments.push({ start: activeStart, end: frameIdx * VAD_CHUNK_SIZE });
      activeStart = -1;
    }
  });

  // Speech that runs to the end of the audio closes at the final sample.
  if (activeStart >= 0) {
    segments.push({ start: activeStart, end: audioData.length });
  }

  return segments;
}
|
|
| |
| |
/**
 * Produce merged, second-based speech segments for transcription.
 *
 * Runs VAD, then merges detections separated by less than half a second.
 * When a gap is large enough to keep segments separate, the new segment
 * deliberately starts at the previous segment's end — the output tiles the
 * timeline after the first detection, so no audio between detections is
 * dropped (presumably so transcription never skips content — verify with
 * callers). If no speech is found, the whole clip is one segment.
 *
 * NOTE(review): the VAD itself assumes VAD_SAMPLE_RATE audio; a different
 * `sampleRate` only rescales the reported times — confirm intended.
 *
 * @param {Float32Array} audioData - mono audio samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE] - samples per second.
 * @returns {Promise<Array<{start: number, end: number}>>} segments in seconds.
 */
async function getSpeechSegments(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const rawSegments = await getSpeechTimestamps(audioData);

  if (rawSegments.length === 0) {
    return [{ start: 0, end: audioData.length / sampleRate }];
  }

  const START_BUFFER_SEC = 0.3; // lead-in kept before the first detection
  const MIN_GAP_SEC = 0.5;      // detections closer than this are merged

  const merged = [];
  for (const seg of rawSegments) {
    const endSec = seg.end / sampleRate;
    const paddedStart = Math.max(0, seg.start / sampleRate - START_BUFFER_SEC);
    const prev = merged.length > 0 ? merged[merged.length - 1] : null;

    if (prev === null) {
      // First segment keeps its padded start.
      merged.push({ start: paddedStart, end: endSec });
    } else if (paddedStart - prev.end < MIN_GAP_SEC) {
      // Too close to the previous segment: extend it instead.
      prev.end = endSec;
    } else {
      // Separate segment; anchor at the previous end to stay contiguous.
      merged.push({ start: prev.end, end: endSec });
    }
  }

  return merged;
}
|
|
| |
/**
 * Cut leading and trailing silence from audio, keeping 0.3 s of padding
 * around the first and last detected speech. Returns the input unchanged
 * (and logs) when the VAD finds no speech at all.
 *
 * @param {Float32Array} audioData - mono audio samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE] - samples per second.
 * @returns {Promise<Float32Array>} trimmed copy (via slice) or the original.
 */
async function trimSilence(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const speech = await getSpeechTimestamps(audioData);

  if (speech.length === 0) {
    console.log('VAD: No speech detected, returning original audio');
    return audioData;
  }

  // Keep a little context on both sides of the detected speech.
  const pad = Math.floor(0.3 * sampleRate);
  const first = speech[0];
  const last = speech[speech.length - 1];

  const begin = Math.max(0, first.start - pad);
  const finish = Math.min(audioData.length, last.end + pad);

  const headSec = (begin / sampleRate).toFixed(2);
  const tailSec = ((audioData.length - finish) / sampleRate).toFixed(2);
  console.log(`VAD: Trimmed ${headSec}s from start, ${tailSec}s from end`);

  return audioData.slice(begin, finish);
}
|
|
| |
/**
 * Format a duration in seconds as "M:SS" (e.g. 65 -> "1:05").
 * Minutes are not capped, so an hour-long input renders as "60:00".
 *
 * @param {number} seconds - duration in seconds (fractions are floored).
 * @returns {string} minutes and zero-padded seconds separated by ':'.
 */
function formatTimestamp(seconds) {
  const whole = Math.floor(seconds);
  const remainder = String(whole % 60).padStart(2, '0');
  return `${Math.floor(whole / 60)}:${remainder}`;
}
|
|
| |
// Expose the public API on the global object for non-module consumers.
Object.assign(window, {
  loadVAD,
  trimSilence,
  getSpeechSegments,
  formatTimestamp,
});
|
|