/**
 * Silero VAD for speech detection and silence trimming.
 * Based on the approach in the ibm-granite/granite-speech HF demo.
 *
 * Requires the onnxruntime-web global `ort` and a `./silero_vad.onnx`
 * model file served alongside this script. The state/input tensor shapes
 * below match the Silero VAD v5 ONNX export (`input`/`state`/`sr` inputs,
 * `output`/`stateN` outputs).
 */

let vadSession = null;     // ort.InferenceSession, created lazily by loadVAD()
let vadLoadPromise = null; // in-flight load so concurrent callers share one download

const VAD_SAMPLE_RATE = 16000;
const VAD_CHUNK_SIZE = 512; // 32ms chunks at 16kHz
const MODEL_CACHE_NAME = 'granite-speech-local-models';

/**
 * fetch() with Cache API persistence so the model is only downloaded once.
 * Non-OK responses are returned as-is (and not cached) — callers must check.
 * @param {string} url
 * @returns {Promise<Response>}
 */
async function cachedFetch(url) {
  const cache = await caches.open(MODEL_CACHE_NAME);
  const cached = await cache.match(url);
  if (cached) return cached;
  const response = await fetch(url);
  // Clone before caching: a Response body is single-use.
  if (response.ok) await cache.put(url, response.clone());
  return response;
}

/**
 * Load the Silero VAD model. Idempotent; concurrent calls share a single
 * in-flight load instead of downloading / creating the session twice.
 * @throws {Error} if the model file cannot be fetched (non-OK HTTP status).
 */
async function loadVAD() {
  if (vadSession) return;
  if (vadLoadPromise) return vadLoadPromise;
  vadLoadPromise = (async () => {
    console.log('Loading VAD model...');
    const response = await cachedFetch('./silero_vad.onnx');
    if (!response.ok) {
      // Without this check a 404 error page would be handed to the ONNX
      // runtime and fail with a cryptic parse error.
      throw new Error(`Failed to fetch VAD model: HTTP ${response.status}`);
    }
    const buffer = await response.arrayBuffer();
    vadSession = await ort.InferenceSession.create(buffer, {
      executionProviders: ['wasm'],
    });
    console.log('VAD model loaded');
  })();
  try {
    await vadLoadPromise;
  } finally {
    // Clear the in-flight marker so a failed load can be retried; on
    // success the vadSession short-circuit above takes over.
    vadLoadPromise = null;
  }
}

/**
 * Get speech timestamps using Silero VAD.
 * @param {Float32Array} audioData - mono 16 kHz PCM samples in [-1, 1].
 * @param {number} [threshold=0.5] - per-chunk speech probability threshold.
 * @returns {Promise<Array<{start: number, end: number}>>} segments in SAMPLES.
 */
async function getSpeechTimestamps(audioData, threshold = 0.5) {
  await loadVAD();

  // Recurrent model state, shape [2, 1, 128], threaded through all chunks.
  let state = new Float32Array(2 * 1 * 128);
  const sr = BigInt(VAD_SAMPLE_RATE);
  const speechProbs = [];

  // One probability per fixed-size chunk; the final chunk is zero-padded.
  for (let i = 0; i < audioData.length; i += VAD_CHUNK_SIZE) {
    const chunkEnd = Math.min(i + VAD_CHUNK_SIZE, audioData.length);
    const chunk = new Float32Array(VAD_CHUNK_SIZE);
    chunk.set(audioData.subarray(i, chunkEnd));

    const outputs = await vadSession.run({
      input: new ort.Tensor('float32', chunk, [1, VAD_CHUNK_SIZE]),
      state: new ort.Tensor('float32', state, [2, 1, 128]),
      sr: new ort.Tensor('int64', BigInt64Array.from([sr]), []),
    });
    speechProbs.push(outputs.output.data[0]);
    state = new Float32Array(outputs.stateN.data);
  }

  // Collapse per-chunk probabilities into contiguous speech segments.
  const segments = [];
  let inSpeech = false;
  let speechStart = 0;
  for (let i = 0; i < speechProbs.length; i++) {
    const isSpeech = speechProbs[i] >= threshold;
    if (isSpeech && !inSpeech) {
      speechStart = i * VAD_CHUNK_SIZE;
      inSpeech = true;
    } else if (!isSpeech && inSpeech) {
      segments.push({ start: speechStart, end: i * VAD_CHUNK_SIZE });
      inSpeech = false;
    }
  }
  // Speech running off the end of the audio closes at the last sample.
  if (inSpeech) {
    segments.push({ start: speechStart, end: audioData.length });
  }
  return segments;
}

/**
 * Get speech segments with buffering/merging (like the granite-speech demo).
 * Falls back to one whole-audio segment when no speech is detected.
 * NOTE: after the first segment, new segments deliberately start at the
 * previous segment's end so the returned list leaves no gaps in coverage.
 * @param {Float32Array} audioData - mono PCM samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE]
 * @returns {Promise<Array<{start: number, end: number}>>} segments in SECONDS.
 */
async function getSpeechSegments(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const vadSegments = await getSpeechTimestamps(audioData);
  if (vadSegments.length === 0) {
    return [{ start: 0, end: audioData.length / sampleRate }];
  }

  const startBuffer = 0.3; // seconds - pad segment start
  const minGap = 0.5;      // seconds - merge segments with small gaps

  const segments = [];
  for (const seg of vadSegments) {
    const startSec = seg.start / sampleRate;
    const endSec = seg.end / sampleRate;
    const bufferedStart = Math.max(0, startSec - startBuffer);
    const prev = segments.length > 0 ? segments[segments.length - 1] : null;

    if (prev && bufferedStart - prev.end < minGap) {
      // Small gap: merge into the previous segment.
      prev.end = endSec;
    } else if (prev) {
      // Large gap: start a new segment, but anchor it at the previous
      // segment's end so the whole timeline stays covered.
      segments.push({ start: prev.end, end: endSec });
    } else {
      segments.push({ start: bufferedStart, end: endSec });
    }
  }
  return segments;
}

/**
 * Trim silence from audio (simple version - just trim start/end).
 * Adds 300 ms of padding on each side; returns the input unchanged when no
 * speech is detected.
 * @param {Float32Array} audioData - mono PCM samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE]
 * @returns {Promise<Float32Array>} trimmed copy (or the original array).
 */
async function trimSilence(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const segments = await getSpeechTimestamps(audioData);
  if (segments.length === 0) {
    console.log('VAD: No speech detected, returning original audio');
    return audioData;
  }

  const paddingSamples = Math.floor(0.3 * sampleRate);
  const start = Math.max(0, segments[0].start - paddingSamples);
  const end = Math.min(
    audioData.length,
    segments[segments.length - 1].end + paddingSamples
  );

  const trimmedStart = (start / sampleRate).toFixed(2);
  const trimmedEnd = ((audioData.length - end) / sampleRate).toFixed(2);
  console.log(`VAD: Trimmed ${trimmedStart}s from start, ${trimmedEnd}s from end`);
  return audioData.slice(start, end);
}

/**
 * Format a duration in seconds as M:SS (minutes not zero-padded).
 * @param {number} seconds
 * @returns {string}
 */
function formatTimestamp(seconds) {
  const mins = Math.floor(seconds / 60);
  const secs = Math.floor(seconds % 60);
  return `${mins}:${secs.toString().padStart(2, '0')}`;
}

// Export. `globalThis` === `window` in a page, and unlike `window` it also
// exists in Web Workers (where the wasm execution provider is often run).
globalThis.loadVAD = loadVAD;
globalThis.trimSilence = trimSilence;
globalThis.getSpeechSegments = getSpeechSegments;
globalThis.formatTimestamp = formatTimestamp;