/**
 * Silero VAD for speech detection and silence trimming
 * Based on the approach in the ibm-granite/granite-speech HF demo
 */
let vadSession = null;
const VAD_SAMPLE_RATE = 16000;
const VAD_CHUNK_SIZE = 512; // 32ms chunks at 16kHz
const MODEL_CACHE_NAME = 'granite-speech-local-models';
// Fetch with Cache API persistence
async function cachedFetch(url) {
  const cache = await caches.open(MODEL_CACHE_NAME);
  const cached = await cache.match(url);
  if (cached) return cached;
  const response = await fetch(url);
  if (response.ok) await cache.put(url, response.clone());
  return response;
}
// Load VAD model
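// Assumes the onnxruntime-web `ort` global has already been loaded (e.g. via a <script> tag).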
async function loadVAD() {
  if (vadSession) return;
  console.log('Loading VAD model...');
  const response = await cachedFetch('./silero_vad.onnx');
  const buffer = await response.arrayBuffer();
  vadSession = await ort.InferenceSession.create(buffer, {
    executionProviders: ['wasm'],
  });
  console.log('VAD model loaded');
}
// Get speech timestamps using Silero VAD
// Returns list of {start, end} in samples
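// NOTE: the I/O names used below (input, state, sr -> output, stateN) and the
// [2, 1, 128] state shape correspond to the Silero VAD v5 ONNX export (assumed
// from the tensor shapes in this file).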
async function getSpeechTimestamps(audioData, threshold = 0.5) {
  await loadVAD();
  // Recurrent VAD state, shape [2, 1, 128], carried across chunks
  let state = new Float32Array(2 * 1 * 128);
  const sr = BigInt(VAD_SAMPLE_RATE);
  const speechProbs = [];
  // Process in 512-sample chunks
  for (let i = 0; i < audioData.length; i += VAD_CHUNK_SIZE) {
    const chunkEnd = Math.min(i + VAD_CHUNK_SIZE, audioData.length);
    const chunk = new Float32Array(VAD_CHUNK_SIZE);
    // Copy chunk data (the final partial chunk is zero-padded to 512 samples)
    for (let j = 0; j < chunkEnd - i; j++) {
      chunk[j] = audioData[i + j];
    }
    // Run VAD
    const inputTensor = new ort.Tensor('float32', chunk, [1, VAD_CHUNK_SIZE]);
    const stateTensor = new ort.Tensor('float32', state, [2, 1, 128]);
    const srTensor = new ort.Tensor('int64', BigInt64Array.from([sr]), []);
    const outputs = await vadSession.run({
      input: inputTensor,
      state: stateTensor,
      sr: srTensor
    });
    speechProbs.push(outputs.output.data[0]);
    state = new Float32Array(outputs.stateN.data);
  }
  // Find speech segments: a segment starts when the probability crosses the
  // threshold and ends when it drops back below it
  const segments = [];
  let inSpeech = false;
  let speechStart = 0;
  for (let i = 0; i < speechProbs.length; i++) {
    const isSpeech = speechProbs[i] >= threshold;
    if (isSpeech && !inSpeech) {
      speechStart = i * VAD_CHUNK_SIZE;
      inSpeech = true;
    } else if (!isSpeech && inSpeech) {
      segments.push({
        start: speechStart,
        end: i * VAD_CHUNK_SIZE
      });
      inSpeech = false;
    }
  }
  // Close a segment that is still open at the end of the audio
  if (inSpeech) {
    segments.push({
      start: speechStart,
      end: audioData.length
    });
  }
  return segments;
}
// Get speech segments with merging (like granite-speech demo)
// Returns segments with start/end in seconds
async function getSpeechSegments(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const vadSegments = await getSpeechTimestamps(audioData);
  if (vadSegments.length === 0) {
    // No speech detected: treat the whole clip as one segment
    return [{ start: 0, end: audioData.length / sampleRate }];
  }
  // Convert to seconds and apply buffering/merging
  const startBuffer = 0.3; // seconds - pad segment start
  const minGap = 0.5; // seconds - merge segments separated by less than this
  const segments = [];
  for (const seg of vadSegments) {
    const startSec = seg.start / sampleRate;
    const endSec = seg.end / sampleRate;
    const bufferedStart = Math.max(0, startSec - startBuffer);
    if (segments.length > 0 && bufferedStart - segments[segments.length - 1].end < minGap) {
      // Small gap: merge with the previous segment
      segments[segments.length - 1].end = endSec;
    } else if (segments.length > 0) {
      // Start the new segment where the previous one ended so the output stays contiguous
      segments.push({ start: segments[segments.length - 1].end, end: endSec });
    } else {
      // First segment: use the buffered start
      segments.push({ start: bufferedStart, end: endSec });
    }
  }
  return segments;
}
// Trim silence from audio (simple version - just trim start/end)
async function trimSilence(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const segments = await getSpeechTimestamps(audioData);
  if (segments.length === 0) {
    console.log('VAD: No speech detected, returning original audio');
    return audioData;
  }
  // Keep everything from the first to the last speech segment, plus 300ms padding on each side
  const paddingSamples = Math.floor(0.3 * sampleRate);
  const start = Math.max(0, segments[0].start - paddingSamples);
  const end = Math.min(audioData.length, segments[segments.length - 1].end + paddingSamples);
  const trimmedStart = (start / sampleRate).toFixed(2);
  const trimmedEnd = ((audioData.length - end) / sampleRate).toFixed(2);
  console.log(`VAD: Trimmed ${trimmedStart}s from start, ${trimmedEnd}s from end`);
  return audioData.slice(start, end);
}
// Format timestamp as MM:SS
function formatTimestamp(seconds) {
  const mins = Math.floor(seconds / 60);
  const secs = Math.floor(seconds % 60);
  return `${mins}:${secs.toString().padStart(2, '0')}`;
}
// Export
window.loadVAD = loadVAD;
window.trimSilence = trimSilence;
window.getSpeechSegments = getSpeechSegments;
window.formatTimestamp = formatTimestamp;
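
// --- Illustrative usage sketch (not part of the original demo code) ---
// Shows one way the exported helpers could be wired together: decode a
// user-supplied audio File/Blob to 16 kHz mono, trim leading/trailing silence,
// then log the detected speech segments. `exampleProcessFile` is a hypothetical
// helper added here purely for illustration; it is defined but never called.
async function exampleProcessFile(file) {
  // decodeAudioData resamples to the AudioContext's sample rate (16 kHz here)
  const ctx = new AudioContext({ sampleRate: VAD_SAMPLE_RATE });
  const audioBuffer = await ctx.decodeAudioData(await file.arrayBuffer());
  const samples = audioBuffer.getChannelData(0); // channel 0 as Float32Array
  await ctx.close();
  const trimmed = await trimSilence(samples);
  const segments = await getSpeechSegments(trimmed);
  for (const seg of segments) {
    console.log(`Speech: ${formatTimestamp(seg.start)} - ${formatTimestamp(seg.end)}`);
  }
  return { trimmed, segments };
}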