/**
* Granite Speech WebGPU Demo
* Uses Transformers.js v4 for in-browser speech recognition
*/
import {
AutoProcessor,
GraniteSpeechForConditionalGeneration,
TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.0.0-next.7';
import { detect } from 'https://cdn.jsdelivr.net/npm/tinyld/+esm';
// Model
const MODEL_ID = 'onnx-community/granite-4.0-1b-speech-ONNX';
// Audio config
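// Granite Speech expects 16 kHz mono input; generation is capped per segment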
const SAMPLE_RATE = 16000;
const MAX_NEW_TOKENS = 256;
// Task prompts — <|audio|> is expanded by the processor's chat template
const TASK_PROMPTS = {
'transcribe': '<|audio|>Transcribe the speech to text',
'translate_en': '<|audio|>Translate the speech to English',
'translate_fr': '<|audio|>Translate the speech to French',
'translate_de': '<|audio|>Translate the speech to German',
'translate_es': '<|audio|>Translate the speech to Spanish',
'translate_pt': '<|audio|>Translate the speech to Portuguese',
'translate_ja': '<|audio|>Translate the speech to Japanese',
};
// State
let model = null;
let processor = null;
let isModelLoading = false;
let currentAudioData = null;
// DOM Elements
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const recordBtn = document.getElementById('recordBtn');
const audioFile = document.getElementById('audioFile');
const fileTile = document.querySelector('.file-label');
const inputCard = document.querySelector('.input-card');
const audioPreview = document.getElementById('audioPreview');
const audioPlayer = document.getElementById('audioPlayer');
const playBtn = document.getElementById('playBtn');
const waveformCanvas = document.getElementById('waveformCanvas');
const waveformProgress = document.getElementById('waveformProgress');
const audioTime = document.getElementById('audioTime');
const transcribeSection = document.getElementById('transcribeSection');
const transcribeBtn = document.getElementById('transcribeBtn');
const promptSelect = document.getElementById('promptSelect');
const punctuationCheckbox = document.getElementById('punctuationCheckbox');
const transcriptCard = document.getElementById('transcriptCard');
const outputText = document.getElementById('outputText');
const copyBtn = document.getElementById('copyBtn');
const downloadBtn = document.getElementById('downloadBtn');
const clearBtn = document.getElementById('clearBtn');
const progressSection = document.getElementById('progressSection');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const vadCheckbox = document.getElementById('vadCheckbox');
const gpuInfo = document.getElementById('gpuInfo');
// Recording state
let mediaRecorder = null;
let audioChunks = [];
let transcriptionAborted = false;
// Utility functions
function setStatus(status, message) {
statusDot.className = `status-dot ${status}`;
statusText.textContent = message;
}
// Punctuation is handled by punctuator.js (applyPunctuation function)
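// Voice activity detection is handled by a companion script (loadVAD / getSpeechSegments)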
function showProgress(show) {
progressSection.style.display = show ? 'block' : 'none';
}
function updateProgress(progress, text) {
progressFill.style.width = `${progress}%`;
progressText.textContent = text;
}
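// Escape characters that are unsafe to inject into innerHTML
function escapeHtml(text) {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}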
// Check WebGPU support
async function checkWebGPU() {
if (!navigator.gpu) {
gpuInfo.textContent = 'WebGPU not supported. Use Chrome 113+ or Edge 113+';
gpuInfo.style.color = '#e74c3c';
return false;
}
try {
const adapter = await navigator.gpu.requestAdapter();
if (!adapter) {
gpuInfo.textContent = 'No WebGPU adapter available';
gpuInfo.style.color = '#f39c12';
return false;
}
return true;
} catch (e) {
console.error('WebGPU error:', e);
gpuInfo.textContent = `WebGPU error: ${e.message || e}`;
gpuInfo.style.color = '#e74c3c';
return false;
}
}
// Initialize models using Transformers.js v4
async function initModels() {
if (isModelLoading) return;
isModelLoading = true;
setStatus('loading', 'Loading processor...');
try {
    const hasWebGPU = await checkWebGPU();
    if (!hasWebGPU) {
      setStatus('error', 'WebGPU is required for this demo');
      isModelLoading = false;
      return;
    }
processor = await AutoProcessor.from_pretrained(MODEL_ID);
setStatus('loading', 'Downloading models...');
progressFill.style.width = '0%';
let lastProgressUpdate = 0;
const fileProgress = {};
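    // q4f16 = 4-bit quantized weights with fp16 compute: smaller download at a small accuracy cost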
model = await GraniteSpeechForConditionalGeneration.from_pretrained(MODEL_ID, {
dtype: {
audio_encoder: 'q4f16',
embed_tokens: 'q4f16',
decoder_model_merged: 'q4f16',
},
device: 'webgpu',
progress_callback: (progress) => {
if (progress.status === 'progress' && progress.total) {
fileProgress[progress.file] = { loaded: progress.loaded, total: progress.total };
const now = performance.now();
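        // Throttle DOM updates to at most one every 100 ms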
if (now - lastProgressUpdate < 100) return;
lastProgressUpdate = now;
let totalLoaded = 0, totalSize = 0;
for (const f of Object.values(fileProgress)) {
totalLoaded += f.loaded;
totalSize += f.total;
}
const pct = totalSize > 0 ? (totalLoaded / totalSize) * 100 : 0;
progressFill.style.width = `${pct}%`;
const mb = (totalLoaded / 1e6).toFixed(0);
const totalMb = (totalSize / 1e6).toFixed(0);
setStatus('loading', `Downloading models... ${mb} / ${totalMb} MB`);
}
},
});
setStatus('loading', 'Loading VAD and punctuation models...');
await Promise.all([loadVAD(), loadPunctuator()]);
progressFill.style.width = '0%';
setStatus('ready', 'Ready - Record or upload audio');
enableControls(true);
} catch (error) {
console.error('Model loading failed:', error);
console.error('Error stack:', error?.stack);
const errorMsg = error?.message || error?.toString() || 'Unknown error';
setStatus('error', `Error: ${errorMsg}`);
progressFill.style.width = '0%';
isModelLoading = false;
}
}
function enableControls(enabled) {
recordBtn.disabled = !enabled;
audioFile.disabled = !enabled;
}
// Transcribe a single audio segment and return the text
async function transcribeSegment(audioSegment, onPartialResult) {
// Build prompt using chat template
const taskKey = promptSelect.value;
const content = TASK_PROMPTS[taskKey] || TASK_PROMPTS['transcribe'];
const messages = [{ role: 'user', content }];
const text = processor.tokenizer.apply_chat_template(messages, {
add_generation_prompt: true,
tokenize: false,
});
// Process text + audio into model inputs
const inputs = await processor(text, audioSegment, { sampling_rate: SAMPLE_RATE });
// Streaming via TextStreamer
let accumulated = '';
const streamer = new TextStreamer(processor.tokenizer, {
skip_prompt: true,
skip_special_tokens: true,
callback_function: (chunk) => {
accumulated += chunk;
if (onPartialResult) {
onPartialResult(accumulated);
}
},
});
// Generate
await model.generate({
...inputs,
max_new_tokens: MAX_NEW_TOKENS,
streamer,
});
return accumulated;
}
// Wait until audio playback reaches a specific time (also resolves if playback is paused, so an aborted run cannot hang)
function waitForPlaybackTime(targetTime) {
return new Promise((resolve) => {
const check = () => {
if (audioPlayer.paused || audioPlayer.currentTime >= targetTime) {
resolve();
} else {
requestAnimationFrame(check);
}
};
check();
});
}
// Run inference with segmentation and audio sync
async function transcribe() {
if (!model || !processor || !currentAudioData) {
setStatus('error', 'Model or audio not ready');
return;
}
setStatus('processing', 'Processing audio...');
transcribeBtn.disabled = true;
transcriptionAborted = false;
outputText.textContent = '';
transcriptCard.style.display = 'block';
showProgress(true);
try {
// Get speech segments using VAD, or treat entire audio as one segment
let segments;
if (vadCheckbox.checked) {
updateProgress(5, 'Detecting speech segments...');
segments = await getSpeechSegments(currentAudioData, SAMPLE_RATE);
console.log(`VAD found ${segments.length} segment(s)`);
} else {
segments = [{ start: 0, end: currentAudioData.length / SAMPLE_RATE }];
}
// Start audio playback immediately
audioPlayer.currentTime = 0;
audioPlayer.play();
playBtn.querySelector('.play-icon').style.display = 'none';
playBtn.querySelector('.pause-icon').style.display = 'block';
// Process and display segments in sync with audio
const displayedResults = [];
const totalSegments = segments.length;
for (let segIdx = 0; segIdx < totalSegments; segIdx++) {
if (transcriptionAborted) break;
const seg = segments[segIdx];
// Update progress bar
const segProgress = ((segIdx + 1) / totalSegments) * 100;
updateProgress(segProgress, '');
      // Wait until audio playback reaches this segment's start time
      await waitForPlaybackTime(seg.start);
      if (transcriptionAborted) break;
setStatus('processing', `Segment ${segIdx + 1}/${totalSegments}`);
// Extract and transcribe this segment
const startSample = Math.floor(seg.start * SAMPLE_RATE);
const endSample = Math.floor(seg.end * SAMPLE_RATE);
const audioSegment = currentAudioData.slice(startSample, endSample);
      const timestamp = formatTime(seg.start);
const makeRow = (ts, text) => `<div class="transcript-row"><span class="timestamp">${ts}</span><span class="transcript-text">${text}</span></div>`;
// Transcribe with streaming display
const segmentText = await transcribeSegment(audioSegment, (partial) => {
        const escaped = escapeHtml(partial);
const rows = [...displayedResults, makeRow(timestamp, escaped)];
outputText.innerHTML = rows.join('');
outputText.scrollTop = outputText.scrollHeight;
});
if (segmentText.trim()) {
let finalSegmentText = segmentText.trim();
// Apply punctuation/capitalization for English only
if (punctuationCheckbox.checked) {
const detectedLang = detect(finalSegmentText);
if (detectedLang === 'en') {
const stripped = finalSegmentText.replace(/[.,!?]/g, ' ').replace(/\s+/g, ' ').trim();
finalSegmentText = await applyPunctuation(stripped, 'en');
finalSegmentText = finalSegmentText.replace(/<unk>/gi, ' ').replace(/\s+/g, ' ').trim();
}
}
      const escaped = escapeHtml(finalSegmentText);
displayedResults.push(makeRow(timestamp, escaped));
outputText.innerHTML = displayedResults.join('');
outputText.scrollTop = outputText.scrollHeight;
}
}
// Final output
if (displayedResults.length === 0) {
outputText.innerHTML = '<span style="color: #94a3b8;">(No speech detected)</span>';
}
copyBtn.disabled = false;
showProgress(false);
setStatus('ready', 'Transcription complete');
} catch (error) {
console.error('Transcription failed:', error);
setStatus('error', `Error: ${error.message}`);
showProgress(false);
  } finally {
    transcribeBtn.disabled = false;
  }
}
// Audio recording
let isRecording = false;
function toggleRecording() {
if (isRecording) {
stopRecording();
} else {
startRecording();
}
}
async function startRecording() {
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
audioChunks = [];
mediaRecorder.ondataavailable = (event) => {
audioChunks.push(event.data);
};
mediaRecorder.onstop = async () => {
      // MediaRecorder emits compressed audio (typically webm/ogg), not WAV
      const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || 'audio/webm' });
const audioUrl = URL.createObjectURL(audioBlob);
audioPlayer.src = audioUrl;
audioPreview.style.display = 'flex';
transcribeSection.style.display = 'flex';
await processAudioBlob(audioBlob);
drawWaveform();
updateAudioTime();
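      // Release the microphone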
stream.getTracks().forEach(track => track.stop());
};
mediaRecorder.start();
isRecording = true;
setStatus('recording', 'Recording...');
// Update button UI
recordBtn.querySelector('.mic-icon').style.display = 'none';
recordBtn.querySelector('.stop-icon').style.display = 'block';
recordBtn.querySelector('span').textContent = 'Stop';
recordBtn.classList.add('recording');
} catch (error) {
console.error('Recording failed:', error);
setStatus('error', 'Microphone access denied');
}
}
function stopRecording() {
if (mediaRecorder && mediaRecorder.state !== 'inactive') {
mediaRecorder.stop();
isRecording = false;
setStatus('ready', 'Recording stopped - Click Transcribe');
// Update button UI
recordBtn.querySelector('.mic-icon').style.display = 'block';
recordBtn.querySelector('.stop-icon').style.display = 'none';
recordBtn.querySelector('span').textContent = 'Record';
recordBtn.classList.remove('recording');
}
}
// Process audio file/blob
async function processAudioBlob(blob) {
try {
const arrayBuffer = await blob.arrayBuffer();
    const audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    audioCtx.close(); // the context was only needed for decoding
// Convert to mono Float32Array
let audioData;
if (audioBuffer.numberOfChannels > 1) {
const left = audioBuffer.getChannelData(0);
const right = audioBuffer.getChannelData(1);
audioData = new Float32Array(left.length);
for (let i = 0; i < left.length; i++) {
audioData[i] = (left[i] + right[i]) / 2;
}
} else {
audioData = audioBuffer.getChannelData(0);
}
// Resample if needed
if (audioBuffer.sampleRate !== SAMPLE_RATE) {
audioData = resample(audioData, audioBuffer.sampleRate, SAMPLE_RATE);
}
currentAudioData = audioData;
transcribeBtn.disabled = false;
} catch (error) {
console.error('Audio processing failed:', error);
setStatus('error', 'Failed to process audio');
}
}
// Simple linear-interpolation resampling (no anti-aliasing filter; adequate for speech input)
function resample(audioData, fromRate, toRate) {
const ratio = fromRate / toRate;
const newLength = Math.round(audioData.length / ratio);
const result = new Float32Array(newLength);
for (let i = 0; i < newLength; i++) {
const srcIndex = i * ratio;
const srcIndexFloor = Math.floor(srcIndex);
const srcIndexCeil = Math.min(srcIndexFloor + 1, audioData.length - 1);
const t = srcIndex - srcIndexFloor;
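    // Linearly interpolate between the two nearest source samples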
result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
}
return result;
}
// Handle file upload
async function handleFileUpload(event) {
const file = event.target.files[0];
if (!file) return;
await loadAudioFile(file);
}
// Handle dropped files
async function handleFileDrop(event) {
event.preventDefault();
inputCard.classList.remove('drag-over');
const file = event.dataTransfer.files[0];
if (!file || !file.type.startsWith('audio/')) {
setStatus('error', 'Please drop an audio file');
return;
}
await loadAudioFile(file);
}
// Common file loading logic
async function loadAudioFile(file) {
setStatus('processing', 'Processing audio file...');
const audioUrl = URL.createObjectURL(file);
audioPlayer.src = audioUrl;
audioPreview.style.display = 'flex';
transcribeSection.style.display = 'flex';
await processAudioBlob(file);
drawWaveform();
updateAudioTime();
setStatus('ready', 'Audio loaded - Click Transcribe');
}
// Draw waveform visualization
function drawWaveform() {
if (!currentAudioData) return;
const canvas = waveformCanvas;
const ctx = canvas.getContext('2d');
const dpr = window.devicePixelRatio || 1;
// Set canvas size
const rect = canvas.getBoundingClientRect();
canvas.width = rect.width * dpr;
canvas.height = rect.height * dpr;
ctx.scale(dpr, dpr);
const width = rect.width;
const height = rect.height;
const centerY = height / 2;
// Downsample audio data for visualization
const samples = currentAudioData;
  const barCount = Math.floor(width / 3); // one 2px bar per 3px column
  const samplesPerBar = Math.max(1, Math.floor(samples.length / barCount)); // avoid division by zero on very short clips
// Calculate bar amplitudes
const barAmplitudes = [];
for (let i = 0; i < barCount; i++) {
let sum = 0;
const start = i * samplesPerBar;
for (let j = 0; j < samplesPerBar; j++) {
sum += Math.abs(samples[start + j] || 0);
}
barAmplitudes.push(sum / samplesPerBar);
}
// Find max amplitude for normalization
const maxAmp = Math.max(...barAmplitudes, 0.01);
// Get color based on color scheme
const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
ctx.fillStyle = isDark ? '#64748b' : '#cbd5e1';
// Draw bars normalized to fill height
for (let i = 0; i < barCount; i++) {
const normalized = barAmplitudes[i] / maxAmp;
const barHeight = Math.max(2, normalized * height * 0.9);
ctx.fillRect(i * 3, centerY - barHeight / 2, 2, barHeight);
}
}
// Format time as M:SS
function formatTime(seconds) {
const mins = Math.floor(seconds / 60);
const secs = Math.floor(seconds % 60);
return `${mins}:${secs.toString().padStart(2, '0')}`;
}
// Update audio time display
function updateAudioTime() {
const current = audioPlayer.currentTime || 0;
const duration = audioPlayer.duration || 0;
if (duration > 0) {
audioTime.textContent = `${formatTime(current)} / ${formatTime(duration)}`;
waveformProgress.style.width = `${(current / duration) * 100}%`;
} else {
audioTime.textContent = formatTime(currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0);
}
}
// Toggle play/pause
function togglePlayback() {
if (audioPlayer.paused) {
audioPlayer.play();
playBtn.querySelector('.play-icon').style.display = 'none';
playBtn.querySelector('.pause-icon').style.display = 'block';
} else {
audioPlayer.pause();
playBtn.querySelector('.play-icon').style.display = 'block';
playBtn.querySelector('.pause-icon').style.display = 'none';
// Stop transcription if running
if (!transcriptionAborted && transcribeBtn.disabled) {
transcriptionAborted = true;
showProgress(false);
setStatus('ready', 'Transcription stopped');
transcribeBtn.disabled = false;
}
}
}
// Seek in audio
function seekAudio(event) {
  if (!isFinite(audioPlayer.duration)) return; // no audio loaded yet
  const rect = waveformCanvas.getBoundingClientRect();
const x = event.clientX - rect.left;
const percent = x / rect.width;
audioPlayer.currentTime = percent * audioPlayer.duration;
updateAudioTime();
}
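// Rebuild plain text from the transcript markup produced by makeRow():
// one line per row, a space between timestamp and text, entities unescaped
function getTranscriptText() {
  return outputText.innerHTML
    .replace(/<\/div>/gi, '\n')
    .replace(/<\/span>/gi, ' ')
    .replace(/<[^>]+>/g, '')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&amp;/g, '&')
    .trim();
}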
// Copy to clipboard
async function copyToClipboard() {
try {
    await navigator.clipboard.writeText(getTranscriptText());
// Brief visual feedback via title attribute
const originalTitle = copyBtn.title;
copyBtn.title = 'Copied!';
setTimeout(() => {
copyBtn.title = originalTitle;
}, 2000);
} catch (error) {
console.error('Copy failed:', error);
}
}
function downloadTranscript() {
  const text = getTranscriptText();
  if (!text) return;
const blob = new Blob([text], { type: 'text/plain' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'transcript.txt';
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
// Brief visual feedback
const originalTitle = downloadBtn.title;
downloadBtn.title = 'Downloaded!';
setTimeout(() => {
downloadBtn.title = originalTitle;
}, 2000);
}
function clearAudio() {
// Stop any playback
audioPlayer.pause();
audioPlayer.src = '';
// Reset audio state
currentAudioData = null;
// Hide audio player and transcribe section
audioPreview.style.display = 'none';
transcribeSection.style.display = 'none';
// Clear transcript
transcriptCard.style.display = 'none';
outputText.textContent = '';
// Reset waveform
waveformProgress.style.width = '0%';
const ctx = waveformCanvas.getContext('2d');
ctx.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);
// Reset time display
audioTime.textContent = '0:00';
// Reset buttons
transcribeBtn.disabled = true;
// Reset file input
audioFile.value = '';
// Update status
setStatus('ready', 'Ready');
}
// Event listeners
recordBtn.addEventListener('click', toggleRecording);
audioFile.addEventListener('change', handleFileUpload);
// Audio player controls
playBtn.addEventListener('click', togglePlayback);
waveformCanvas.addEventListener('click', seekAudio);
audioPlayer.addEventListener('timeupdate', updateAudioTime);
audioPlayer.addEventListener('ended', () => {
playBtn.querySelector('.play-icon').style.display = 'block';
playBtn.querySelector('.pause-icon').style.display = 'none';
waveformProgress.style.width = '0%';
});
// Redraw waveform on resize
window.addEventListener('resize', drawWaveform);
transcribeBtn.addEventListener('click', transcribe);
copyBtn.addEventListener('click', copyToClipboard);
downloadBtn.addEventListener('click', downloadTranscript);
clearBtn.addEventListener('click', clearAudio);
// Drag and drop on input card
inputCard.addEventListener('dragover', (e) => {
e.preventDefault();
inputCard.classList.add('drag-over');
});
inputCard.addEventListener('dragleave', (e) => {
e.preventDefault();
inputCard.classList.remove('drag-over');
});
inputCard.addEventListener('drop', handleFileDrop);
// Initialize on load
window.addEventListener('load', initModels);