/**
* Granite Speech WebGPU Demo
* Uses Transformers.js v4 for in-browser speech recognition
*/
import {
AutoProcessor,
GraniteSpeechForConditionalGeneration,
TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.0.0-next.7';
import { detect } from 'https://cdn.jsdelivr.net/npm/tinyld/+esm';
// Model
const MODEL_ID = 'onnx-community/granite-4.0-1b-speech-ONNX';
// Audio config
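// Granite Speech expects 16 kHz mono input; generation is capped per segment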
const SAMPLE_RATE = 16000;
const MAX_NEW_TOKENS = 256;
// Task prompts — <|audio|> is expanded by the processor's chat template
const TASK_PROMPTS = {
'transcribe': '<|audio|>Transcribe the speech to text',
'translate_en': '<|audio|>Translate the speech to English',
'translate_fr': '<|audio|>Translate the speech to French',
'translate_de': '<|audio|>Translate the speech to German',
'translate_es': '<|audio|>Translate the speech to Spanish',
'translate_pt': '<|audio|>Translate the speech to Portuguese',
'translate_ja': '<|audio|>Translate the speech to Japanese',
};
// State
let model = null;
let processor = null;
let isModelLoading = false;
let currentAudioData = null;
// DOM Elements
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const recordBtn = document.getElementById('recordBtn');
const audioFile = document.getElementById('audioFile');
const fileTile = document.querySelector('.file-label');
const inputCard = document.querySelector('.input-card');
const audioPreview = document.getElementById('audioPreview');
const audioPlayer = document.getElementById('audioPlayer');
const playBtn = document.getElementById('playBtn');
const waveformCanvas = document.getElementById('waveformCanvas');
const waveformProgress = document.getElementById('waveformProgress');
const audioTime = document.getElementById('audioTime');
const transcribeSection = document.getElementById('transcribeSection');
const transcribeBtn = document.getElementById('transcribeBtn');
const promptSelect = document.getElementById('promptSelect');
const punctuationCheckbox = document.getElementById('punctuationCheckbox');
const transcriptCard = document.getElementById('transcriptCard');
const outputText = document.getElementById('outputText');
const copyBtn = document.getElementById('copyBtn');
const downloadBtn = document.getElementById('downloadBtn');
const clearBtn = document.getElementById('clearBtn');
const progressSection = document.getElementById('progressSection');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const vadCheckbox = document.getElementById('vadCheckbox');
const gpuInfo = document.getElementById('gpuInfo');
// Recording state
let mediaRecorder = null;
let audioChunks = [];
let transcriptionAborted = false;
// Utility functions
function setStatus(status, message) {
statusDot.className = `status-dot ${status}`;
statusText.textContent = message;
}
// Punctuation is handled by punctuator.js (applyPunctuation function)
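// Voice activity detection is handled by a companion script (loadVAD / getSpeechSegments)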
function showProgress(show) {
progressSection.style.display = show ? 'block' : 'none';
}
function updateProgress(progress, text) {
progressFill.style.width = `${progress}%`;
progressText.textContent = text;
}
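// Escape characters that are unsafe to inject into innerHTML
function escapeHtml(text) {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}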
// Check WebGPU support
async function checkWebGPU() {
if (!navigator.gpu) {
gpuInfo.textContent = 'WebGPU not supported. Use Chrome 113+ or Edge 113+';
gpuInfo.style.color = '#e74c3c';
return false;
}
try {
const adapter = await navigator.gpu.requestAdapter();
if (!adapter) {
gpuInfo.textContent = 'No WebGPU adapter available';
gpuInfo.style.color = '#f39c12';
return false;
}
return true;
} catch (e) {
console.error('WebGPU error:', e);
gpuInfo.textContent = `WebGPU error: ${e.message || e}`;
gpuInfo.style.color = '#e74c3c';
return false;
}
}
// Initialize models using Transformers.js v4
async function initModels() {
if (isModelLoading) return;
isModelLoading = true;
setStatus('loading', 'Loading processor...');
try {
    const hasWebGPU = await checkWebGPU();
    if (!hasWebGPU) {
      setStatus('error', 'WebGPU is required for this demo');
      isModelLoading = false;
      return;
    }
processor = await AutoProcessor.from_pretrained(MODEL_ID);
setStatus('loading', 'Downloading models...');
progressFill.style.width = '0%';
let lastProgressUpdate = 0;
const fileProgress = {};
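    // q4f16 = 4-bit quantized weights with fp16 compute: smaller download at a small accuracy cost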
model = await GraniteSpeechForConditionalGeneration.from_pretrained(MODEL_ID, {
dtype: {
audio_encoder: 'q4f16',
embed_tokens: 'q4f16',
decoder_model_merged: 'q4f16',
},
device: 'webgpu',
progress_callback: (progress) => {
if (progress.status === 'progress' && progress.total) {
fileProgress[progress.file] = { loaded: progress.loaded, total: progress.total };
const now = performance.now();
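        // Throttle DOM updates to at most one every 100 ms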
if (now - lastProgressUpdate < 100) return;
lastProgressUpdate = now;
let totalLoaded = 0, totalSize = 0;
for (const f of Object.values(fileProgress)) {
totalLoaded += f.loaded;
totalSize += f.total;
}
const pct = totalSize > 0 ? (totalLoaded / totalSize) * 100 : 0;
progressFill.style.width = `${pct}%`;
const mb = (totalLoaded / 1e6).toFixed(0);
const totalMb = (totalSize / 1e6).toFixed(0);
setStatus('loading', `Downloading models... ${mb} / ${totalMb} MB`);
}
},
});
setStatus('loading', 'Loading VAD and punctuation models...');
await Promise.all([loadVAD(), loadPunctuator()]);
progressFill.style.width = '0%';
setStatus('ready', 'Ready - Record or upload audio');
enableControls(true);
} catch (error) {
console.error('Model loading failed:', error);
console.error('Error stack:', error?.stack);
const errorMsg = error?.message || error?.toString() || 'Unknown error';
setStatus('error', `Error: ${errorMsg}`);
progressFill.style.width = '0%';
isModelLoading = false;
}
}
function enableControls(enabled) {
recordBtn.disabled = !enabled;
audioFile.disabled = !enabled;
}
// Transcribe a single audio segment and return the text
async function transcribeSegment(audioSegment, onPartialResult) {
// Build prompt using chat template
const taskKey = promptSelect.value;
const content = TASK_PROMPTS[taskKey] || TASK_PROMPTS['transcribe'];
const messages = [{ role: 'user', content }];
const text = processor.tokenizer.apply_chat_template(messages, {
add_generation_prompt: true,
tokenize: false,
});
// Process text + audio into model inputs
const inputs = await processor(text, audioSegment, { sampling_rate: SAMPLE_RATE });
// Streaming via TextStreamer
let accumulated = '';
const streamer = new TextStreamer(processor.tokenizer, {
skip_prompt: true,
skip_special_tokens: true,
callback_function: (chunk) => {
accumulated += chunk;
if (onPartialResult) {
onPartialResult(accumulated);
}
},
});
// Generate
await model.generate({
...inputs,
max_new_tokens: MAX_NEW_TOKENS,
streamer,
});
return accumulated;
}
// Wait until audio playback reaches a specific time (also resolves if playback is paused, so an aborted run cannot hang)
function waitForPlaybackTime(targetTime) {
return new Promise((resolve) => {
const check = () => {
if (audioPlayer.paused || audioPlayer.currentTime >= targetTime) {
resolve();
} else {
requestAnimationFrame(check);
}
};
check();
});
}
// Run inference with segmentation and audio sync
async function transcribe() {
if (!model || !processor || !currentAudioData) {
setStatus('error', 'Model or audio not ready');
return;
}
setStatus('processing', 'Processing audio...');
transcribeBtn.disabled = true;
transcriptionAborted = false;
outputText.textContent = '';
transcriptCard.style.display = 'block';
showProgress(true);
try {
// Get speech segments using VAD, or treat entire audio as one segment
let segments;
if (vadCheckbox.checked) {
updateProgress(5, 'Detecting speech segments...');
segments = await getSpeechSegments(currentAudioData, SAMPLE_RATE);
console.log(`VAD found ${segments.length} segment(s)`);
} else {
segments = [{ start: 0, end: currentAudioData.length / SAMPLE_RATE }];
}
// Start audio playback immediately
audioPlayer.currentTime = 0;
audioPlayer.play();
playBtn.querySelector('.play-icon').style.display = 'none';
playBtn.querySelector('.pause-icon').style.display = 'block';
// Process and display segments in sync with audio
const displayedResults = [];
const totalSegments = segments.length;
for (let segIdx = 0; segIdx < totalSegments; segIdx++) {
if (transcriptionAborted) break;
const seg = segments[segIdx];
// Update progress bar
const segProgress = ((segIdx + 1) / totalSegments) * 100;
updateProgress(segProgress, '');
      // Wait until audio playback reaches this segment's start time
      await waitForPlaybackTime(seg.start);
      if (transcriptionAborted) break;
setStatus('processing', `Segment ${segIdx + 1}/${totalSegments}`);
// Extract and transcribe this segment
const startSample = Math.floor(seg.start * SAMPLE_RATE);
const endSample = Math.floor(seg.end * SAMPLE_RATE);
const audioSegment = currentAudioData.slice(startSample, endSample);
      const timestamp = formatTime(seg.start);
const makeRow = (ts, text) => `<div class="transcript-row"><span class="timestamp">${ts}</span><span class="transcript-text">${text}</span></div>`;
// Transcribe with streaming display
const segmentText = await transcribeSegment(audioSegment, (partial) => {
        const escaped = escapeHtml(partial);
const rows = [...displayedResults, makeRow(timestamp, escaped)];
outputText.innerHTML = rows.join('');
outputText.scrollTop = outputText.scrollHeight;
});
if (segmentText.trim()) {
let finalSegmentText = segmentText.trim();
// Apply punctuation/capitalization for English only
if (punctuationCheckbox.checked) {
const detectedLang = detect(finalSegmentText);
if (detectedLang === 'en') {
const stripped = finalSegmentText.replace(/[.,!?]/g, ' ').replace(/\s+/g, ' ').trim();
finalSegmentText = await applyPunctuation(stripped, 'en');
finalSegmentText = finalSegmentText.replace(/<unk>/gi, ' ').replace(/\s+/g, ' ').trim();
}
}
      const escaped = escapeHtml(finalSegmentText);
displayedResults.push(makeRow(timestamp, escaped));
outputText.innerHTML = displayedResults.join('');
outputText.scrollTop = outputText.scrollHeight;
}
}
// Final output
if (displayedResults.length === 0) {
outputText.innerHTML = '<span style="color: #94a3b8;">(No speech detected)</span>';
}
copyBtn.disabled = false;
showProgress(false);
setStatus('ready', 'Transcription complete');
} catch (error) {
console.error('Transcription failed:', error);
setStatus('error', `Error: ${error.message}`);
showProgress(false);
  } finally {
    transcribeBtn.disabled = false;
  }
}
// Audio recording
let isRecording = false;
function toggleRecording() {
if (isRecording) {
stopRecording();
} else {
startRecording();
}
}
async function startRecording() {
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
audioChunks = [];
mediaRecorder.ondataavailable = (event) => {
audioChunks.push(event.data);
};
mediaRecorder.onstop = async () => {
      // MediaRecorder emits compressed audio (typically webm/ogg), not WAV
      const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || 'audio/webm' });
const audioUrl = URL.createObjectURL(audioBlob);
audioPlayer.src = audioUrl;
audioPreview.style.display = 'flex';
transcribeSection.style.display = 'flex';
await processAudioBlob(audioBlob);
drawWaveform();
updateAudioTime();
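      // Release the microphone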
stream.getTracks().forEach(track => track.stop());
};
mediaRecorder.start();
isRecording = true;
setStatus('recording', 'Recording...');
// Update button UI
recordBtn.querySelector('.mic-icon').style.display = 'none';
recordBtn.querySelector('.stop-icon').style.display = 'block';
recordBtn.querySelector('span').textContent = 'Stop';
recordBtn.classList.add('recording');
} catch (error) {
console.error('Recording failed:', error);
setStatus('error', 'Microphone access denied');
}
}
function stopRecording() {
if (mediaRecorder && mediaRecorder.state !== 'inactive') {
mediaRecorder.stop();
isRecording = false;
setStatus('ready', 'Recording stopped - Click Transcribe');
// Update button UI
recordBtn.querySelector('.mic-icon').style.display = 'block';
recordBtn.querySelector('.stop-icon').style.display = 'none';
recordBtn.querySelector('span').textContent = 'Record';
recordBtn.classList.remove('recording');
}
}
// Process audio file/blob
async function processAudioBlob(blob) {
try {
const arrayBuffer = await blob.arrayBuffer();
    const audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    audioCtx.close(); // the context was only needed for decoding
// Convert to mono Float32Array
let audioData;
if (audioBuffer.numberOfChannels > 1) {
const left = audioBuffer.getChannelData(0);
const right = audioBuffer.getChannelData(1);
audioData = new Float32Array(left.length);
for (let i = 0; i < left.length; i++) {
audioData[i] = (left[i] + right[i]) / 2;
}
} else {
audioData = audioBuffer.getChannelData(0);
}
// Resample if needed
if (audioBuffer.sampleRate !== SAMPLE_RATE) {
audioData = resample(audioData, audioBuffer.sampleRate, SAMPLE_RATE);
}
currentAudioData = audioData;
transcribeBtn.disabled = false;
} catch (error) {
console.error('Audio processing failed:', error);
setStatus('error', 'Failed to process audio');
}
}
// Simple linear-interpolation resampling (no anti-aliasing filter; adequate for speech input)
function resample(audioData, fromRate, toRate) {
const ratio = fromRate / toRate;
const newLength = Math.round(audioData.length / ratio);
const result = new Float32Array(newLength);
for (let i = 0; i < newLength; i++) {
const srcIndex = i * ratio;
const srcIndexFloor = Math.floor(srcIndex);
const srcIndexCeil = Math.min(srcIndexFloor + 1, audioData.length - 1);
const t = srcIndex - srcIndexFloor;
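    // Linearly interpolate between the two nearest source samples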
result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
}
return result;
}
// Handle file upload
async function handleFileUpload(event) {
const file = event.target.files[0];
if (!file) return;
await loadAudioFile(file);
}
// Handle dropped files
async function handleFileDrop(event) {
event.preventDefault();
inputCard.classList.remove('drag-over');
const file = event.dataTransfer.files[0];
if (!file || !file.type.startsWith('audio/')) {
setStatus('error', 'Please drop an audio file');
return;
}
await loadAudioFile(file);
}
// Common file loading logic
async function loadAudioFile(file) {
setStatus('processing', 'Processing audio file...');
const audioUrl = URL.createObjectURL(file);
audioPlayer.src = audioUrl;
audioPreview.style.display = 'flex';
transcribeSection.style.display = 'flex';
await processAudioBlob(file);
drawWaveform();
updateAudioTime();
setStatus('ready', 'Audio loaded - Click Transcribe');
}
// Draw waveform visualization
function drawWaveform() {
if (!currentAudioData) return;
const canvas = waveformCanvas;
const ctx = canvas.getContext('2d');
const dpr = window.devicePixelRatio || 1;
// Set canvas size
const rect = canvas.getBoundingClientRect();
canvas.width = rect.width * dpr;
canvas.height = rect.height * dpr;
ctx.scale(dpr, dpr);
const width = rect.width;
const height = rect.height;
const centerY = height / 2;
// Downsample audio data for visualization
const samples = currentAudioData;
  const barCount = Math.floor(width / 3); // one 2px bar per 3px column
  const samplesPerBar = Math.max(1, Math.floor(samples.length / barCount)); // avoid division by zero on very short clips
// Calculate bar amplitudes
const barAmplitudes = [];
for (let i = 0; i < barCount; i++) {
let sum = 0;
const start = i * samplesPerBar;
for (let j = 0; j < samplesPerBar; j++) {
sum += Math.abs(samples[start + j] || 0);
}
barAmplitudes.push(sum / samplesPerBar);
}
// Find max amplitude for normalization
const maxAmp = Math.max(...barAmplitudes, 0.01);
// Get color based on color scheme
const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
ctx.fillStyle = isDark ? '#64748b' : '#cbd5e1';
// Draw bars normalized to fill height
for (let i = 0; i < barCount; i++) {
const normalized = barAmplitudes[i] / maxAmp;
const barHeight = Math.max(2, normalized * height * 0.9);
ctx.fillRect(i * 3, centerY - barHeight / 2, 2, barHeight);
}
}
// Format time as M:SS
function formatTime(seconds) {
const mins = Math.floor(seconds / 60);
const secs = Math.floor(seconds % 60);
return `${mins}:${secs.toString().padStart(2, '0')}`;
}
// Update audio time display
function updateAudioTime() {
const current = audioPlayer.currentTime || 0;
const duration = audioPlayer.duration || 0;
if (duration > 0) {
audioTime.textContent = `${formatTime(current)} / ${formatTime(duration)}`;
waveformProgress.style.width = `${(current / duration) * 100}%`;
} else {
audioTime.textContent = formatTime(currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0);
}
}
// Toggle play/pause
function togglePlayback() {
if (audioPlayer.paused) {
audioPlayer.play();
playBtn.querySelector('.play-icon').style.display = 'none';
playBtn.querySelector('.pause-icon').style.display = 'block';
} else {
audioPlayer.pause();
playBtn.querySelector('.play-icon').style.display = 'block';
playBtn.querySelector('.pause-icon').style.display = 'none';
// Stop transcription if running
if (!transcriptionAborted && transcribeBtn.disabled) {
transcriptionAborted = true;
showProgress(false);
setStatus('ready', 'Transcription stopped');
transcribeBtn.disabled = false;
}
}
}
// Seek in audio
function seekAudio(event) {
  if (!isFinite(audioPlayer.duration)) return; // no audio loaded yet
  const rect = waveformCanvas.getBoundingClientRect();
const x = event.clientX - rect.left;
const percent = x / rect.width;
audioPlayer.currentTime = percent * audioPlayer.duration;
updateAudioTime();
}
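// Rebuild plain text from the transcript markup produced by makeRow():
// one line per row, a space between timestamp and text, entities unescaped
function getTranscriptText() {
  return outputText.innerHTML
    .replace(/<\/div>/gi, '\n')
    .replace(/<\/span>/gi, ' ')
    .replace(/<[^>]+>/g, '')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&amp;/g, '&')
    .trim();
}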
// Copy to clipboard
async function copyToClipboard() {
try {
    await navigator.clipboard.writeText(getTranscriptText());
// Brief visual feedback via title attribute
const originalTitle = copyBtn.title;
copyBtn.title = 'Copied!';
setTimeout(() => {
copyBtn.title = originalTitle;
}, 2000);
} catch (error) {
console.error('Copy failed:', error);
}
}
function downloadTranscript() {
  const text = getTranscriptText();
  if (!text) return;
const blob = new Blob([text], { type: 'text/plain' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'transcript.txt';
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
// Brief visual feedback
const originalTitle = downloadBtn.title;
downloadBtn.title = 'Downloaded!';
setTimeout(() => {
downloadBtn.title = originalTitle;
}, 2000);
}
function clearAudio() {
// Stop any playback
audioPlayer.pause();
audioPlayer.src = '';
// Reset audio state
currentAudioData = null;
// Hide audio player and transcribe section
audioPreview.style.display = 'none';
transcribeSection.style.display = 'none';
// Clear transcript
transcriptCard.style.display = 'none';
outputText.textContent = '';
// Reset waveform
waveformProgress.style.width = '0%';
const ctx = waveformCanvas.getContext('2d');
ctx.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);
// Reset time display
audioTime.textContent = '0:00';
// Reset buttons
transcribeBtn.disabled = true;
// Reset file input
audioFile.value = '';
// Update status
setStatus('ready', 'Ready');
}
// Event listeners
recordBtn.addEventListener('click', toggleRecording);
audioFile.addEventListener('change', handleFileUpload);
// Audio player controls
playBtn.addEventListener('click', togglePlayback);
waveformCanvas.addEventListener('click', seekAudio);
audioPlayer.addEventListener('timeupdate', updateAudioTime);
audioPlayer.addEventListener('ended', () => {
playBtn.querySelector('.play-icon').style.display = 'block';
playBtn.querySelector('.pause-icon').style.display = 'none';
waveformProgress.style.width = '0%';
});
// Redraw waveform on resize
window.addEventListener('resize', drawWaveform);
transcribeBtn.addEventListener('click', transcribe);
copyBtn.addEventListener('click', copyToClipboard);
downloadBtn.addEventListener('click', downloadTranscript);
clearBtn.addEventListener('click', clearAudio);
// Drag and drop on input card
inputCard.addEventListener('dragover', (e) => {
e.preventDefault();
inputCard.classList.add('drag-over');
});
inputCard.addEventListener('dragleave', (e) => {
e.preventDefault();
inputCard.classList.remove('drag-over');
});
inputCard.addEventListener('drop', handleFileDrop);
// Initialize on load
window.addEventListener('load', initModels);