/**
 * Granite Speech WebGPU Demo
 * Uses Transformers.js v4 for in-browser speech recognition
 */
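// External helpers assumed to be provided by companion scripts loaded by the page:
// punctuator.js supplies loadPunctuator()/applyPunctuation(), and a VAD helper script
// is expected to supply loadVAD(), getSpeechSegments(), and formatTimestamp().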

import {
    AutoProcessor,
    GraniteSpeechForConditionalGeneration,
    TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.0.0-next.7';
import { detect } from 'https://cdn.jsdelivr.net/npm/tinyld/+esm';

// Model
const MODEL_ID = 'onnx-community/granite-4.0-1b-speech-ONNX';

// Audio config
const SAMPLE_RATE = 16000;
const MAX_NEW_TOKENS = 256;

// Task prompts — <|audio|> is expanded by the processor's chat template
const TASK_PROMPTS = {
    'transcribe':   '<|audio|>Transcribe the speech to text',
    'translate_en': '<|audio|>Translate the speech to English',
    'translate_fr': '<|audio|>Translate the speech to French',
    'translate_de': '<|audio|>Translate the speech to German',
    'translate_es': '<|audio|>Translate the speech to Spanish',
    'translate_pt': '<|audio|>Translate the speech to Portuguese',
    'translate_ja': '<|audio|>Translate the speech to Japanese',
};
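// Note: the keys above are assumed to match the <option> values of the #promptSelect
// element in the page markup; unknown values fall back to 'transcribe' in transcribeSegment().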

// State
let model = null;
let processor = null;
let isModelLoading = false;
let currentAudioData = null;

// DOM Elements
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const recordBtn = document.getElementById('recordBtn');
const audioFile = document.getElementById('audioFile');
const fileTile = document.querySelector('.file-label');
const inputCard = document.querySelector('.input-card');
const audioPreview = document.getElementById('audioPreview');
const audioPlayer = document.getElementById('audioPlayer');
const playBtn = document.getElementById('playBtn');
const waveformCanvas = document.getElementById('waveformCanvas');
const waveformProgress = document.getElementById('waveformProgress');
const audioTime = document.getElementById('audioTime');
const transcribeSection = document.getElementById('transcribeSection');
const transcribeBtn = document.getElementById('transcribeBtn');
const promptSelect = document.getElementById('promptSelect');
const punctuationCheckbox = document.getElementById('punctuationCheckbox');
const transcriptCard = document.getElementById('transcriptCard');
const outputText = document.getElementById('outputText');
const copyBtn = document.getElementById('copyBtn');
const downloadBtn = document.getElementById('downloadBtn');
const clearBtn = document.getElementById('clearBtn');
const progressSection = document.getElementById('progressSection');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const vadCheckbox = document.getElementById('vadCheckbox');
const gpuInfo = document.getElementById('gpuInfo');

// Recording state
let mediaRecorder = null;
let audioChunks = [];
let transcriptionAborted = false;

// Utility functions
function setStatus(status, message) {
    statusDot.className = `status-dot ${status}`;
    statusText.textContent = message;
}

// Punctuation is handled by punctuator.js (applyPunctuation function)

function showProgress(show) {
    progressSection.style.display = show ? 'block' : 'none';
}

function updateProgress(progress, text) {
    progressFill.style.width = `${progress}%`;
    progressText.textContent = text;
}

// Check WebGPU support
async function checkWebGPU() {
    if (!navigator.gpu) {
        gpuInfo.textContent = 'WebGPU not supported. Use Chrome 113+ or Edge 113+';
        gpuInfo.style.color = '#e74c3c';
        return false;
    }

    try {
        const adapter = await navigator.gpu.requestAdapter();
        if (!adapter) {
            gpuInfo.textContent = 'No WebGPU adapter available';
            gpuInfo.style.color = '#f39c12';
            return false;
        }

        return true;
    } catch (e) {
        console.error('WebGPU error:', e);
        gpuInfo.textContent = `WebGPU error: ${e.message || e}`;
        gpuInfo.style.color = '#e74c3c';
        return false;
    }
}
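// Note: the check above is informational; initModels() below still requests
// device: 'webgpu' regardless of the result, so a failed check only updates the gpuInfo banner.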

// Initialize models using Transformers.js v4
async function initModels() {
    if (isModelLoading) return;
    isModelLoading = true;

    setStatus('loading', 'Loading processor...');

    try {
        await checkWebGPU();

        processor = await AutoProcessor.from_pretrained(MODEL_ID);

        setStatus('loading', 'Downloading models...');
        progressFill.style.width = '0%';
        let lastProgressUpdate = 0;
        const fileProgress = {};
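        // q4f16 below selects a 4-bit weight quantization variant (roughly, 4-bit weights with
        // fp16 compute) for each sub-model: audio encoder, token embeddings, merged decoder.
        // This trades a little accuracy for a much smaller download; other Transformers.js
        // dtypes (e.g. 'fp16') could be swapped in if quality matters more than size.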
        model = await GraniteSpeechForConditionalGeneration.from_pretrained(MODEL_ID, {
            dtype: {
                audio_encoder: 'q4f16',
                embed_tokens: 'q4f16',
                decoder_model_merged: 'q4f16',
            },
            device: 'webgpu',
            progress_callback: (progress) => {
                if (progress.status === 'progress' && progress.total) {
                    fileProgress[progress.file] = { loaded: progress.loaded, total: progress.total };
                    const now = performance.now();
                    if (now - lastProgressUpdate < 100) return;
                    lastProgressUpdate = now;
                    let totalLoaded = 0, totalSize = 0;
                    for (const f of Object.values(fileProgress)) {
                        totalLoaded += f.loaded;
                        totalSize += f.total;
                    }
                    const pct = totalSize > 0 ? (totalLoaded / totalSize) * 100 : 0;
                    progressFill.style.width = `${pct}%`;
                    const mb = (totalLoaded / 1e6).toFixed(0);
                    const totalMb = (totalSize / 1e6).toFixed(0);
                    setStatus('loading', `Downloading models... ${mb} / ${totalMb} MB`);
                }
            },
        });

        setStatus('loading', 'Loading VAD and punctuation models...');
        await Promise.all([loadVAD(), loadPunctuator()]);

        progressFill.style.width = '0%';
        setStatus('ready', 'Ready - Record or upload audio');
        enableControls(true);

    } catch (error) {
        console.error('Model loading failed:', error);
        console.error('Error stack:', error?.stack);
        const errorMsg = error?.message || error?.toString() || 'Unknown error';
        setStatus('error', `Error: ${errorMsg}`);
        progressFill.style.width = '0%';
        isModelLoading = false;
    }
}

function enableControls(enabled) {
    recordBtn.disabled = !enabled;
    audioFile.disabled = !enabled;
}

// Transcribe a single audio segment and return the text
async function transcribeSegment(audioSegment, onPartialResult) {
    // Build prompt using chat template
    const taskKey = promptSelect.value;
    const content = TASK_PROMPTS[taskKey] || TASK_PROMPTS['transcribe'];
    const messages = [{ role: 'user', content }];

    const text = processor.tokenizer.apply_chat_template(messages, {
        add_generation_prompt: true,
        tokenize: false,
    });

    // Process text + audio into model inputs
    const inputs = await processor(text, audioSegment, { sampling_rate: SAMPLE_RATE });

    // Streaming via TextStreamer
    let accumulated = '';
    const streamer = new TextStreamer(processor.tokenizer, {
        skip_prompt: true,
        skip_special_tokens: true,
        callback_function: (chunk) => {
            accumulated += chunk;
            if (onPartialResult) {
                onPartialResult(accumulated);
            }
        },
    });

    // Generate
    await model.generate({
        ...inputs,
        max_new_tokens: MAX_NEW_TOKENS,
        streamer,
    });

    return accumulated;
}

// Wait until audio playback reaches a specific time
// (currently unused: the transcribe() loop waits with setTimeout on wall-clock time instead)
function waitForPlaybackTime(targetTime) {
    return new Promise((resolve) => {
        const check = () => {
            if (audioPlayer.paused || audioPlayer.currentTime >= targetTime) {
                resolve();
            } else {
                requestAnimationFrame(check);
            }
        };
        check();
    });
}

// Run inference with segmentation and audio sync
async function transcribe() {
    if (!model || !processor || !currentAudioData) {
        setStatus('error', 'Model or audio not ready');
        return;
    }

    setStatus('processing', 'Processing audio...');
    transcribeBtn.disabled = true;
    transcriptionAborted = false;
    outputText.textContent = '';
    transcriptCard.style.display = 'block';
    showProgress(true);

    try {
        // Get speech segments using VAD, or treat entire audio as one segment
        let segments;
        if (vadCheckbox.checked) {
            updateProgress(5, 'Detecting speech segments...');
            segments = await getSpeechSegments(currentAudioData, SAMPLE_RATE);
            console.log(`VAD found ${segments.length} segment(s)`);
        } else {
            segments = [{ start: 0, end: currentAudioData.length / SAMPLE_RATE }];
        }

        // Start audio playback immediately
        audioPlayer.currentTime = 0;
        audioPlayer.play();
        playBtn.querySelector('.play-icon').style.display = 'none';
        playBtn.querySelector('.pause-icon').style.display = 'block';
        const playbackStartTime = performance.now() / 1000;

        // Process and display segments in sync with audio
        const displayedResults = [];
        const totalSegments = segments.length;

        for (let segIdx = 0; segIdx < totalSegments; segIdx++) {
            if (transcriptionAborted) break;

            const seg = segments[segIdx];

            // Update progress bar
            const segProgress = ((segIdx + 1) / totalSegments) * 100;
            updateProgress(segProgress, '');

            // Wait for audio to reach this segment's start time
            const elapsed = (performance.now() / 1000) - playbackStartTime;
            const waitTime = seg.start - elapsed;
            if (waitTime > 0) {
                await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
            }

            setStatus('processing', `Segment ${segIdx + 1}/${totalSegments}`);

            // Extract and transcribe this segment
            const startSample = Math.floor(seg.start * SAMPLE_RATE);
            const endSample = Math.floor(seg.end * SAMPLE_RATE);
            const audioSegment = currentAudioData.slice(startSample, endSample);

            const timestamp = formatTimestamp(seg.start);
            const makeRow = (ts, text) => `<div class="transcript-row"><span class="timestamp">${ts}</span><span class="transcript-text">${text}</span></div>`;

            // Transcribe with streaming display
            const segmentText = await transcribeSegment(audioSegment, (partial) => {
                const escaped = partial.replace(/</g, '&lt;').replace(/>/g, '&gt;');
                const rows = [...displayedResults, makeRow(timestamp, escaped)];
                outputText.innerHTML = rows.join('');
                outputText.scrollTop = outputText.scrollHeight;
            });

            if (segmentText.trim()) {
                let finalSegmentText = segmentText.trim();
                // Apply punctuation/capitalization for English only
                if (punctuationCheckbox.checked) {
                    const detectedLang = detect(finalSegmentText);
                    if (detectedLang === 'en') {
                        const stripped = finalSegmentText.replace(/[.,!?]/g, ' ').replace(/\s+/g, ' ').trim();
                        finalSegmentText = await applyPunctuation(stripped, 'en');
                        finalSegmentText = finalSegmentText.replace(/<unk>/gi, ' ').replace(/\s+/g, ' ').trim();
                    }
                }
                const escaped = finalSegmentText.replace(/</g, '&lt;').replace(/>/g, '&gt;');
                displayedResults.push(makeRow(timestamp, escaped));
                outputText.innerHTML = displayedResults.join('');
                outputText.scrollTop = outputText.scrollHeight;
            }
        }

        // Final output
        if (displayedResults.length === 0) {
            outputText.innerHTML = '<span style="color: #94a3b8;">(No speech detected)</span>';
        }
        copyBtn.disabled = false;

        showProgress(false);
        setStatus('ready', 'Transcription complete');

    } catch (error) {
        console.error('Transcription failed:', error);
        setStatus('error', `Error: ${error.message}`);
        showProgress(false);
    }

    transcribeBtn.disabled = false;
}

// Audio recording
let isRecording = false;

function toggleRecording() {
    if (isRecording) {
        stopRecording();
    } else {
        startRecording();
    }
}

async function startRecording() {
    try {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

        mediaRecorder = new MediaRecorder(stream);
        audioChunks = [];

        mediaRecorder.ondataavailable = (event) => {
            audioChunks.push(event.data);
        };

        mediaRecorder.onstop = async () => {
            // MediaRecorder emits a compressed container (typically webm/ogg), not WAV,
            // so label the blob with the recorder's actual MIME type
            const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || 'audio/webm' });
            const audioUrl = URL.createObjectURL(audioBlob);
            audioPlayer.src = audioUrl;
            audioPreview.style.display = 'flex';
            transcribeSection.style.display = 'flex';

            await processAudioBlob(audioBlob);
            drawWaveform();
            updateAudioTime();
            stream.getTracks().forEach(track => track.stop());
        };

        mediaRecorder.start();
        isRecording = true;
        setStatus('recording', 'Recording...');

        // Update button UI
        recordBtn.querySelector('.mic-icon').style.display = 'none';
        recordBtn.querySelector('.stop-icon').style.display = 'block';
        recordBtn.querySelector('span').textContent = 'Stop';
        recordBtn.classList.add('recording');

    } catch (error) {
        console.error('Recording failed:', error);
        setStatus('error', 'Microphone access denied');
    }
}

function stopRecording() {
    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
        mediaRecorder.stop();
        isRecording = false;
        setStatus('ready', 'Recording stopped - Click Transcribe');

        // Update button UI
        recordBtn.querySelector('.mic-icon').style.display = 'block';
        recordBtn.querySelector('.stop-icon').style.display = 'none';
        recordBtn.querySelector('span').textContent = 'Record';
        recordBtn.classList.remove('recording');
    }
}

// Process audio file/blob
async function processAudioBlob(blob) {
    try {
        const arrayBuffer = await blob.arrayBuffer();
        const audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
        const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

        // Convert to mono Float32Array
        let audioData;
        if (audioBuffer.numberOfChannels > 1) {
            const left = audioBuffer.getChannelData(0);
            const right = audioBuffer.getChannelData(1);
            audioData = new Float32Array(left.length);
            for (let i = 0; i < left.length; i++) {
                audioData[i] = (left[i] + right[i]) / 2;
            }
        } else {
            audioData = audioBuffer.getChannelData(0);
        }

        // Resample if needed
        if (audioBuffer.sampleRate !== SAMPLE_RATE) {
            audioData = resample(audioData, audioBuffer.sampleRate, SAMPLE_RATE);
        }

        currentAudioData = audioData;
        transcribeBtn.disabled = false;

    } catch (error) {
        console.error('Audio processing failed:', error);
        setStatus('error', 'Failed to process audio');
    }
}

// Simple linear resampling
function resample(audioData, fromRate, toRate) {
    const ratio = fromRate / toRate;
    const newLength = Math.round(audioData.length / ratio);
    const result = new Float32Array(newLength);

    for (let i = 0; i < newLength; i++) {
        const srcIndex = i * ratio;
        const srcIndexFloor = Math.floor(srcIndex);
        const srcIndexCeil = Math.min(srcIndexFloor + 1, audioData.length - 1);
        const t = srcIndex - srcIndexFloor;
        result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
    }

    return result;
}
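// Note: this linear interpolation does not low-pass filter before downsampling, so some
// aliasing is possible; it is usually acceptable for 16 kHz speech input like this demo's.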

// Handle file upload
async function handleFileUpload(event) {
    const file = event.target.files[0];
    if (!file) return;
    await loadAudioFile(file);
}

// Handle dropped files
async function handleFileDrop(event) {
    event.preventDefault();
    inputCard.classList.remove('drag-over');

    const file = event.dataTransfer.files[0];
    if (!file || !file.type.startsWith('audio/')) {
        setStatus('error', 'Please drop an audio file');
        return;
    }
    await loadAudioFile(file);
}

// Common file loading logic
async function loadAudioFile(file) {
    setStatus('processing', 'Processing audio file...');

    const audioUrl = URL.createObjectURL(file);
    audioPlayer.src = audioUrl;
    audioPreview.style.display = 'flex';
    transcribeSection.style.display = 'flex';

    await processAudioBlob(file);
    drawWaveform();
    updateAudioTime();
    setStatus('ready', 'Audio loaded - Click Transcribe');
}

// Draw waveform visualization
function drawWaveform() {
    if (!currentAudioData) return;

    const canvas = waveformCanvas;
    const ctx = canvas.getContext('2d');
    const dpr = window.devicePixelRatio || 1;

    // Set canvas size
    const rect = canvas.getBoundingClientRect();
    canvas.width = rect.width * dpr;
    canvas.height = rect.height * dpr;
    ctx.scale(dpr, dpr);
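    // Assigning canvas.width/height above resets the 2D context state, so the
    // dpr scale does not compound when drawWaveform() re-runs on window resize.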

    const width = rect.width;
    const height = rect.height;
    const centerY = height / 2;

    // Downsample audio data for visualization
    const samples = currentAudioData;
    const barCount = Math.floor(width / 3);
    const samplesPerBar = Math.floor(samples.length / barCount);

    // Calculate bar amplitudes
    const barAmplitudes = [];
    for (let i = 0; i < barCount; i++) {
        let sum = 0;
        const start = i * samplesPerBar;
        for (let j = 0; j < samplesPerBar; j++) {
            sum += Math.abs(samples[start + j] || 0);
        }
        barAmplitudes.push(sum / samplesPerBar);
    }

    // Find max amplitude for normalization
    const maxAmp = Math.max(...barAmplitudes, 0.01);

    // Get color based on color scheme
    const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
    ctx.fillStyle = isDark ? '#64748b' : '#cbd5e1';

    // Draw bars normalized to fill height
    for (let i = 0; i < barCount; i++) {
        const normalized = barAmplitudes[i] / maxAmp;
        const barHeight = Math.max(2, normalized * height * 0.9);

        ctx.fillRect(i * 3, centerY - barHeight / 2, 2, barHeight);
    }
}

// Format time as M:SS
function formatTime(seconds) {
    const mins = Math.floor(seconds / 60);
    const secs = Math.floor(seconds % 60);
    return `${mins}:${secs.toString().padStart(2, '0')}`;
}

// Update audio time display
function updateAudioTime() {
    const current = audioPlayer.currentTime || 0;
    const duration = audioPlayer.duration || 0;
    if (duration > 0) {
        audioTime.textContent = `${formatTime(current)} / ${formatTime(duration)}`;
        waveformProgress.style.width = `${(current / duration) * 100}%`;
    } else {
        audioTime.textContent = formatTime(currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0);
    }
}

// Toggle play/pause
function togglePlayback() {
    if (audioPlayer.paused) {
        audioPlayer.play();
        playBtn.querySelector('.play-icon').style.display = 'none';
        playBtn.querySelector('.pause-icon').style.display = 'block';
    } else {
        audioPlayer.pause();
        playBtn.querySelector('.play-icon').style.display = 'block';
        playBtn.querySelector('.pause-icon').style.display = 'none';
        // Stop transcription if running
        if (!transcriptionAborted && transcribeBtn.disabled) {
            transcriptionAborted = true;
            showProgress(false);
            setStatus('ready', 'Transcription stopped');
            transcribeBtn.disabled = false;
        }
    }
}

// Seek in audio
function seekAudio(event) {
    if (!audioPlayer.duration) return; // nothing loaded or metadata not ready yet
    const rect = waveformCanvas.getBoundingClientRect();
    const x = event.clientX - rect.left;
    const percent = x / rect.width;
    audioPlayer.currentTime = percent * audioPlayer.duration;
    updateAudioTime();
}

// Copy to clipboard
async function copyToClipboard() {
    try {
        await navigator.clipboard.writeText(outputText.textContent);
        // Brief visual feedback via title attribute
        const originalTitle = copyBtn.title;
        copyBtn.title = 'Copied!';
        setTimeout(() => {
            copyBtn.title = originalTitle;
        }, 2000);
    } catch (error) {
        console.error('Copy failed:', error);
    }
}

function downloadTranscript() {
    // Convert transcript rows to plain-text lines and strip remaining HTML
    const text = outputText.innerHTML
        .replace(/<br\s*\/?>/gi, '\n')
        .replace(/<\/div>/gi, '\n')
        .replace(/<\/span>/gi, ' ')
        .replace(/<[^>]+>/g, '')
        .trim();
    if (!text) return;

    const blob = new Blob([text], { type: 'text/plain' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = 'transcript.txt';
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    URL.revokeObjectURL(url);

    // Brief visual feedback
    const originalTitle = downloadBtn.title;
    downloadBtn.title = 'Downloaded!';
    setTimeout(() => {
        downloadBtn.title = originalTitle;
    }, 2000);
}

function clearAudio() {
    // Stop any playback
    audioPlayer.pause();
    audioPlayer.src = '';

    // Reset audio state
    currentAudioData = null;

    // Hide audio player and transcribe section
    audioPreview.style.display = 'none';
    transcribeSection.style.display = 'none';

    // Clear transcript
    transcriptCard.style.display = 'none';
    outputText.textContent = '';

    // Reset waveform
    waveformProgress.style.width = '0%';
    const ctx = waveformCanvas.getContext('2d');
    ctx.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);

    // Reset time display
    audioTime.textContent = '0:00';

    // Reset buttons
    transcribeBtn.disabled = true;

    // Reset file input
    audioFile.value = '';

    // Update status
    setStatus('ready', 'Ready');
}

// Event listeners
recordBtn.addEventListener('click', toggleRecording);
audioFile.addEventListener('change', handleFileUpload);

// Audio player controls
playBtn.addEventListener('click', togglePlayback);
waveformCanvas.addEventListener('click', seekAudio);
audioPlayer.addEventListener('timeupdate', updateAudioTime);
audioPlayer.addEventListener('ended', () => {
    playBtn.querySelector('.play-icon').style.display = 'block';
    playBtn.querySelector('.pause-icon').style.display = 'none';
    waveformProgress.style.width = '0%';
});

// Redraw waveform on resize
window.addEventListener('resize', drawWaveform);
transcribeBtn.addEventListener('click', transcribe);
copyBtn.addEventListener('click', copyToClipboard);
downloadBtn.addEventListener('click', downloadTranscript);
clearBtn.addEventListener('click', clearAudio);

// Drag and drop on input card
inputCard.addEventListener('dragover', (e) => {
    e.preventDefault();
    inputCard.classList.add('drag-over');
});
inputCard.addEventListener('dragleave', (e) => {
    e.preventDefault();
    inputCard.classList.remove('drag-over');
});
inputCard.addEventListener('drop', handleFileDrop);

// Initialize on load
window.addEventListener('load', initModels);