| |
| |
| |
| |
|
|
| import { |
| AutoProcessor, |
| GraniteSpeechForConditionalGeneration, |
| TextStreamer, |
| } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.0.0-next.7'; |
| import { detect } from 'https://cdn.jsdelivr.net/npm/tinyld/+esm'; |
|
|
| |
// Hugging Face model id for the ONNX export of Granite speech-to-text.
const MODEL_ID = 'onnx-community/granite-4.0-1b-speech-ONNX';

// All audio is processed as mono at this rate; generation is capped per segment.
const SAMPLE_RATE = 16000;
const MAX_NEW_TOKENS = 256;

// Chat prompt per task; '<|audio|>' marks where the audio features are injected.
const TASK_PROMPTS = {
  'transcribe': '<|audio|>Transcribe the speech to text',
  'translate_en': '<|audio|>Translate the speech to English',
  'translate_fr': '<|audio|>Translate the speech to French',
  'translate_de': '<|audio|>Translate the speech to German',
  'translate_es': '<|audio|>Translate the speech to Spanish',
  'translate_pt': '<|audio|>Translate the speech to Portuguese',
  'translate_ja': '<|audio|>Translate the speech to Japanese',
};

// Model/processor instances; null until initModels() completes.
let model = null;
let processor = null;
// Guards against concurrent initModels() calls.
let isModelLoading = false;
// Decoded mono Float32Array samples at SAMPLE_RATE, or null when no audio is loaded.
let currentAudioData = null;

// Cached DOM references (ids/classes defined in the page markup).
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const recordBtn = document.getElementById('recordBtn');
const audioFile = document.getElementById('audioFile');
const fileTile = document.querySelector('.file-label');
const inputCard = document.querySelector('.input-card');
const audioPreview = document.getElementById('audioPreview');
const audioPlayer = document.getElementById('audioPlayer');
const playBtn = document.getElementById('playBtn');
const waveformCanvas = document.getElementById('waveformCanvas');
const waveformProgress = document.getElementById('waveformProgress');
const audioTime = document.getElementById('audioTime');
const transcribeSection = document.getElementById('transcribeSection');
const transcribeBtn = document.getElementById('transcribeBtn');
const promptSelect = document.getElementById('promptSelect');
const punctuationCheckbox = document.getElementById('punctuationCheckbox');
const transcriptCard = document.getElementById('transcriptCard');
const outputText = document.getElementById('outputText');
const copyBtn = document.getElementById('copyBtn');
const downloadBtn = document.getElementById('downloadBtn');
const clearBtn = document.getElementById('clearBtn');
const progressSection = document.getElementById('progressSection');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const vadCheckbox = document.getElementById('vadCheckbox');
const gpuInfo = document.getElementById('gpuInfo');

// Microphone capture state and the cooperative cancellation flag for transcribe().
let mediaRecorder = null;
let audioChunks = [];
let transcriptionAborted = false;
|
|
| |
/**
 * Update the status indicator (dot + text) in the header.
 * @param {string} status - CSS modifier for the dot ('loading'|'ready'|'processing'|'recording'|'error').
 * @param {string} message - Human-readable status text.
 */
function setStatus(status, message) {
  statusText.textContent = message;
  statusDot.className = ['status-dot', status].join(' ');
}
|
|
| |
|
|
/** Show or hide the progress section. */
function showProgress(show) {
  if (show) {
    progressSection.style.display = 'block';
  } else {
    progressSection.style.display = 'none';
  }
}
|
|
/**
 * Update the progress bar fill and its label.
 * @param {number} progress - Percentage in [0, 100].
 * @param {string} text - Label shown under the bar.
 */
function updateProgress(progress, text) {
  progressText.textContent = text;
  progressFill.style.width = `${progress}%`;
}
|
|
| |
/**
 * Probe for WebGPU support and surface the result in the gpuInfo element.
 * @returns {Promise<boolean>} true when a WebGPU adapter is available.
 */
async function checkWebGPU() {
  // Report a failure message in the given color and signal "unavailable".
  const reportFailure = (message, color) => {
    gpuInfo.textContent = message;
    gpuInfo.style.color = color;
    return false;
  };

  if (!navigator.gpu) {
    return reportFailure('WebGPU not supported. Use Chrome 113+ or Edge 113+', '#e74c3c');
  }

  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) {
      return reportFailure('No WebGPU adapter available', '#f39c12');
    }
    return true;
  } catch (e) {
    console.error('WebGPU error:', e);
    return reportFailure(`WebGPU error: ${e.message || e}`, '#e74c3c');
  }
}
|
|
| |
/**
 * Download and initialize the processor, speech model (on WebGPU), and the
 * auxiliary VAD / punctuation models. Safe to call repeatedly; re-entry is
 * blocked while a load is in flight.
 */
async function initModels() {
  if (isModelLoading) return;
  isModelLoading = true;

  setStatus('loading', 'Loading processor...');

  try {
    await checkWebGPU();

    processor = await AutoProcessor.from_pretrained(MODEL_ID);

    setStatus('loading', 'Downloading models...');
    progressFill.style.width = '0%';

    // Per-file byte counters aggregated into a single bar; UI refreshes are
    // throttled to at most one per 100 ms.
    const perFile = {};
    let lastUiRefresh = 0;
    const onProgress = (p) => {
      if (p.status !== 'progress' || !p.total) return;
      perFile[p.file] = { loaded: p.loaded, total: p.total };
      const now = performance.now();
      if (now - lastUiRefresh < 100) return;
      lastUiRefresh = now;
      let loadedBytes = 0;
      let totalBytes = 0;
      for (const entry of Object.values(perFile)) {
        loadedBytes += entry.loaded;
        totalBytes += entry.total;
      }
      const pct = totalBytes > 0 ? (loadedBytes / totalBytes) * 100 : 0;
      progressFill.style.width = `${pct}%`;
      const mb = (loadedBytes / 1e6).toFixed(0);
      const totalMb = (totalBytes / 1e6).toFixed(0);
      setStatus('loading', `Downloading models... ${mb} / ${totalMb} MB`);
    };

    model = await GraniteSpeechForConditionalGeneration.from_pretrained(MODEL_ID, {
      dtype: {
        audio_encoder: 'q4f16',
        embed_tokens: 'q4f16',
        decoder_model_merged: 'q4f16',
      },
      device: 'webgpu',
      progress_callback: onProgress,
    });

    setStatus('loading', 'Loading VAD and punctuation models...');
    await Promise.all([loadVAD(), loadPunctuator()]);

    progressFill.style.width = '0%';
    setStatus('ready', 'Ready - Record or upload audio');
    enableControls(true);
  } catch (error) {
    console.error('Model loading failed:', error);
    console.error('Error stack:', error?.stack);
    const errorMsg = error?.message || error?.toString() || 'Unknown error';
    setStatus('error', `Error: ${errorMsg}`);
    progressFill.style.width = '0%';
    // Allow a retry after a failed load.
    isModelLoading = false;
  }
}
|
|
/** Enable or disable the record button and file input together. */
function enableControls(enabled) {
  const disabled = !enabled;
  recordBtn.disabled = disabled;
  audioFile.disabled = disabled;
}
|
|
| |
/**
 * Run the speech model on one audio segment, streaming partial text.
 * @param {Float32Array} audioSegment - Mono samples at SAMPLE_RATE.
 * @param {(text: string) => void} [onPartialResult] - Receives the accumulated text so far.
 * @returns {Promise<string>} The full generated text for the segment.
 */
async function transcribeSegment(audioSegment, onPartialResult) {
  // Pick the prompt for the selected task, defaulting to plain transcription.
  const taskKey = promptSelect.value;
  const content = TASK_PROMPTS[taskKey] || TASK_PROMPTS['transcribe'];

  // Chat-format the prompt (audio placeholder + instruction) without tokenizing.
  const text = processor.tokenizer.apply_chat_template([{ role: 'user', content }], {
    add_generation_prompt: true,
    tokenize: false,
  });

  const inputs = await processor(text, audioSegment, { sampling_rate: SAMPLE_RATE });

  // Accumulate decoded tokens and surface partial results as they arrive.
  let accumulated = '';
  const streamer = new TextStreamer(processor.tokenizer, {
    skip_prompt: true,
    skip_special_tokens: true,
    callback_function: (chunk) => {
      accumulated += chunk;
      onPartialResult?.(accumulated);
    },
  });

  await model.generate({
    ...inputs,
    max_new_tokens: MAX_NEW_TOKENS,
    streamer,
  });

  return accumulated;
}
|
|
| |
/**
 * Resolve once playback reaches targetTime (seconds) or the player is paused.
 * Polls via requestAnimationFrame.
 */
function waitForPlaybackTime(targetTime) {
  return new Promise((resolve) => {
    const poll = () => {
      const done = audioPlayer.paused || audioPlayer.currentTime >= targetTime;
      if (done) {
        resolve();
        return;
      }
      requestAnimationFrame(poll);
    };
    poll();
  });
}
|
|
| |
/**
 * Transcribe/translate the loaded audio segment by segment, keeping the
 * transcript in sync with audio playback. Results stream into the transcript
 * card; pausing playback (togglePlayback) aborts the run via
 * transcriptionAborted.
 *
 * Fix: the HTML "escaping" of model output was a no-op
 * (`replace(/</g, '<')` replaces '<' with itself), so untrusted model /
 * punctuator output reached innerHTML unescaped. Output is now escaped
 * properly (& first, then < and >) before being interpolated into markup.
 */
async function transcribe() {
  if (!model || !processor || !currentAudioData) {
    setStatus('error', 'Model or audio not ready');
    return;
  }

  // Escape untrusted text before it is placed into innerHTML.
  const escapeHtml = (s) => s
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  setStatus('processing', 'Processing audio...');
  transcribeBtn.disabled = true;
  transcriptionAborted = false;
  outputText.textContent = '';
  transcriptCard.style.display = 'block';
  showProgress(true);

  try {
    // Optionally narrow work to detected speech regions via VAD.
    let segments;
    if (vadCheckbox.checked) {
      updateProgress(5, 'Detecting speech segments...');
      segments = await getSpeechSegments(currentAudioData, SAMPLE_RATE);
      console.log(`VAD found ${segments.length} segment(s)`);
    } else {
      segments = [{ start: 0, end: currentAudioData.length / SAMPLE_RATE }];
    }

    // Start playback so transcription tracks the audio position.
    audioPlayer.currentTime = 0;
    audioPlayer.play();
    playBtn.querySelector('.play-icon').style.display = 'none';
    playBtn.querySelector('.pause-icon').style.display = 'block';
    const playbackStartTime = performance.now() / 1000;

    const displayedResults = [];
    const totalSegments = segments.length;

    for (let segIdx = 0; segIdx < totalSegments; segIdx++) {
      if (transcriptionAborted) break;

      const seg = segments[segIdx];

      const segProgress = ((segIdx + 1) / totalSegments) * 100;
      updateProgress(segProgress, '');

      // Wait (wall clock) until playback should have reached this segment.
      const elapsed = (performance.now() / 1000) - playbackStartTime;
      const waitTime = seg.start - elapsed;
      if (waitTime > 0) {
        await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
      }

      setStatus('processing', `Segment ${segIdx + 1}/${totalSegments}`);

      // Slice the raw samples for this segment.
      const startSample = Math.floor(seg.start * SAMPLE_RATE);
      const endSample = Math.floor(seg.end * SAMPLE_RATE);
      const audioSegment = currentAudioData.slice(startSample, endSample);

      const timestamp = formatTimestamp(seg.start);
      const makeRow = (ts, text) => `<div class="transcript-row"><span class="timestamp">${ts}</span><span class="transcript-text">${text}</span></div>`;

      // Stream partial text into the transcript view as it is generated.
      const segmentText = await transcribeSegment(audioSegment, (partial) => {
        const rows = [...displayedResults, makeRow(timestamp, escapeHtml(partial))];
        outputText.innerHTML = rows.join('');
        outputText.scrollTop = outputText.scrollHeight;
      });

      if (segmentText.trim()) {
        let finalSegmentText = segmentText.trim();
        // Optionally re-punctuate English output with the punctuation model.
        if (punctuationCheckbox.checked) {
          const detectedLang = detect(finalSegmentText);
          if (detectedLang === 'en') {
            const stripped = finalSegmentText.replace(/[.,!?]/g, ' ').replace(/\s+/g, ' ').trim();
            finalSegmentText = await applyPunctuation(stripped, 'en');
            finalSegmentText = finalSegmentText.replace(/<unk>/gi, ' ').replace(/\s+/g, ' ').trim();
          }
        }
        displayedResults.push(makeRow(timestamp, escapeHtml(finalSegmentText)));
        outputText.innerHTML = displayedResults.join('');
        outputText.scrollTop = outputText.scrollHeight;
      }
    }

    if (displayedResults.length === 0) {
      outputText.innerHTML = '<span style="color: #94a3b8;">(No speech detected)</span>';
    }
    copyBtn.disabled = false;

    showProgress(false);
    setStatus('ready', 'Transcription complete');
  } catch (error) {
    console.error('Transcription failed:', error);
    setStatus('error', `Error: ${error.message}`);
    showProgress(false);
  } finally {
    // Always re-enable the button, even after an unexpected throw.
    transcribeBtn.disabled = false;
  }
}
|
|
| |
// True while the microphone is actively capturing.
let isRecording = false;

/** Toggle between recording and idle states. */
function toggleRecording() {
  if (!isRecording) {
    startRecording();
    return;
  }
  stopRecording();
}
|
|
/**
 * Start capturing microphone audio with MediaRecorder. On stop, the captured
 * chunks are assembled into a Blob, previewed in the player, and decoded for
 * transcription.
 *
 * Fix: the recorded Blob was labeled 'audio/wav', but MediaRecorder produces
 * a compressed container (typically audio/webm or audio/ogg, never WAV). The
 * Blob is now labeled with the recorder's actual mimeType.
 */
async function startRecording() {
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    mediaRecorder = new MediaRecorder(stream);
    audioChunks = [];

    mediaRecorder.ondataavailable = (event) => {
      audioChunks.push(event.data);
    };

    mediaRecorder.onstop = async () => {
      // Use the real container type reported by the recorder.
      const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || 'audio/webm' });
      const audioUrl = URL.createObjectURL(audioBlob);
      audioPlayer.src = audioUrl;
      audioPreview.style.display = 'flex';
      transcribeSection.style.display = 'flex';

      await processAudioBlob(audioBlob);
      drawWaveform();
      updateAudioTime();
      // Release the microphone.
      stream.getTracks().forEach(track => track.stop());
    };

    mediaRecorder.start();
    isRecording = true;
    setStatus('recording', 'Recording...');

    // Swap the record button into its "stop" appearance.
    recordBtn.querySelector('.mic-icon').style.display = 'none';
    recordBtn.querySelector('.stop-icon').style.display = 'block';
    recordBtn.querySelector('span').textContent = 'Stop';
    recordBtn.classList.add('recording');
  } catch (error) {
    console.error('Recording failed:', error);
    setStatus('error', 'Microphone access denied');
  }
}
|
|
/** Stop an active recording and restore the record button UI. */
function stopRecording() {
  if (!mediaRecorder || mediaRecorder.state === 'inactive') return;

  mediaRecorder.stop();
  isRecording = false;
  setStatus('ready', 'Recording stopped - Click Transcribe');

  // Swap the button back to its "record" appearance.
  recordBtn.querySelector('.mic-icon').style.display = 'block';
  recordBtn.querySelector('.stop-icon').style.display = 'none';
  recordBtn.querySelector('span').textContent = 'Record';
  recordBtn.classList.remove('recording');
}
|
|
| |
/**
 * Decode an audio Blob/File to mono Float32 samples at SAMPLE_RATE and store
 * them in currentAudioData; enables the transcribe button on success.
 *
 * Fix: the AudioContext was created but never closed, leaking a live context
 * on every decode (browsers cap the number of concurrent AudioContexts). The
 * context is now closed in a finally block.
 */
async function processAudioBlob(blob) {
  let audioCtx = null;
  try {
    const arrayBuffer = await blob.arrayBuffer();
    audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

    // Downmix to mono by averaging the first two channels.
    let audioData;
    if (audioBuffer.numberOfChannels > 1) {
      const left = audioBuffer.getChannelData(0);
      const right = audioBuffer.getChannelData(1);
      audioData = new Float32Array(left.length);
      for (let i = 0; i < left.length; i++) {
        audioData[i] = (left[i] + right[i]) / 2;
      }
    } else {
      audioData = audioBuffer.getChannelData(0);
    }

    // Some browsers ignore the requested context rate; resample if needed.
    if (audioBuffer.sampleRate !== SAMPLE_RATE) {
      audioData = resample(audioData, audioBuffer.sampleRate, SAMPLE_RATE);
    }

    currentAudioData = audioData;
    transcribeBtn.disabled = false;
  } catch (error) {
    console.error('Audio processing failed:', error);
    setStatus('error', 'Failed to process audio');
  } finally {
    // Release the decoding context; a close failure is non-fatal.
    if (audioCtx) {
      audioCtx.close().catch(() => {});
    }
  }
}
|
|
| |
/**
 * Linearly resample mono audio from one sample rate to another.
 * @param {Float32Array} audioData - Input samples.
 * @param {number} fromRate - Source sample rate (Hz).
 * @param {number} toRate - Target sample rate (Hz).
 * @returns {Float32Array} Output of length round(len * toRate / fromRate).
 */
function resample(audioData, fromRate, toRate) {
  const step = fromRate / toRate;
  const outLength = Math.round(audioData.length / step);
  const out = new Float32Array(outLength);

  for (let i = 0; i < outLength; i++) {
    // Position of this output sample in source coordinates.
    const pos = i * step;
    const lo = Math.floor(pos);
    const hi = Math.min(lo + 1, audioData.length - 1);
    const frac = pos - lo;
    // Linear interpolation between the two nearest source samples.
    out[i] = audioData[lo] * (1 - frac) + audioData[hi] * frac;
  }

  return out;
}
|
|
| |
/** Handle a file chosen via the file input. */
async function handleFileUpload(event) {
  const file = event.target.files[0];
  if (file) {
    await loadAudioFile(file);
  }
}
|
|
| |
/** Handle an audio file dropped onto the input card. */
async function handleFileDrop(event) {
  event.preventDefault();
  inputCard.classList.remove('drag-over');

  const file = event.dataTransfer.files[0];
  const isAudio = Boolean(file) && file.type.startsWith('audio/');
  if (!isAudio) {
    setStatus('error', 'Please drop an audio file');
    return;
  }
  await loadAudioFile(file);
}
|
|
| |
/**
 * Load an audio File/Blob into the player and decode it for transcription.
 *
 * Fix: every load created a fresh object URL without releasing the previous
 * one, leaking the underlying Blob for the lifetime of the page. Any prior
 * blob: URL on the player is now revoked before it is replaced.
 */
async function loadAudioFile(file) {
  setStatus('processing', 'Processing audio file...');

  // Release the previous object URL before replacing it.
  if (audioPlayer.src && audioPlayer.src.startsWith('blob:')) {
    URL.revokeObjectURL(audioPlayer.src);
  }

  const audioUrl = URL.createObjectURL(file);
  audioPlayer.src = audioUrl;
  audioPreview.style.display = 'flex';
  transcribeSection.style.display = 'flex';

  await processAudioBlob(file);
  drawWaveform();
  updateAudioTime();
  setStatus('ready', 'Audio loaded - Click Transcribe');
}
|
|
| |
/**
 * Render currentAudioData as vertical amplitude bars on the waveform canvas.
 *
 * Fix: for very short clips (fewer samples than bars) samplesPerBar was 0,
 * making every bar amplitude 0/0 = NaN and the whole waveform invisible.
 * The bucket size is now clamped to >= 1 and degenerate sizes are skipped.
 */
function drawWaveform() {
  if (!currentAudioData) return;

  const canvas = waveformCanvas;
  const ctx = canvas.getContext('2d');
  const dpr = window.devicePixelRatio || 1;

  // Match the backing store to the CSS size for crisp rendering on HiDPI.
  const rect = canvas.getBoundingClientRect();
  canvas.width = rect.width * dpr;
  canvas.height = rect.height * dpr;
  ctx.scale(dpr, dpr);

  const width = rect.width;
  const height = rect.height;
  const centerY = height / 2;

  const samples = currentAudioData;
  const barCount = Math.floor(width / 3); // one 2px bar per 3px column
  if (barCount <= 0 || samples.length === 0) return;
  const samplesPerBar = Math.max(1, Math.floor(samples.length / barCount));

  // Mean absolute amplitude per bar (out-of-range samples count as 0).
  const barAmplitudes = [];
  for (let i = 0; i < barCount; i++) {
    let sum = 0;
    const start = i * samplesPerBar;
    for (let j = 0; j < samplesPerBar; j++) {
      sum += Math.abs(samples[start + j] || 0);
    }
    barAmplitudes.push(sum / samplesPerBar);
  }

  // Normalize against the loudest bar; floor avoids divide-by-zero on silence.
  const maxAmp = Math.max(...barAmplitudes, 0.01);

  const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
  ctx.fillStyle = isDark ? '#64748b' : '#cbd5e1';

  for (let i = 0; i < barCount; i++) {
    const normalized = barAmplitudes[i] / maxAmp;
    const barHeight = Math.max(2, normalized * height * 0.9);
    ctx.fillRect(i * 3, centerY - barHeight / 2, 2, barHeight);
  }
}
|
|
| |
/**
 * Format a non-negative duration in seconds as "m:ss".
 * @param {number} seconds
 * @returns {string}
 */
function formatTime(seconds) {
  const totalSecs = Math.floor(seconds);
  const mins = Math.floor(totalSecs / 60);
  const secs = totalSecs % 60;
  return `${mins}:${String(secs).padStart(2, '0')}`;
}
|
|
| |
/** Refresh the time label and waveform progress overlay from the player. */
function updateAudioTime() {
  const current = audioPlayer.currentTime || 0;
  const duration = audioPlayer.duration || 0;

  // No metadata yet: show the decoded buffer length (or 0:00) instead.
  if (duration <= 0) {
    const fallbackSecs = currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0;
    audioTime.textContent = formatTime(fallbackSecs);
    return;
  }

  audioTime.textContent = `${formatTime(current)} / ${formatTime(duration)}`;
  waveformProgress.style.width = `${(current / duration) * 100}%`;
}
|
|
| |
/**
 * Toggle audio playback. Pausing while a transcription is in flight also
 * aborts the transcription run (via transcriptionAborted).
 */
function togglePlayback() {
  if (audioPlayer.paused) {
    audioPlayer.play();
    playBtn.querySelector('.play-icon').style.display = 'none';
    playBtn.querySelector('.pause-icon').style.display = 'block';
    return;
  }

  audioPlayer.pause();
  playBtn.querySelector('.play-icon').style.display = 'block';
  playBtn.querySelector('.pause-icon').style.display = 'none';

  // A disabled transcribe button means a run is in flight; stop it.
  const transcriptionRunning = !transcriptionAborted && transcribeBtn.disabled;
  if (transcriptionRunning) {
    transcriptionAborted = true;
    showProgress(false);
    setStatus('ready', 'Transcription stopped');
    transcribeBtn.disabled = false;
  }
}
|
|
| |
/**
 * Seek playback to the clicked position on the waveform canvas.
 *
 * Fix: before any media metadata is loaded, audioPlayer.duration is NaN and
 * assigning a non-finite value to currentTime throws a TypeError. The seek
 * is now guarded, and the click position is clamped to [0, 1].
 */
function seekAudio(event) {
  const duration = audioPlayer.duration;
  if (!Number.isFinite(duration)) return;

  const rect = waveformCanvas.getBoundingClientRect();
  const x = event.clientX - rect.left;
  const percent = Math.min(1, Math.max(0, x / rect.width));
  audioPlayer.currentTime = percent * duration;
  updateAudioTime();
}
|
|
| |
/** Copy the transcript text to the clipboard, with brief tooltip feedback. */
async function copyToClipboard() {
  try {
    await navigator.clipboard.writeText(outputText.textContent);

    // Flash "Copied!" in the tooltip for two seconds.
    const originalTitle = copyBtn.title;
    copyBtn.title = 'Copied!';
    setTimeout(() => { copyBtn.title = originalTitle; }, 2000);
  } catch (error) {
    console.error('Copy failed:', error);
  }
}
|
|
/**
 * Download the transcript as a plain-text file.
 *
 * Fix: stripping all tags from the transcript HTML ran every row together —
 * timestamps and text from adjacent rows were concatenated with no
 * separator. Each transcript row now ends with a newline and the
 * timestamp/text spans are separated by a space.
 */
function downloadTranscript() {
  const text = outputText.innerHTML
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/div>/gi, '\n')   // one transcript row per line
    .replace(/<\/span>/gi, ' ')   // space between timestamp and text
    .replace(/<[^>]+>/g, '')
    .replace(/[ \t]+\n/g, '\n')   // drop trailing spaces introduced above
    .trim();
  if (!text) return;

  // Trigger a download via a temporary anchor element.
  const blob = new Blob([text], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = 'transcript.txt';
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  URL.revokeObjectURL(url);

  // Flash "Downloaded!" in the tooltip for two seconds.
  const originalTitle = downloadBtn.title;
  downloadBtn.title = 'Downloaded!';
  setTimeout(() => {
    downloadBtn.title = originalTitle;
  }, 2000);
}
|
|
/**
 * Reset the UI and state to "no audio loaded".
 *
 * Fix: the player's blob: object URL was discarded without being revoked,
 * keeping the underlying Blob alive for the lifetime of the page; it is now
 * revoked before the source is cleared.
 */
function clearAudio() {
  // Stop playback and release the object URL backing the player.
  audioPlayer.pause();
  if (audioPlayer.src && audioPlayer.src.startsWith('blob:')) {
    URL.revokeObjectURL(audioPlayer.src);
  }
  audioPlayer.src = '';

  // Drop the decoded samples.
  currentAudioData = null;

  // Hide audio preview and transcribe controls.
  audioPreview.style.display = 'none';
  transcribeSection.style.display = 'none';

  // Hide and empty the transcript card.
  transcriptCard.style.display = 'none';
  outputText.textContent = '';

  // Reset the waveform display.
  waveformProgress.style.width = '0%';
  const ctx = waveformCanvas.getContext('2d');
  ctx.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);

  // Reset the time label.
  audioTime.textContent = '0:00';

  // Nothing to transcribe until new audio arrives.
  transcribeBtn.disabled = true;

  // Allow re-selecting the same file.
  audioFile.value = '';

  setStatus('ready', 'Ready');
}
|
|
| |
// --- Event wiring ---

// Input controls: record toggle and file picker.
recordBtn.addEventListener('click', toggleRecording);
audioFile.addEventListener('change', handleFileUpload);

// Playback controls: play/pause, click-to-seek, and time display updates.
playBtn.addEventListener('click', togglePlayback);
waveformCanvas.addEventListener('click', seekAudio);
audioPlayer.addEventListener('timeupdate', updateAudioTime);
audioPlayer.addEventListener('ended', () => {
  // Restore the play icon and reset the progress overlay when playback ends.
  playBtn.querySelector('.play-icon').style.display = 'block';
  playBtn.querySelector('.pause-icon').style.display = 'none';
  waveformProgress.style.width = '0%';
});

// Redraw the waveform when the layout changes; transcript action buttons.
window.addEventListener('resize', drawWaveform);
transcribeBtn.addEventListener('click', transcribe);
copyBtn.addEventListener('click', copyToClipboard);
downloadBtn.addEventListener('click', downloadTranscript);
clearBtn.addEventListener('click', clearAudio);

// Drag-and-drop upload onto the input card (highlight while hovering).
inputCard.addEventListener('dragover', (e) => {
  e.preventDefault();
  inputCard.classList.add('drag-over');
});
inputCard.addEventListener('dragleave', (e) => {
  e.preventDefault();
  inputCard.classList.remove('drag-over');
});
inputCard.addEventListener('drop', handleFileDrop);

// Start downloading models as soon as the page loads.
window.addEventListener('load', initModels);
|
|