Spaces:
Running
Running
| # Created by Kasun Ranasinghe (@kasunUoM) | Oct.2025 | |
| # | |
| # Gradio App for SinhalaVITS TTS Inference | |
| # =================================================== | |
| import gradio as gr | |
| from TTS.utils.synthesizer import Synthesizer | |
| from romanizer import sinhala_to_roman | |
| from huggingface_hub import hf_hub_download | |
| import os | |
| import numpy as np | |
| from pathlib import Path | |
| import csv | |
| from datetime import datetime | |
| # --------------- | |
| # FILE CREATIONS | |
| # --------------- | |
# Directory holding everything the app persists between requests.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# CSV log of user ratings and a plain-text counter of successful inferences.
RATINGS_FILE = DATA_DIR / "ratings.csv"
INFERENCE_FILE = DATA_DIR / "inference_count.txt"

# Seed the ratings log with its header row the first time the app runs.
if not RATINGS_FILE.exists():
    with RATINGS_FILE.open("w", newline="", encoding="utf-8") as handle:
        header = ["timestamp", "speaker", "text", "mos", "comment"]
        csv.writer(handle).writerow(header)

# Start the inference counter at zero when no counter file is present yet.
if not INFERENCE_FILE.exists():
    INFERENCE_FILE.write_text("0")
| # ----------------------- | |
| # AUTHENTICATION SETUP | |
| # ----------------------- | |
| # Get the token from Secrets | |
# Access token for the model repositories, read from the HF_TOKEN
# environment variable (None when it is not set).
HF_TOKEN = os.environ.get("HF_TOKEN")

# A missing or empty token is only warned about here; the download calls
# further down are what actually fail for private repos.
if HF_TOKEN is None or HF_TOKEN == "":
    print("WARNING: HF_TOKEN secret not found. Access to private repos will fail.")
| # ------------------ | |
| # SPEAKER MAPPING | |
| # ------------------ | |
# Display name shown in the UI -> internal model key.
# Insertion order matters: it is the order of the dropdown choices.
SPEAKER_MAPPING = dict(
    [
        ("Male Voice 1", "Roshan"),
        ("Female Voice 1", "Nipunika"),
        ("Male Voice 2", "Sanjaya"),
        ("Female Voice 2", "Sanuki"),
    ]
)

# Reverse lookup (model key -> display name), used to pick the default voice.
INVERSE_MAPPING = dict(zip(SPEAKER_MAPPING.values(), SPEAKER_MAPPING.keys()))
| # ------------------------------- | |
| # Load Multiple Speaker Models | |
| # ------------------------------- | |
def load_models():
    """Download and instantiate one synthesizer per speaker.

    For every speaker the checkpoint and config files are fetched with
    ``hf_hub_download`` (passing ``HF_TOKEN``) and wrapped in a CPU-only
    ``Synthesizer``. Loading is best-effort: a speaker that fails to download
    or initialise is reported and skipped so the remaining voices still work.

    Returns:
        dict: model key (e.g. ``"Roshan"``) -> loaded ``Synthesizer``.
        Speakers that failed to load are absent from the dict.
    """
    model_sources = {
        "Roshan": {
            "repo": "dialoglk/SinhalaVITS-TTS-M2",
            "model_file": "Roshan_270000.pth",
            "config_file": "Roshan_config.json",
        },
        "Nipunika": {
            "repo": "dialoglk/SinhalaVITS-TTS-F1",
            "model_file": "Nipunika_210000.pth",
            "config_file": "Nipunika_config.json",
        },
        "Sanjaya": {
            "repo": "dialoglk/SinhalaVITS-TTS-M1",
            "model_file": "Sanjaya_170000.pth",
            "config_file": "Sanjaya_config.json",
        },
        "Sanuki": {
            "repo": "dialoglk/SinhalaVITS-TTS-F2",
            "model_file": "Sanuki_190000.pth",
            "config_file": "Sanuki_config.json",
        },
    }

    loaded = {}
    failed = []
    print("Downloading and loading models...")
    for spk, info in model_sources.items():
        try:
            print(f"Loading speaker: {spk}")
            # Pass token to access private repos
            ckpt_path = hf_hub_download(
                repo_id=info["repo"],
                filename=info["model_file"],
                token=HF_TOKEN,
            )
            cfg_path = hf_hub_download(
                repo_id=info["repo"],
                filename=info["config_file"],
                token=HF_TOKEN,
            )
            loaded[spk] = Synthesizer(
                tts_checkpoint=ckpt_path,
                tts_config_path=cfg_path,
                use_cuda=False,
            )
            print(f"✅ {spk} Loaded")
        except Exception as e:
            # Deliberately broad: one broken speaker must not take the app down.
            failed.append(spk)
            print(f"❌ Failed to load {spk}: {e}")

    # Only claim success when every speaker actually loaded.
    if failed:
        print(
            f"⚠️ Loaded {len(loaded)}/{len(model_sources)} models; "
            f"failed: {', '.join(failed)}"
        )
    else:
        print("All models loaded successfully.")
    return loaded


# Load models globally once
MODELS = load_models()
| # ------------------------------- | |
| # The Core Inference Function | |
| # ------------------------------- | |
| # Speaker in the Display Name | |
def _increment_inference_count():
    """Best-effort bump of the persisted inference counter.

    Failures here are reported but never raised, so a damaged or unwritable
    counter file cannot discard audio that was synthesized successfully.
    """
    try:
        count = int(INFERENCE_FILE.read_text().strip())
    except (OSError, ValueError):
        # Missing or corrupted counter file: restart counting from zero.
        count = 0
    try:
        INFERENCE_FILE.write_text(str(count + 1))
    except OSError as e:
        print(f"Warning: could not update inference counter: {e}")


def generate_speech(sinhala_text, speaker):
    """Synthesize speech for Sinhala text with the selected voice.

    Args:
        sinhala_text: Text to speak; it is romanized with ``sinhala_to_roman``
            before being passed to the model.
        speaker: Display name of the voice (a key of ``SPEAKER_MAPPING``).

    Returns:
        ``(sample_rate, samples)`` with ``samples`` as a NumPy array on
        success, or ``None`` when the input is empty, the voice is not
        available, or synthesis fails. ``None`` (rather than a tuple of
        ``None`` values) is what clears a single audio output component.
    """
    # Guard against both a missing value and whitespace-only input.
    if not sinhala_text or not sinhala_text.strip():
        print("Please input a text and then try..")
        return None

    # Convert display name back to the actual model key
    model_key = SPEAKER_MAPPING.get(speaker)
    if model_key not in MODELS:
        print(f"Error: Model key '{model_key}' (from '{speaker}') not found.")
        return None

    print(f"Generating speech...")
    try:
        # 1. Convert Sinhala → Roman
        roman_text = sinhala_to_roman(sinhala_text)
        print(f"Romanized text: {roman_text}")
        # 2. Generate audio
        model = MODELS[model_key]
        wav = model.tts(roman_text)
        sample_rate = model.output_sample_rate
        # 3. Convert to Numpy Array
        audio = np.array(wav)
    except Exception as e:
        print(f"Error generating speech: {e}")
        return None

    print("Speech generated successfully.")
    # Counted outside the try block so bookkeeping cannot void the result.
    _increment_inference_count()
    return (sample_rate, audio)
| # Rating Save Function | |
| # --------------------- | |
def save_rating(mos, comment, text, speaker):
    """Append one rating to the ratings CSV and report the running total.

    Args:
        mos: Selected opinion score; a falsy value means nothing was chosen.
        comment: Free-text comment from the rater.
        text: The text that was synthesized.
        speaker: Display name of the voice that was rated.

    Returns:
        tuple: ``(status_message, update_for_rating_widget)``. The update
        resets the rating selection after a successful save and leaves it
        untouched when no rating was selected.
    """
    if not mos:
        return "⚠️ Please select a rating before submitting.", gr.update()

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(RATINGS_FILE, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow([timestamp, speaker, text, mos, comment])

    # Count CSV records rather than physical lines: the text and comment
    # fields may contain newlines, which the writer stores as quoted
    # multi-line fields, so a raw line count would overstate the total.
    with open(RATINGS_FILE, "r", newline="", encoding="utf-8") as f:
        total = sum(1 for _ in csv.reader(f)) - 1  # exclude header

    return (
        f"✅ Thank you for your feedback! ⭐ Total ratings: {total}",
        gr.update(value=None),
    )
| # ----------------------------------- | |
| # GRADIO UI (Clean Light Theme) | |
| # ----------------------------------- | |
| # Simple Light Theme - Force light colors everywhere | |
# Light theme: every "*_dark" variable mirrors its light counterpart so the
# app looks the same whether or not the browser requests dark mode.
theme = gr.themes.Default(
    primary_hue="red",
    secondary_hue="orange",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
).set(
    # Force light backgrounds
    body_background_fill="*neutral_50",
    body_background_fill_dark="*neutral_50",
    # Blocks
    block_background_fill="#dcfaf5",
    block_background_fill_dark="#dcfaf5",
    # Text
    body_text_color="*neutral_500",
    body_text_color_dark="*neutral_500",
    # Palette shades run 50, 100, 200 ... 900, 950; there is no 350 step,
    # so the subdued colour uses the nearest defined shade.
    body_text_color_subdued="*neutral_400",
    body_text_color_subdued_dark="*neutral_400",
    # Labels
    block_label_text_color="*neutral_500",
    block_label_text_color_dark="*neutral_500",
    # Primary button - brand red
    button_primary_background_fill="#C40D42",
    button_primary_background_fill_dark="#C40D42",
    button_primary_background_fill_hover="#A00B36",
    button_primary_background_fill_hover_dark="#A00B36",
    button_primary_text_color="white",
    button_primary_text_color_dark="white",
    # Secondary button
    button_secondary_background_fill="#6689AB",
    button_secondary_background_fill_dark="#6689AB",
    button_secondary_text_color="white",
    button_secondary_text_color_dark="white",
    # Inputs
    input_background_fill="white",
    input_background_fill_dark="white",
    input_border_color="*neutral_300",
    input_border_color_dark="*neutral_300",
    # Shadows
    shadow_drop="0 1px 3px 0 rgb(0 0 0 / 0.1)",
    shadow_drop_lg="0 10px 15px -3px rgb(0 0 0 / 0.1)",
)
| # Comprehensive CSS to force light theme and fix visibility | |
# Custom stylesheet passed to gr.Blocks. It forces a light appearance and
# overrides component colours. The `.guidelines` rule uses `color` (the
# original `font-color` is not a CSS property and was ignored by browsers).
css = """
/* Force light theme globally */
:root, .dark {
    color-scheme: light !important;
}
/* Container */
.container {
    max-width: 1200px;
    margin: auto;
    padding: 2rem 1rem;
}
/* Body - light gradient background */
body, .gradio-container {
    background: linear-gradient(135deg, #FAFAFA 0%, #F5F5F5 100%) !important;
    color: #1a1a1a !important;
}
/* Force all text to be dark */
* {
    color: #1a1a1a !important;
}
/* Headers and titles - except where we want colors */
h1, h2, h3, h4, h5, h6 {
    color: #1a1a1a !important;
}
/* All blocks white background */
.gr-block, .gr-group, .gr-box, .gr-form, .gr-panel {
    background: white !important;
    border: 1px solid #e0e0e0 !important;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05) !important;
    border-radius: 8px !important;
}
/* Force section headers ("Input Text", "Choose Voice Model", "Audio Result") to white */
.gr-panel-header,
.gr-group-label {
    color: white !important;
}
/* Header styling with your colors */
#header {
    text-align: center;
    margin-bottom: 2rem;
    padding: 2rem 1rem;
    background: white !important;
    border-radius: 12px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08) !important;
}
#header h1 {
    font-size: 2.5rem;
    font-weight: 900;
    margin: 0;
    background: linear-gradient(135deg, #C40D42 0%, #F15A22 50%, #FDB934 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
}
#header h3 {
    margin: 1rem 0 0 0;
    color: #333 !important;
    font-weight: 600;
}
#header h4 {
    margin: 0.5rem 0 0 0;
    color: #666 !important;
    font-weight: 400;
}
/* Labels - dark and bold */
label, .label, .gr-label {
    color: #faf7f7 !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
}
.label-text {
    color: #faf7f7 !important;
    font-weight: 700 !important;
    font-size: 1rem !important;
    margin-bottom: 0.5rem;
    display: block;
}
/* Textbox - white with dark text */
.gr-textbox,
.gr-textbox textarea,
textarea,
input[type="text"] {
    background: white !important;
    border: 1.5px solid #ddd !important;
    color: #1a1a1a !important;
    font-size: 1rem !important;
}
.gr-textbox:focus,
textarea:focus,
input[type="text"]:focus {
    border-color: #C40D42 !important;
    outline: none !important;
    box-shadow: 0 0 0 3px rgba(196, 13, 66, 0.1) !important;
}
/* Dropdown - fix all text visibility */
.gr-dropdown,
.gr-dropdown *,
.svelte-1gfkn6j,
.svelte-1gfkn6j * {
    color: #1a1a1a !important;
    background: white !important;
}
.gr-dropdown {
    border: 1.5px solid #ddd !important;
}
/* Dropdown button/trigger */
.gr-dropdown .wrap,
.gr-dropdown button {
    background: white !important;
    color: #1a1a1a !important;
    font-weight: 500 !important;
}
/* Dropdown options container */
.gr-dropdown-options,
.gr-dropdown ul,
[role="listbox"] {
    background: white !important;
    border: 1px solid #ddd !important;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15) !important;
    z-index: 9999 !important;
}
/* Dropdown individual options */
.gr-dropdown-option,
.gr-dropdown li,
[role="option"] {
    color: #1a1a1a !important;
    background: white !important;
    padding: 0.5rem 1rem !important;
}
.gr-dropdown-option:hover,
.gr-dropdown li:hover,
[role="option"]:hover {
    background: #f5f5f5 !important;
    color: #C40D42 !important;
}
/* Audio Player - fix all button and text colors */
.gr-audio,
.gr-audio *,
audio,
audio * {
    color: #1a1a1a !important;
}
.gr-audio {
    background: #fafafa !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 8px !important;
    padding: 1rem !important;
}
/* Audio controls - make buttons visible */
.gr-audio button,
.gr-audio [role="button"],
audio::-webkit-media-controls-panel,
audio::-webkit-media-controls-play-button,
audio::-webkit-media-controls-timeline,
audio::-webkit-media-controls-current-time-display,
audio::-webkit-media-controls-time-remaining-display,
audio::-webkit-media-controls-mute-button,
audio::-webkit-media-controls-volume-slider {
    background: #1a1a1a !important;
    color: white !important;
    filter: invert(0) !important;
}
/* Primary Button - your red color */
button.primary,
.gr-button-primary,
button[variant="primary"] {
    background: linear-gradient(135deg, #C40D42 0%, #A00B36 100%) !important;
    color: white !important;
    border: none !important;
    font-weight: 600 !important;
    padding: 0.75rem 2rem !important;
    box-shadow: 0 4px 12px rgba(196, 13, 66, 0.3) !important;
    transition: all 0.3s ease !important;
}
button.primary:hover,
.gr-button-primary:hover {
    background: linear-gradient(135deg, #A00B36 0%, #7B1B67 100%) !important;
    box-shadow: 0 6px 16px rgba(196, 13, 66, 0.4) !important;
    transform: translateY(-2px) !important;
}
/* Guidelines box */
.guidelines {
    background: linear-gradient(135deg, #FFF5E1 0%, #FFE8CC 100%);
    border-left: 4px solid #F7941E;
    padding: 1.5rem;
    margin-top: 1rem;
    color: #333 !important;
    font-size: 0.95rem;
    border-radius: 8px;
}
.guidelines strong {
    color: #C40D42 !important;
    font-weight: 700;
}
.guidelines ul {
    margin: 0.75rem 0 0 1.5rem;
    padding: 0;
}
.guidelines li {
    color: #333 !important;
    margin: 0.5rem 0;
}
.guidelines dt {
    color: #666 !important;
    font-style: italic;
}
/* Logo section */
#logo-row {
    justify-content: center;
    align-items: center;
    background: white !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 10px !important;
    padding: 1rem !important;
    max-width: 450px;
    margin: 2rem auto;
}
.partner-logo {
    background: white !important;
    padding: 1rem !important;
    border-radius: 8px !important;
    border: 1px solid #e0e0e0 !important;
    transition: all 0.3s ease;
}
.partner-logo:hover {
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
    transform: translateY(-4px);
}
.partner-logo img {
    max-height: 110px !important;
    width: auto !important;
    object-fit: contain;
}
/* Footer */
.footer {
    text-align: center;
    margin-top: 2rem;
    padding: 1rem;
    color: #666 !important;
    background: white;
    border-radius: 8px;
}
.footer * {
    color: #666 !important;
}
/* Result header */
.result-header {
    color: #1a1a1a !important;
    font-weight: 700 !important;
}
/* Status messages */
.status-message {
    padding: 1rem;
    border-radius: 8px;
    font-weight: 600;
    text-align: center;
    margin: 1rem 0;
}
.status-generating {
    background: #FFF4E6;
    color: #C77700 !important;
    border: 1.5px solid #FFD8A8;
}
.status-generating * {
    color: #C77700 !important;
}
.status-completed {
    background: #E8F5E9;
    color: #2E7D32 !important;
    border: 1.5px solid #A5D6A7;
}
.status-completed * {
    color: #2E7D32 !important;
}
.status-initial {
    background: #F5F5F5;
    color: #666 !important;
    border: 1.5px solid #E0E0E0;
}
.status-initial * {
    color: #666 !important;
}
/* Markdown content */
.markdown-text, .prose {
    color: #1a1a1a !important;
}
/* Make sure spans inherit correct colors when needed */
span[style*="color"] {
    /* Allow inline color styles to work */
}
/* Radio buttons - fix label and option colors */
.gr-radio,
.gr-radio label,
[data-testid="radio-group"] label,
[data-testid="radio-group"] span {
    color: #333333 !important;
    background: transparent !important;
}
/* Unselected radio option */
[data-testid="radio-group"] .selected,
input[type="radio"] + span {
    background: white !important;
    border: 1.5px solid #ddd !important;
    color: #333 !important;
}
/* Selected/checked radio option */
[data-testid="radio-group"] input[type="radio"]:checked + span,
.gr-radio input[type="radio"]:checked ~ span {
    background: #FFF0F4 !important; /* light pinkish tint */
    border-color: #C40D42 !important;
    color: #C40D42 !important;
    font-weight: 600 !important;
}
"""
# Status banners shown under the audio player.
_STATUS_GENERATING = (
    "<div class='status-message status-generating'>"
    "⏳ Generating speech, please wait...</div>"
)
_STATUS_COMPLETED = (
    "<div class='status-message status-completed'>"
    "✅ Speech generated successfully! Use the player above to listen.</div>"
)
_STATUS_FAILED = (
    "<div class='status-message status-initial'>"
    "⚠️ Could not generate speech. Please enter Sinhala text and try again.</div>"
)


def _default_speaker():
    """Return the display name the voice dropdown starts on.

    Prefers "Sanuki" when that model is loaded, otherwise the first display
    name whose model is loaded, and only as a last resort the first entry
    of SPEAKER_MAPPING.
    """
    preferred = INVERSE_MAPPING.get("Sanuki")
    if preferred is not None and "Sanuki" in MODELS:
        return preferred
    for display_name, model_key in SPEAKER_MAPPING.items():
        if model_key in MODELS:
            return display_name
    return next(iter(SPEAKER_MAPPING))


def _begin_generation():
    """Show the 'generating' banner and clear any previous audio.

    Clearing the player first means a failed run cannot be mistaken for a
    successful one because of stale audio left over from an earlier request.
    """
    return _STATUS_GENERATING, None


def _finish_generation(audio):
    """Pick the final banner and rating-panel visibility from the audio value.

    Args:
        audio: Current value of the audio component; None when nothing was
            generated.

    Returns:
        tuple: (status HTML, visibility update for the rating section).
    """
    if audio is None:
        return _STATUS_FAILED, gr.update(visible=False)
    return _STATUS_COMPLETED, gr.update(visible=True)


# NOTE(review): the widget nesting below was reconstructed from the section
# comments because the source copy had lost its indentation - confirm the
# layout against the running app.
with gr.Blocks(theme=theme, css=css, title="SinhalaVITS Playground") as demo:
    # Header
    gr.HTML(
        """
        <div id='header'>
        <h1> SinhalaVITS Playground</h1>
        <h3>An Open Source Sinhala Text-to-Speech Model</h3>
        <h4>by Dialog Axiata PLC in collaboration with</h4>
        <h4>Dialog-University of Moratuwa Mobile Communications Research Laboratory</h4>
        </div>
        """
    )
    with gr.Row():
        # Left column: inputs + button
        with gr.Column(scale=2):
            with gr.Group(elem_id="input-group"):
                gr.Markdown("<span class='label-text'>✍️ Input Text</span>")
                sinhala_text = gr.Textbox(
                    placeholder="මෙතන සිංහලෙන් ටයිප් කරන්න...",
                    lines=6,
                    show_label=False,
                    elem_id="text-input",
                )
                # Voice selector
                gr.Markdown("<span class='label-text'>🗣️ Choose Voice Model</span>")
                speaker = gr.Dropdown(
                    choices=list(SPEAKER_MAPPING.keys()),
                    value=_default_speaker(),
                    interactive=True,
                    show_label=False,
                    elem_id="voice-selector",
                )
            # Main action button
            generate_btn = gr.Button(
                "🔊 Generate Speech",
                variant="primary",
                size="lg",
                elem_id="gen-btn",
            )
        # Right column: Player and Guidelines
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("#### <span class='result-header'>🎧 Audio Result</span>")
                audio_output = gr.Audio(
                    label="",
                    type="numpy",
                    show_label=False,
                    interactive=False,
                    autoplay=False,
                    elem_id="audio-player",
                )
                # Status area
                status_text = gr.HTML(
                    "<div class='status-message status-initial'>Enter text and press 'Generate Speech' to begin</div>"
                )
            # -------------------------
            # Benchmarking Section
            # -------------------------
            # Hidden until a clip has been generated successfully.
            with gr.Column(visible=False) as rating_section:
                gr.Markdown("#### ⭐ Rate This Speech")
                mos_rating = gr.Radio(
                    choices=[
                        ("1 - Very Bad", 1),
                        ("2 - Bad", 2),
                        ("3 - Fair", 3),
                        ("4 - Good", 4),
                        ("5 - Excellent", 5),
                    ],
                    label="How natural does this speech sound?",
                    interactive=True,
                )
                comment_box = gr.Textbox(
                    placeholder="Optional comment...",
                    lines=2,
                )
                submit_rating_btn = gr.Button("📤 Submit", variant="secondary")
                rating_status = gr.Markdown("")
            # Guidelines
            gr.HTML(
                """
                <div class="guidelines">
                <strong>⚠️ Please Note:</strong>
                <ul>
                <li>Do <b>not</b> use <i>English letters</i> or <i>numerals</i> in the input. Use only <b>Sinhala Unicode</b> text.</li>
                <li>Use proper punctuation (.,?) to help the model with phrasing.</li>
                <li>Keep input short for a single request to speed up synthesizing.</li>
                </ul>
                <dl style="margin-top: 0.75rem;">
                <dt>* The speech quality depends on the trained dataset and may not sound completely natural.</dt>
                </dl>
                </div>
                """
            )
    # Spacer
    gr.HTML("<div style='height: 3rem;'></div>")
    # Logo Section
    with gr.Row(equal_height=True, elem_id="logo-row"):
        # UoM Logo
        gr.Image(
            value="img/lablogo.png",
            label="UoM Research Lab Logo",
            show_label=False,
            show_download_button=False,  # Removes the Download (arrow) button
            show_share_button=False,  # Removes the Share button
            show_fullscreen_button=False,  # Removes the Maximize/Fullscreen button
            interactive=False,
            elem_classes=["partner-logo"],
            container=False,
        )
    # Footer
    gr.HTML(
        """
        <div class='footer'>
        <i>This playground is developed by Kasun Ranasinghe (@KasunUoM)</i>
        </div>
        """
    )

    # Event Handlers
    # Step 1: Set generating status and clear the previous clip
    set_status = generate_btn.click(
        fn=_begin_generation,
        inputs=[],
        outputs=[status_text, audio_output],
    )
    # Step 2: Generate speech
    synth_call = set_status.then(
        fn=generate_speech,
        inputs=[sinhala_text, speaker],
        outputs=[audio_output],
    )
    # Step 3: Report the real outcome. `.then` runs whether or not step 2
    # produced audio, so the banner and the rating panel are decided from
    # the player's value instead of assuming success.
    synth_call.then(
        fn=_finish_generation,
        inputs=[audio_output],
        outputs=[status_text, rating_section],
    )
    # Step 4: Connect Rating Button
    submit_rating_btn.click(
        fn=save_rating,
        inputs=[mos_rating, comment_box, sinhala_text, speaker],
        outputs=[rating_status, mos_rating],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()