# SinhalaVITS / app.py
# Last commit by KasunUoM: "Block color change" (871f1a5, verified)
# Created by Kasun Ranasinghe (@kasunUoM) | Oct.2025
#
# Gradio App for SinhalaVITS TTS Inference
# ===================================================
import gradio as gr
from TTS.utils.synthesizer import Synthesizer
from romanizer import sinhala_to_roman
from huggingface_hub import hf_hub_download
import os
import numpy as np
from pathlib import Path
import csv
from datetime import datetime
# ---------------
# FILE CREATIONS
# ---------------
# Locations of the feedback log and the usage counter, all under ./data.
DATA_DIR = Path("data")
RATINGS_FILE = DATA_DIR.joinpath("ratings.csv")
INFERENCE_FILE = DATA_DIR.joinpath("inference_count.txt")

DATA_DIR.mkdir(exist_ok=True)

# Seed each file on first start only; files that already exist are left as-is.
if not INFERENCE_FILE.exists():
    INFERENCE_FILE.write_text("0")
if not RATINGS_FILE.exists():
    with RATINGS_FILE.open("w", newline="", encoding="utf-8") as ratings_fh:
        # Header row; save_rating() appends rows in this same column order.
        csv.writer(ratings_fh).writerow(
            ["timestamp", "speaker", "text", "mos", "comment"]
        )
# -----------------------
# AUTHENTICATION SETUP
# -----------------------
# Get the token from Secrets
# Hub access token read from the environment; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None or HF_TOKEN == "":
    # Only warn here: the downloads are attempted later and report their own errors.
    print("WARNING: HF_TOKEN secret not found. Access to private repos will fail.")
# ------------------
# SPEAKER MAPPING
# ------------------
# Display label shown in the UI -> internal speaker key used to index MODELS.
SPEAKER_MAPPING = dict(
    [
        ("Male Voice 1", "Roshan"),
        ("Female Voice 1", "Nipunika"),
        ("Male Voice 2", "Sanjaya"),
        ("Female Voice 2", "Sanuki"),
    ]
)
# Reverse lookup (speaker key -> display label), used to pick the default voice.
INVERSE_MAPPING = dict(zip(SPEAKER_MAPPING.values(), SPEAKER_MAPPING.keys()))
# -------------------------------
# Load Multiple Speaker Models
# -------------------------------
def load_models():
    """Download and load one TTS synthesizer per speaker.

    For every speaker the checkpoint and config files are fetched with
    ``hf_hub_download`` (passing ``HF_TOKEN``) and wrapped in a CPU-only
    ``Synthesizer``. Loading is best-effort: a speaker whose download or
    initialisation fails is reported and skipped, so the app can still start
    with the remaining voices.

    Returns:
        dict: speaker key -> loaded ``Synthesizer``. Speakers that failed to
        load are absent from the mapping.
    """
    model_sources = {
        "Roshan": {
            "repo": "dialoglk/SinhalaVITS-TTS-M2",
            "model_file": "Roshan_270000.pth",
            "config_file": "Roshan_config.json",
        },
        "Nipunika": {
            "repo": "dialoglk/SinhalaVITS-TTS-F1",
            "model_file": "Nipunika_210000.pth",
            "config_file": "Nipunika_config.json",
        },
        "Sanjaya": {
            "repo": "dialoglk/SinhalaVITS-TTS-M1",
            "model_file": "Sanjaya_170000.pth",
            "config_file": "Sanjaya_config.json",
        },
        "Sanuki": {
            "repo": "dialoglk/SinhalaVITS-TTS-F2",
            "model_file": "Sanuki_190000.pth",
            "config_file": "Sanuki_config.json",
        },
    }

    loaded = {}
    failed = []
    print("Downloading and loading models...")
    for spk, info in model_sources.items():
        print(f"Loading speaker: {spk}")
        try:
            # Pass token to access private repos
            ckpt_path = hf_hub_download(
                repo_id=info["repo"],
                filename=info["model_file"],
                token=HF_TOKEN,
            )
            cfg_path = hf_hub_download(
                repo_id=info["repo"],
                filename=info["config_file"],
                token=HF_TOKEN,
            )
            loaded[spk] = Synthesizer(
                tts_checkpoint=ckpt_path,
                tts_config_path=cfg_path,
                use_cuda=False,
            )
        except Exception as e:
            # Deliberately broad: one broken speaker must not prevent the
            # other voices from loading.
            failed.append(spk)
            print(f"❌ Failed to load {spk}: {e}")
        else:
            print(f"✅ {spk} Loaded")

    # Report the real outcome instead of always claiming success.
    if failed:
        print(
            f"⚠️ Loaded {len(loaded)} of {len(model_sources)} models. "
            f"Failed: {', '.join(failed)}"
        )
    else:
        print("All models loaded successfully.")
    return loaded


# Load models globally once
MODELS = load_models()
# -------------------------------
# The Core Inference Function
# -------------------------------
# Speaker in the Display Name
def _increment_inference_count():
    """Add one to the persisted inference counter (best-effort).

    A missing or unparsable counter file is treated as zero, and I/O errors
    are only logged, so a bookkeeping problem never costs the user the audio
    that was already synthesised.
    """
    try:
        raw = INFERENCE_FILE.read_text().strip() if INFERENCE_FILE.exists() else ""
        count = int(raw) if raw.isdecimal() else 0
        INFERENCE_FILE.write_text(str(count + 1))
    except OSError as e:
        print(f"Warning: could not update inference counter: {e}")


def generate_speech(sinhala_text, speaker):
    """Synthesise speech for Sinhala text with the selected voice.

    Args:
        sinhala_text: Text to speak, in Sinhala script. ``None``, empty or
            whitespace-only input is rejected.
        speaker: Display name of the voice (a key of ``SPEAKER_MAPPING``).

    Returns:
        ``(sample_rate, waveform)`` with the waveform as a NumPy array on
        success, or ``None`` when there is nothing to play (empty input,
        unknown/unloaded voice, or a synthesis error). ``None`` is the value
        Gradio's Audio output accepts for "no audio"; a ``(None, None)`` tuple
        would be treated as real audio data and fail while being saved.
    """
    if not sinhala_text or not sinhala_text.strip():
        print("Please input a text and then try..")
        return None

    # Convert display name back to the actual model key
    model_key = SPEAKER_MAPPING.get(speaker)
    if model_key not in MODELS:
        print(f"Error: Model key '{model_key}' (from '{speaker}') not found.")
        return None

    print("Generating speech...")
    try:
        # 1. Convert Sinhala -> Roman (the text form passed to the synthesizer)
        roman_text = sinhala_to_roman(sinhala_text)
        print(f"Romanized text: {roman_text}")

        # 2. Generate audio
        model = MODELS[model_key]
        wav = model.tts(roman_text)
        sample_rate = model.output_sample_rate

        # 3. Convert to Numpy Array
        audio = np.array(wav)
    except Exception as e:
        # Boundary handler: report the problem and leave the player empty
        # instead of crashing the UI callback.
        print(f"Error generating speech: {e}")
        return None

    print("Speech generated successfully.")
    # Counted outside the try block so counter trouble cannot discard the audio.
    _increment_inference_count()
    return (sample_rate, audio)
# Rating Save Function
# ---------------------
def save_rating(mos, comment, text, speaker):
    """Append one MOS rating to the ratings CSV and report the running total.

    Args:
        mos: Selected rating value; a falsy value (nothing selected) is rejected.
        comment: Optional free-text comment.
        text: The input text the rating refers to.
        speaker: Display name of the voice that was rated.

    Returns:
        tuple: ``(status_message, radio_update)``. When no rating is selected
        the radio is left untouched; after a successful save it is cleared.
    """
    if not mos:
        return "⚠️ Please select a rating before submitting.", gr.update()

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(RATINGS_FILE, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow([timestamp, speaker, text, mos, comment])

    # Count parsed CSV records, not physical lines: the text and comment come
    # from multi-line boxes, so one rating can span several lines in the file.
    with open(RATINGS_FILE, "r", newline="", encoding="utf-8") as f:
        total = sum(1 for _ in csv.reader(f)) - 1  # exclude header

    return (
        f"✅ Thank you for your feedback! ⭐ Total ratings: {total}",
        gr.update(value=None),
    )
# -----------------------------------
# GRADIO UI (Clean Light Theme)
# -----------------------------------
# Simple Light Theme - Force light colors everywhere
# Light-only theme: every *_dark variable is set to the same value as its
# light counterpart so the app looks the same in the browser's dark mode.
theme = gr.themes.Default(
    primary_hue="red",
    secondary_hue="orange",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
).set(
    # Force light backgrounds
    body_background_fill="*neutral_50",
    body_background_fill_dark="*neutral_50",
    # Blocks
    block_background_fill="#dcfaf5",
    block_background_fill_dark="#dcfaf5",
    # Text
    body_text_color="*neutral_500",
    body_text_color_dark="*neutral_500",
    # Hue shades run 50, 100, 200 ... 900, 950; there is no "350" shade, so
    # "*neutral_350" pointed at an undefined CSS variable. Use 400 instead.
    body_text_color_subdued="*neutral_400",
    body_text_color_subdued_dark="*neutral_400",
    # Labels
    block_label_text_color="*neutral_500",
    block_label_text_color_dark="*neutral_500",
    # Primary button - brand red
    button_primary_background_fill="#C40D42",
    button_primary_background_fill_dark="#C40D42",
    button_primary_background_fill_hover="#A00B36",
    button_primary_background_fill_hover_dark="#A00B36",
    button_primary_text_color="white",
    button_primary_text_color_dark="white",
    # Secondary button
    button_secondary_background_fill="#6689AB",
    button_secondary_background_fill_dark="#6689AB",
    button_secondary_text_color="white",
    button_secondary_text_color_dark="white",
    # Inputs
    input_background_fill="white",
    input_background_fill_dark="white",
    input_border_color="*neutral_300",
    input_border_color_dark="*neutral_300",
    # Shadows
    shadow_drop="0 1px 3px 0 rgb(0 0 0 / 0.1)",
    shadow_drop_lg="0 10px 15px -3px rgb(0 0 0 / 0.1)",
)
# Comprehensive CSS to force light theme and fix visibility
# Custom stylesheet handed to gr.Blocks(css=...). It forces the light colour
# scheme and styles the header, inputs, audio player, guidelines box, logo row,
# footer, status messages and radio options.
# Fix: the `.guidelines` rule used `font-color`, which is not a CSS property
# and is ignored by browsers; it is now `color`.
css = """
/* Force light theme globally */
:root, .dark {
color-scheme: light !important;
}
/* Container */
.container {
max-width: 1200px;
margin: auto;
padding: 2rem 1rem;
}
/* Body - light gradient background */
body, .gradio-container {
background: linear-gradient(135deg, #FAFAFA 0%, #F5F5F5 100%) !important;
color: #1a1a1a !important;
}
/* Force all text to be dark */
* {
color: #1a1a1a !important;
}
/* Headers and titles - except where we want colors */
h1, h2, h3, h4, h5, h6 {
color: #1a1a1a !important;
}
/* All blocks white background */
.gr-block, .gr-group, .gr-box, .gr-form, .gr-panel {
background: white !important;
border: 1px solid #e0e0e0 !important;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05) !important;
border-radius: 8px !important;
}
/* Force section headers ("Input Text", "Choose Voice Model", "Audio Result") to white */
.gr-panel-header,
.gr-group-label {
color: white !important;
}
/* Header styling with your colors */
#header {
text-align: center;
margin-bottom: 2rem;
padding: 2rem 1rem;
background: white !important;
border-radius: 12px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08) !important;
}
#header h1 {
font-size: 2.5rem;
font-weight: 900;
margin: 0;
background: linear-gradient(135deg, #C40D42 0%, #F15A22 50%, #FDB934 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
#header h3 {
margin: 1rem 0 0 0;
color: #333 !important;
font-weight: 600;
}
#header h4 {
margin: 0.5rem 0 0 0;
color: #666 !important;
font-weight: 400;
}
/* Labels - dark and bold */
label, .label, .gr-label {
color: #faf7f7 !important;
font-weight: 600 !important;
font-size: 1rem !important;
}
.label-text {
color: #faf7f7 !important;
font-weight: 700 !important;
font-size: 1rem !important;
margin-bottom: 0.5rem;
display: block;
}
/* Textbox - white with dark text */
.gr-textbox,
.gr-textbox textarea,
textarea,
input[type="text"] {
background: white !important;
border: 1.5px solid #ddd !important;
color: #1a1a1a !important;
font-size: 1rem !important;
}
.gr-textbox:focus,
textarea:focus,
input[type="text"]:focus {
border-color: #C40D42 !important;
outline: none !important;
box-shadow: 0 0 0 3px rgba(196, 13, 66, 0.1) !important;
}
/* Dropdown - fix all text visibility */
.gr-dropdown,
.gr-dropdown *,
.svelte-1gfkn6j,
.svelte-1gfkn6j * {
color: #1a1a1a !important;
background: white !important;
}
.gr-dropdown {
border: 1.5px solid #ddd !important;
}
/* Dropdown button/trigger */
.gr-dropdown .wrap,
.gr-dropdown button {
background: white !important;
color: #1a1a1a !important;
font-weight: 500 !important;
}
/* Dropdown options container */
.gr-dropdown-options,
.gr-dropdown ul,
[role="listbox"] {
background: white !important;
border: 1px solid #ddd !important;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15) !important;
z-index: 9999 !important;
}
/* Dropdown individual options */
.gr-dropdown-option,
.gr-dropdown li,
[role="option"] {
color: #1a1a1a !important;
background: white !important;
padding: 0.5rem 1rem !important;
}
.gr-dropdown-option:hover,
.gr-dropdown li:hover,
[role="option"]:hover {
background: #f5f5f5 !important;
color: #C40D42 !important;
}
/* Audio Player - fix all button and text colors */
.gr-audio,
.gr-audio *,
audio,
audio * {
color: #1a1a1a !important;
}
.gr-audio {
background: #fafafa !important;
border: 1px solid #e0e0e0 !important;
border-radius: 8px !important;
padding: 1rem !important;
}
/* Audio controls - make buttons visible */
.gr-audio button,
.gr-audio [role="button"],
audio::-webkit-media-controls-panel,
audio::-webkit-media-controls-play-button,
audio::-webkit-media-controls-timeline,
audio::-webkit-media-controls-current-time-display,
audio::-webkit-media-controls-time-remaining-display,
audio::-webkit-media-controls-mute-button,
audio::-webkit-media-controls-volume-slider {
background: #1a1a1a !important;
color: white !important;
filter: invert(0) !important;
}
/* Primary Button - your red color */
button.primary,
.gr-button-primary,
button[variant="primary"] {
background: linear-gradient(135deg, #C40D42 0%, #A00B36 100%) !important;
color: white !important;
border: none !important;
font-weight: 600 !important;
padding: 0.75rem 2rem !important;
box-shadow: 0 4px 12px rgba(196, 13, 66, 0.3) !important;
transition: all 0.3s ease !important;
}
button.primary:hover,
.gr-button-primary:hover {
background: linear-gradient(135deg, #A00B36 0%, #7B1B67 100%) !important;
box-shadow: 0 6px 16px rgba(196, 13, 66, 0.4) !important;
transform: translateY(-2px) !important;
}
/* Guidelines box */
.guidelines {
background: linear-gradient(135deg, #FFF5E1 0%, #FFE8CC 100%);
border-left: 4px solid #F7941E;
padding: 1.5rem;
margin-top: 1rem;
color: #333 !important;
font-size: 0.95rem;
border-radius: 8px;
}
.guidelines strong {
color: #C40D42 !important;
font-weight: 700;
}
.guidelines ul {
margin: 0.75rem 0 0 1.5rem;
padding: 0;
}
.guidelines li {
color: #333 !important;
margin: 0.5rem 0;
}
.guidelines dt {
color: #666 !important;
font-style: italic;
}
/* Logo section */
#logo-row {
justify-content: center;
align-items: center;
background: white !important;
border: 1px solid #e0e0e0 !important;
border-radius: 10px !important;
padding: 1rem !important;
max-width: 450px;
margin: 2rem auto;
}
.partner-logo {
background: white !important;
padding: 1rem !important;
border-radius: 8px !important;
border: 1px solid #e0e0e0 !important;
transition: all 0.3s ease;
}
.partner-logo:hover {
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
transform: translateY(-4px);
}
.partner-logo img {
max-height: 110px !important;
width: auto !important;
object-fit: contain;
}
/* Footer */
.footer {
text-align: center;
margin-top: 2rem;
padding: 1rem;
color: #666 !important;
background: white;
border-radius: 8px;
}
.footer * {
color: #666 !important;
}
/* Result header */
.result-header {
color: #1a1a1a !important;
font-weight: 700 !important;
}
/* Status messages */
.status-message {
padding: 1rem;
border-radius: 8px;
font-weight: 600;
text-align: center;
margin: 1rem 0;
}
.status-generating {
background: #FFF4E6;
color: #C77700 !important;
border: 1.5px solid #FFD8A8;
}
.status-generating * {
color: #C77700 !important;
}
.status-completed {
background: #E8F5E9;
color: #2E7D32 !important;
border: 1.5px solid #A5D6A7;
}
.status-completed * {
color: #2E7D32 !important;
}
.status-initial {
background: #F5F5F5;
color: #666 !important;
border: 1.5px solid #E0E0E0;
}
.status-initial * {
color: #666 !important;
}
/* Markdown content */
.markdown-text, .prose {
color: #1a1a1a !important;
}
/* Make sure spans inherit correct colors when needed */
span[style*="color"] {
/* Allow inline color styles to work */
}
/* Radio buttons - fix label and option colors */
.gr-radio,
.gr-radio label,
[data-testid="radio-group"] label,
[data-testid="radio-group"] span {
color: #333333 !important;
background: transparent !important;
}
/* Unselected radio option */
[data-testid="radio-group"] .selected,
input[type="radio"] + span {
background: white !important;
border: 1.5px solid #ddd !important;
color: #333 !important;
}
/* Selected/checked radio option */
[data-testid="radio-group"] input[type="radio"]:checked + span,
.gr-radio input[type="radio"]:checked ~ span {
background: #FFF0F4 !important; /* light pinkish tint */
border-color: #C40D42 !important;
color: #C40D42 !important;
font-weight: 600 !important;
}
"""
# Builds the whole UI and wires the events.
# NOTE(review): the source had lost its indentation, so the container nesting
# below is reconstructed from the section comments ("Left column: inputs +
# button", "Right column: Player and Guidelines") - confirm against the
# intended layout.
with gr.Blocks(theme=theme, css=css, title="SinhalaVITS Playground") as demo:
    # Header
    gr.HTML(
        """
<div id='header'>
<h1> SinhalaVITS Playground</h1>
<h3>An Open Source Sinhala Text-to-Speech Model</h3>
<h4>by Dialog Axiata PLC in collaboration with</h4>
<h4>Dialog-University of Moratuwa Mobile Communications Research Laboratory</h4>
</div>
"""
    )
    with gr.Row():
        # Left column: inputs + button
        with gr.Column(scale=2):
            with gr.Group(elem_id="input-group"):
                gr.Markdown("<span class='label-text'>✍️ Input Text</span>")
                sinhala_text = gr.Textbox(
                    placeholder="මෙතන සිංහලෙන් ටයිප් කරන්න...",
                    lines=6,
                    show_label=False,
                    elem_id="text-input"
                )
                # Voice selector
                gr.Markdown("<span class='label-text'>🗣️ Choose Voice Model</span>")
                speaker = gr.Dropdown(
                    choices=list(SPEAKER_MAPPING.keys()),
                    # Prefer "Sanuki" when that model loaded; otherwise fall
                    # back to the first display name.
                    value=INVERSE_MAPPING.get("Sanuki") if "Sanuki" in MODELS else list(SPEAKER_MAPPING.keys())[0],
                    interactive=True,
                    show_label=False,
                    elem_id="voice-selector"
                )
            # Main action button
            generate_btn = gr.Button(
                "🔊 Generate Speech",
                variant="primary",
                size="lg",
                elem_id="gen-btn"
            )
        # Right column: Player and Guidelines
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("#### <span class='result-header'>🎧 Audio Result</span>")
                audio_output = gr.Audio(
                    label="",
                    type="numpy",
                    show_label=False,
                    interactive=False,
                    autoplay=False,
                    elem_id="audio-player"
                )
            # Status area
            status_text = gr.HTML(
                "<div class='status-message status-initial'>Enter text and press 'Generate Speech' to begin</div>"
            )
            # -------------------------
            # Benchmarking Section
            # -------------------------
            # Hidden until a clip has been generated (see Step 4 below).
            with gr.Column(visible=False) as rating_section:
                gr.Markdown("#### ⭐ Rate This Speech")
                mos_rating = gr.Radio(
                    choices=[
                        ("1 - Very Bad", 1),
                        ("2 - Bad", 2),
                        ("3 - Fair", 3),
                        ("4 - Good", 4),
                        ("5 - Excellent", 5),
                    ],
                    label="How natural does this speech sound?",
                    interactive=True,
                )
                comment_box = gr.Textbox(
                    placeholder="Optional comment...",
                    lines=2,
                )
                submit_rating_btn = gr.Button("📤 Submit", variant="secondary")
                rating_status = gr.Markdown("")
            # Guidelines
            gr.HTML(
                """
<div class="guidelines">
<strong>⚠️ Please Note:</strong>
<ul>
<li>Do <b>not</b> use <i>English letters</i> or <i>numerals</i> in the input. Use only <b>Sinhala Unicode</b> text.</li>
<li>Use proper punctuation (.,?) to help the model with phrasing.</li>
<li>Keep input short for a single request to speed up synthesizing.</li>
</ul>
<dl style="margin-top: 0.75rem;">
<dt>* The speech quality depends on the trained dataset and may not sound completely natural.</dt>
</dl>
</div>
"""
            )
    # Spacer
    gr.HTML("<div style='height: 3rem;'></div>")
    # Logo Section
    with gr.Row(equal_height=True, elem_id="logo-row"):
        # UoM Logo
        gr.Image(
            value="img/lablogo.png",
            label="UoM Research Lab Logo",
            show_label=False,
            show_download_button=False,  # Removes the Download (arrow) button
            show_share_button=False,  # Removes the Share button
            show_fullscreen_button=False,  # Removes the Maximize/Fullscreen button
            interactive=False,
            elem_classes=["partner-logo"],  # Added class
            container=False
        )
    # Footer
    gr.HTML(
        """
<div class='footer'>
<i>This playground is developed by Kasun Ranasinghe (@KasunUoM)</i>
</div>
"""
    )

    # Event Handlers
    def _final_status(audio):
        """Return the status banner matching the outcome of the last run."""
        # The player holds no audio when generation produced nothing, so do
        # not claim success in that case.
        if audio is None:
            return "<div class='status-message status-initial'>⚠️ Could not generate speech. Please check the input text and try again.</div>"
        return "<div class='status-message status-completed'>✅ Speech generated successfully! Use the player above to listen.</div>"

    # Step 1: Set generating status
    set_status = generate_btn.click(
        fn=lambda: "<div class='status-message status-generating'>⏳ Generating speech, please wait...</div>",
        inputs=[],
        outputs=[status_text]
    )
    # Step 2: Generate speech
    synth_call = set_status.then(
        fn=generate_speech,
        inputs=[sinhala_text, speaker],
        outputs=[audio_output]
    )
    # Step 3: Set final status (success or failure, based on the player content)
    synth_call.then(
        fn=_final_status,
        inputs=[audio_output],
        outputs=[status_text]
    )
    # Step 4: Show the rating panel only when there is audio to rate
    synth_call.then(
        fn=lambda audio: gr.update(visible=audio is not None),
        inputs=[audio_output],
        outputs=[rating_section]
    )
    # Step 5: Connect Rating Button
    submit_rating_btn.click(
        fn=save_rating,
        inputs=[mos_rating, comment_box, sinhala_text, speaker],
        outputs=[rating_status, mos_rating]
    )
# Launch the app
# Start the Gradio server only when this file is executed as a script, so
# importing the module does not launch the app.
if __name__ == "__main__":
    demo.launch()