Spaces:

dialoglk
/

SinhalaVITS

Running

App Files Files Community

SinhalaVITS / app.py

KasunUoM

Block color change

871f1a5 verified 3 months ago

raw

history blame contribute delete

21.4 kB

	# Created by Kasun Ranasinghe (@kasunUoM) | Oct.2025
	#
	# Gradio App for SinhalaVITS TTS Inference
	# ===================================================

	import gradio as gr
	from TTS.utils.synthesizer import Synthesizer
	from romanizer import sinhala_to_roman
	from huggingface_hub import hf_hub_download
	import os
	import numpy as np
	from pathlib import Path
	import csv
	from datetime import datetime


	# ---------------
	# FILE CREATIONS
	# ---------------

	# Runtime data (ratings log + inference counter) is kept under ./data.
	DATA_DIR = Path("data")
	DATA_DIR.mkdir(exist_ok=True)

	RATINGS_FILE = DATA_DIR.joinpath("ratings.csv")
	INFERENCE_FILE = DATA_DIR.joinpath("inference_count.txt")

	# First run only: seed the ratings CSV with its header row.
	if not RATINGS_FILE.exists():
	    with RATINGS_FILE.open("w", newline="", encoding="utf-8") as ratings_fh:
	        csv.writer(ratings_fh).writerow(
	            ["timestamp", "speaker", "text", "mos", "comment"]
	        )

	# First run only: start the inference counter at zero.
	if not INFERENCE_FILE.exists():
	    INFERENCE_FILE.write_text("0")


	# -----------------------
	# AUTHENTICATION SETUP
	# -----------------------

	# Hugging Face access token, read from the HF_TOKEN secret / environment variable.
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Warn early: an unset or empty token means private model repos cannot be downloaded.
	if HF_TOKEN is None or HF_TOKEN == "":
	    print("WARNING: HF_TOKEN secret not found. Access to private repos will fail.")

	# ------------------
	# SPEAKER MAPPING
	# ------------------

	# Display label shown in the UI -> speaker key used to index the loaded models.
	SPEAKER_MAPPING = {
	    "Male Voice 1": "Roshan",
	    "Female Voice 1": "Nipunika",
	    "Male Voice 2": "Sanjaya",
	    "Female Voice 2": "Sanuki",
	}

	# Reverse lookup (speaker key -> display label), used to find the default display name.
	INVERSE_MAPPING = dict(zip(SPEAKER_MAPPING.values(), SPEAKER_MAPPING))


	# -------------------------------
	# Load Multiple Speaker Models
	# -------------------------------

	def load_models():
	    """Download and load one Synthesizer per speaker.

	    For every speaker, the checkpoint and config files are fetched with
	    ``hf_hub_download`` (passing ``HF_TOKEN``) and wrapped in a CPU
	    ``Synthesizer``. A speaker whose download or load raises is logged and
	    skipped so the remaining voices stay usable.

	    Returns:
	        dict: speaker key -> Synthesizer, containing only the speakers that
	        loaded without error (possibly empty).
	    """
	    model_sources = {
	        "Roshan": {
	            "repo": "dialoglk/SinhalaVITS-TTS-M2",
	            "model_file": "Roshan_270000.pth",
	            "config_file": "Roshan_config.json",
	        },
	        "Nipunika": {
	            "repo": "dialoglk/SinhalaVITS-TTS-F1",
	            "model_file": "Nipunika_210000.pth",
	            "config_file": "Nipunika_config.json",
	        },
	        "Sanjaya": {
	            "repo": "dialoglk/SinhalaVITS-TTS-M1",
	            "model_file": "Sanjaya_170000.pth",
	            "config_file": "Sanjaya_config.json",
	        },
	        "Sanuki": {
	            "repo": "dialoglk/SinhalaVITS-TTS-F2",
	            "model_file": "Sanuki_190000.pth",
	            "config_file": "Sanuki_config.json",
	        },
	    }

	    loaded = {}
	    failed = []
	    print("Downloading and loading models...")

	    for spk, info in model_sources.items():
	        try:
	            print(f"Loading speaker: {spk}")
	            # Pass token to access private repos
	            ckpt_path = hf_hub_download(
	                repo_id=info["repo"],
	                filename=info["model_file"],
	                token=HF_TOKEN,
	            )
	            cfg_path = hf_hub_download(
	                repo_id=info["repo"],
	                filename=info["config_file"],
	                token=HF_TOKEN,
	            )

	            loaded[spk] = Synthesizer(
	                tts_checkpoint=ckpt_path,
	                tts_config_path=cfg_path,
	                use_cuda=False,
	            )
	            print(f"✅ {spk} Loaded")
	        except Exception as e:
	            # Deliberate best-effort: one broken voice must not take the app down.
	            print(f"❌ Failed to load {spk}: {e}")
	            failed.append(spk)

	    # Report the real outcome instead of unconditionally claiming success.
	    if failed:
	        print(
	            f"⚠️ Loaded {len(loaded)}/{len(model_sources)} models; "
	            f"failed: {', '.join(failed)}"
	        )
	    else:
	        print("All models loaded successfully.")
	    return loaded

	# Load models globally once, at import time. MODELS maps speaker key -> Synthesizer
	# and holds only the speakers that load_models() managed to load.
	MODELS = load_models()

	# -------------------------------
	# The Core Inference Function
	# -------------------------------

	# Speaker in the Display Name
	def generate_speech(sinhala_text, speaker):
	    """Synthesize speech for Sinhala text with the selected voice.

	    Args:
	        sinhala_text: Sinhala text to speak; romanized with
	            ``sinhala_to_roman`` before being passed to the model.
	        speaker: Display name of the voice (a key of ``SPEAKER_MAPPING``).

	    Returns:
	        ``(sample_rate, waveform)`` with the waveform as a NumPy array on
	        success, or ``None`` when the text is empty, the voice is not loaded,
	        or synthesis raises. A bare ``None`` is returned (not a
	        ``(None, None)`` tuple) because the result feeds a single audio
	        output, where ``None`` means "no audio".
	    """
	    # Guard against both a missing value and whitespace-only input.
	    if not sinhala_text or not sinhala_text.strip():
	        print("Please input a text and then try..")
	        return None

	    # Convert display name back to the actual model key
	    model_key = SPEAKER_MAPPING.get(speaker)

	    if model_key not in MODELS:
	        print(f"Error: Model key '{model_key}' (from '{speaker}') not found.")
	        return None

	    print("Generating speech...")
	    try:
	        # 1. Convert Sinhala → Roman
	        roman_text = sinhala_to_roman(sinhala_text)
	        print(f"Romanized text: {roman_text}")

	        # 2. Generate audio
	        model = MODELS[model_key]  # Use the actual model key
	        wav = model.tts(roman_text)
	        sample_rate = model.output_sample_rate

	        # 3. Convert to Numpy Array
	        audio = np.array(wav)
	    except Exception as e:
	        print(f"Error generating speech: {e}")
	        return None

	    print("Speech generated successfully.")

	    # Increment inference counter. Kept outside the synthesis try-block so a
	    # missing or corrupt counter file cannot discard audio that was already
	    # generated.
	    try:
	        count = int(INFERENCE_FILE.read_text().strip())
	        INFERENCE_FILE.write_text(str(count + 1))
	    except (OSError, ValueError) as e:
	        print(f"Warning: could not update inference counter: {e}")

	    return (sample_rate, audio)

	# Rating Save Function
	# ---------------------

	def save_rating(mos, comment, text, speaker):
	    """Append one MOS rating to the ratings CSV and report the running total.

	    Args:
	        mos: Selected rating value; a falsy value means nothing was selected.
	        comment: Free-text comment stored with the rating.
	        text: The input text the rating refers to.
	        speaker: The voice the rating refers to.

	    Returns:
	        A 2-tuple ``(status_message, rating_update)``: the message to display
	        and a ``gr.update`` for the rating input (a no-op when no rating was
	        selected, otherwise one that clears the selection).
	    """
	    if not mos:
	        return "⚠️ Please select a rating before submitting.", gr.update()

	    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	    with open(RATINGS_FILE, "a", newline="", encoding="utf-8") as f:
	        csv.writer(f).writerow([timestamp, speaker, text, mos, comment])

	    # Count total ratings as CSV records, not physical lines: the text and
	    # comment fields may contain newlines, which csv.writer stores as quoted
	    # multi-line fields, so a raw line count would overstate the total.
	    with open(RATINGS_FILE, "r", newline="", encoding="utf-8") as f:
	        total = sum(1 for _ in csv.reader(f)) - 1  # exclude header

	    return (
	        f"✅ Thank you for your feedback! ⭐ Total ratings: {total}",
	        gr.update(value=None),
	    )

	# -----------------------------------
	# GRADIO UI (Clean Light Theme)
	# -----------------------------------

	# Simple Light Theme - Force light colors everywhere.
	# Every "*_dark" variable is set to the same value as its light counterpart so
	# the app looks identical when the browser requests dark mode.
	theme = gr.themes.Default(
	    primary_hue="red",
	    secondary_hue="orange",
	    neutral_hue="gray",
	    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
	).set(
	    # Force light backgrounds
	    body_background_fill="*neutral_50",
	    body_background_fill_dark="*neutral_50",

	    # Blocks
	    block_background_fill="#dcfaf5",
	    block_background_fill_dark="#dcfaf5",

	    # Text - FORCE DARK
	    body_text_color="*neutral_500",
	    body_text_color_dark="*neutral_500",
	    # Palette references must name an existing step (50, 100, 200, ... 900, 950);
	    # "*neutral_350" is not one of them, so the nearest defined step is used.
	    body_text_color_subdued="*neutral_400",
	    body_text_color_subdued_dark="*neutral_400",

	    # Labels
	    block_label_text_color="*neutral_500",
	    block_label_text_color_dark="*neutral_500",

	    # Primary button - Using your red color
	    button_primary_background_fill="#C40D42",
	    button_primary_background_fill_dark="#C40D42",
	    button_primary_background_fill_hover="#A00B36",
	    button_primary_background_fill_hover_dark="#A00B36",
	    button_primary_text_color="white",
	    button_primary_text_color_dark="white",

	    # Secondary button
	    button_secondary_background_fill="#6689AB",
	    button_secondary_background_fill_dark="#6689AB",
	    button_secondary_text_color="white",
	    button_secondary_text_color_dark="white",

	    # Inputs
	    input_background_fill="white",
	    input_background_fill_dark="white",
	    input_border_color="*neutral_300",
	    input_border_color_dark="*neutral_300",

	    # Shadows
	    shadow_drop="0 1px 3px 0 rgb(0 0 0 / 0.1)",
	    shadow_drop_lg="0 10px 15px -3px rgb(0 0 0 / 0.1)",
	)

	# Comprehensive CSS to force light theme and fix visibility.
	# Passed to gr.Blocks(css=...); the rules are layered on top of the theme.
	# Fix: the .guidelines rule used the non-existent property "font-color",
	# which browsers ignore; it is now "color".
	css = """
	/* Force light theme globally */
	:root, .dark {
	color-scheme: light !important;
	}
	/* Container */
	.container {
	max-width: 1200px;
	margin: auto;
	padding: 2rem 1rem;
	}
	/* Body - light gradient background */
	body, .gradio-container {
	background: linear-gradient(135deg, #FAFAFA 0%, #F5F5F5 100%) !important;
	color: #1a1a1a !important;
	}
	/* Force all text to be dark */
	* {
	color: #1a1a1a !important;
	}
	/* Headers and titles - except where we want colors */
	h1, h2, h3, h4, h5, h6 {
	color: #1a1a1a !important;
	}
	/* All blocks white background */
	.gr-block, .gr-group, .gr-box, .gr-form, .gr-panel {
	background: white !important;
	border: 1px solid #e0e0e0 !important;
	box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05) !important;
	border-radius: 8px !important;
	}
	/* Force section headers ("Input Text", "Choose Voice Model", "Audio Result") to white */
	.gr-panel-header,
	.gr-group-label {
	color: white !important;
	}
	/* Header styling with your colors */
	#header {
	text-align: center;
	margin-bottom: 2rem;
	padding: 2rem 1rem;
	background: white !important;
	border-radius: 12px;
	box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08) !important;
	}
	#header h1 {
	font-size: 2.5rem;
	font-weight: 900;
	margin: 0;
	background: linear-gradient(135deg, #C40D42 0%, #F15A22 50%, #FDB934 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	background-clip: text;
	}
	#header h3 {
	margin: 1rem 0 0 0;
	color: #333 !important;
	font-weight: 600;
	}
	#header h4 {
	margin: 0.5rem 0 0 0;
	color: #666 !important;
	font-weight: 400;
	}
	/* Labels - dark and bold */
	label, .label, .gr-label {
	color: #faf7f7 !important;
	font-weight: 600 !important;
	font-size: 1rem !important;
	}
	.label-text {
	color: #faf7f7 !important;
	font-weight: 700 !important;
	font-size: 1rem !important;
	margin-bottom: 0.5rem;
	display: block;
	}
	/* Textbox - white with dark text */
	.gr-textbox,
	.gr-textbox textarea,
	textarea,
	input[type="text"] {
	background: white !important;
	border: 1.5px solid #ddd !important;
	color: #1a1a1a !important;
	font-size: 1rem !important;
	}
	.gr-textbox:focus,
	textarea:focus,
	input[type="text"]:focus {
	border-color: #C40D42 !important;
	outline: none !important;
	box-shadow: 0 0 0 3px rgba(196, 13, 66, 0.1) !important;
	}
	/* Dropdown - fix all text visibility */
	.gr-dropdown,
	.gr-dropdown *,
	.svelte-1gfkn6j,
	.svelte-1gfkn6j * {
	color: #1a1a1a !important;
	background: white !important;
	}
	.gr-dropdown {
	border: 1.5px solid #ddd !important;
	}
	/* Dropdown button/trigger */
	.gr-dropdown .wrap,
	.gr-dropdown button {
	background: white !important;
	color: #1a1a1a !important;
	font-weight: 500 !important;
	}
	/* Dropdown options container */
	.gr-dropdown-options,
	.gr-dropdown ul,
	[role="listbox"] {
	background: white !important;
	border: 1px solid #ddd !important;
	box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15) !important;
	z-index: 9999 !important;
	}
	/* Dropdown individual options */
	.gr-dropdown-option,
	.gr-dropdown li,
	[role="option"] {
	color: #1a1a1a !important;
	background: white !important;
	padding: 0.5rem 1rem !important;
	}
	.gr-dropdown-option:hover,
	.gr-dropdown li:hover,
	[role="option"]:hover {
	background: #f5f5f5 !important;
	color: #C40D42 !important;
	}
	/* Audio Player - fix all button and text colors */
	.gr-audio,
	.gr-audio *,
	audio,
	audio * {
	color: #1a1a1a !important;
	}
	.gr-audio {
	background: #fafafa !important;
	border: 1px solid #e0e0e0 !important;
	border-radius: 8px !important;
	padding: 1rem !important;
	}
	/* Audio controls - make buttons visible */
	.gr-audio button,
	.gr-audio [role="button"],
	audio::-webkit-media-controls-panel,
	audio::-webkit-media-controls-play-button,
	audio::-webkit-media-controls-timeline,
	audio::-webkit-media-controls-current-time-display,
	audio::-webkit-media-controls-time-remaining-display,
	audio::-webkit-media-controls-mute-button,
	audio::-webkit-media-controls-volume-slider {
	background: #1a1a1a !important;
	color: white !important;
	filter: invert(0) !important;
	}
	/* Primary Button - your red color */
	button.primary,
	.gr-button-primary,
	button[variant="primary"] {
	background: linear-gradient(135deg, #C40D42 0%, #A00B36 100%) !important;
	color: white !important;
	border: none !important;
	font-weight: 600 !important;
	padding: 0.75rem 2rem !important;
	box-shadow: 0 4px 12px rgba(196, 13, 66, 0.3) !important;
	transition: all 0.3s ease !important;
	}
	button.primary:hover,
	.gr-button-primary:hover {
	background: linear-gradient(135deg, #A00B36 0%, #7B1B67 100%) !important;
	box-shadow: 0 6px 16px rgba(196, 13, 66, 0.4) !important;
	transform: translateY(-2px) !important;
	}
	/* Guidelines box */
	.guidelines {
	background: linear-gradient(135deg, #FFF5E1 0%, #FFE8CC 100%);
	border-left: 4px solid #F7941E;
	padding: 1.5rem;
	margin-top: 1rem;
	color: #333 !important;
	font-size: 0.95rem;
	border-radius: 8px;
	}
	.guidelines strong {
	color: #C40D42 !important;
	font-weight: 700;
	}
	.guidelines ul {
	margin: 0.75rem 0 0 1.5rem;
	padding: 0;
	}
	.guidelines li {
	color: #333 !important;
	margin: 0.5rem 0;
	}
	.guidelines dt {
	color: #666 !important;
	font-style: italic;
	}
	/* Logo section */
	#logo-row {
	justify-content: center;
	align-items: center;
	background: white !important;
	border: 1px solid #e0e0e0 !important;
	border-radius: 10px !important;
	padding: 1rem !important;
	max-width: 450px;
	margin: 2rem auto;
	}
	.partner-logo {
	background: white !important;
	padding: 1rem !important;
	border-radius: 8px !important;
	border: 1px solid #e0e0e0 !important;
	transition: all 0.3s ease;
	}
	.partner-logo:hover {
	box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
	transform: translateY(-4px);
	}
	.partner-logo img {
	max-height: 110px !important;
	width: auto !important;
	object-fit: contain;
	}
	/* Footer */
	.footer {
	text-align: center;
	margin-top: 2rem;
	padding: 1rem;
	color: #666 !important;
	background: white;
	border-radius: 8px;
	}
	.footer * {
	color: #666 !important;
	}
	/* Result header */
	.result-header {
	color: #1a1a1a !important;
	font-weight: 700 !important;
	}
	/* Status messages */
	.status-message {
	padding: 1rem;
	border-radius: 8px;
	font-weight: 600;
	text-align: center;
	margin: 1rem 0;
	}
	.status-generating {
	background: #FFF4E6;
	color: #C77700 !important;
	border: 1.5px solid #FFD8A8;
	}
	.status-generating * {
	color: #C77700 !important;
	}
	.status-completed {
	background: #E8F5E9;
	color: #2E7D32 !important;
	border: 1.5px solid #A5D6A7;
	}
	.status-completed * {
	color: #2E7D32 !important;
	}
	.status-initial {
	background: #F5F5F5;
	color: #666 !important;
	border: 1.5px solid #E0E0E0;
	}
	.status-initial * {
	color: #666 !important;
	}
	/* Markdown content */
	.markdown-text, .prose {
	color: #1a1a1a !important;
	}
	/* Make sure spans inherit correct colors when needed */
	span[style*="color"] {
	/* Allow inline color styles to work */
	}
	/* Radio buttons - fix label and option colors */
	.gr-radio,
	.gr-radio label,
	[data-testid="radio-group"] label,
	[data-testid="radio-group"] span {
	color: #333333 !important;
	background: transparent !important;
	}

	/* Unselected radio option */
	[data-testid="radio-group"] .selected,
	input[type="radio"] + span {
	background: white !important;
	border: 1.5px solid #ddd !important;
	color: #333 !important;
	}

	/* Selected/checked radio option */
	[data-testid="radio-group"] input[type="radio"]:checked + span,
	.gr-radio input[type="radio"]:checked ~ span {
	background: #FFF0F4 !important; /* light pinkish tint */
	border-color: #C40D42 !important;
	color: #C40D42 !important;
	font-weight: 600 !important;
	}

	"""

	# Build the playground UI: a header, a two-column row (text/voice inputs on the
	# left; audio player, status line, rating form and usage notes on the right),
	# then a logo strip and a footer. Event handlers are registered at the end of
	# the Blocks context.
	with gr.Blocks(theme=theme, css=css, title="SinhalaVITS Playground") as demo:

	    # Header
	    gr.HTML(
	        """
	<div id='header'>
	<h1> SinhalaVITS Playground</h1>
	<h3>An Open Source Sinhala Text-to-Speech Model</h3>
	<h4>by Dialog Axiata PLC in collaboration with</h4>
	<h4>Dialog-University of Moratuwa Mobile Communications Research Laboratory</h4>
	</div>
	"""
	    )

	    with gr.Row():

	        # Left column: inputs + button
	        with gr.Column(scale=2):
	            with gr.Group(elem_id="input-group"):
	                gr.Markdown("<span class='label-text'>✍️ Input Text</span>")
	                sinhala_text = gr.Textbox(
	                    placeholder="මෙතන සිංහලෙන් ටයිප් කරන්න...",
	                    lines=6,
	                    show_label=False,
	                    elem_id="text-input",
	                )

	                # Voice selector: default to "Sanuki" when that model loaded,
	                # otherwise fall back to the first display name.
	                gr.Markdown("<span class='label-text'>🗣️ Choose Voice Model</span>")
	                speaker = gr.Dropdown(
	                    choices=list(SPEAKER_MAPPING.keys()),
	                    value=(
	                        INVERSE_MAPPING.get("Sanuki")
	                        if "Sanuki" in MODELS
	                        else list(SPEAKER_MAPPING.keys())[0]
	                    ),
	                    interactive=True,
	                    show_label=False,
	                    elem_id="voice-selector",
	                )

	            # Main action button
	            generate_btn = gr.Button(
	                "🔊 Generate Speech",
	                variant="primary",
	                size="lg",
	                elem_id="gen-btn",
	            )

	        # Right column: Player and Guidelines
	        with gr.Column(scale=2):
	            with gr.Group():
	                gr.Markdown("#### <span class='result-header'>🎧 Audio Result</span>")

	                audio_output = gr.Audio(
	                    label="",
	                    type="numpy",
	                    show_label=False,
	                    interactive=False,
	                    autoplay=False,
	                    elem_id="audio-player",
	                )

	                # Status area
	                status_text = gr.HTML(
	                    "<div class='status-message status-initial'>Enter text and press 'Generate Speech' to begin</div>"
	                )

	            # -------------------------
	            # Benchmarking Section
	            # -------------------------

	            # Hidden until a generation has produced audio.
	            with gr.Column(visible=False) as rating_section:
	                gr.Markdown("#### ⭐ Rate This Speech")

	                mos_rating = gr.Radio(
	                    choices=[
	                        ("1 - Very Bad", 1),
	                        ("2 - Bad", 2),
	                        ("3 - Fair", 3),
	                        ("4 - Good", 4),
	                        ("5 - Excellent", 5),
	                    ],
	                    label="How natural does this speech sound?",
	                    interactive=True,
	                )

	                comment_box = gr.Textbox(
	                    placeholder="Optional comment...",
	                    lines=2,
	                )

	                submit_rating_btn = gr.Button("📤 Submit", variant="secondary")

	                rating_status = gr.Markdown("")

	            # Guidelines
	            gr.HTML(
	                """
	<div class="guidelines">
	<strong>⚠️ Please Note:</strong>
	<ul>
	<li>Do <b>not</b> use <i>English letters</i> or <i>numerals</i> in the input. Use only <b>Sinhala Unicode</b> text.</li>
	<li>Use proper punctuation (.,?) to help the model with phrasing.</li>
	<li>Keep input short for a single request to speed up synthesizing.</li>
	</ul>
	<dl style="margin-top: 0.75rem;">
	<dt>* The speech quality depends on the trained dataset and may not sound completely natural.</dt>
	</dl>
	</div>
	"""
	            )

	    # Spacer
	    gr.HTML("<div style='height: 3rem;'></div>")

	    # Logo Section
	    with gr.Row(equal_height=True, elem_id="logo-row"):
	        # UoM Logo
	        gr.Image(
	            value="img/lablogo.png",
	            label="UoM Research Lab Logo",
	            show_label=False,
	            show_download_button=False,  # Removes the Download (arrow) button
	            show_share_button=False,  # Removes the Share button
	            show_fullscreen_button=False,  # Removes the Maximize/Fullscreen button
	            interactive=False,
	            elem_classes=["partner-logo"],  # Added class
	            container=False,
	        )

	    # Footer
	    gr.HTML(
	        """
	<div class='footer'>
	<i>This playground is developed by Kasun Ranasinghe (@KasunUoM)</i>
	</div>
	"""
	    )

	    # Event Handlers

	    def _completion_status(audio):
	        """Return the status HTML matching whether the player holds audio."""
	        if audio is None:
	            # Reuses the amber "generating" style as a warning banner so no
	            # additional CSS class is required.
	            return "<div class='status-message status-generating'>⚠️ No audio was generated. Please check the input text and try again.</div>"
	        return "<div class='status-message status-completed'>✅ Speech generated successfully! Use the player above to listen.</div>"

	    # Step 1: Set generating status
	    set_status = generate_btn.click(
	        fn=lambda: "<div class='status-message status-generating'>⏳ Generating speech, please wait...</div>",
	        inputs=[],
	        outputs=[status_text],
	    )

	    # Step 2: Generate speech
	    synth_call = set_status.then(
	        fn=generate_speech,
	        inputs=[sinhala_text, speaker],
	        outputs=[audio_output],
	    )

	    # Step 3: Report the outcome. The message depends on whether the player
	    # actually holds audio, so a failed generation is not reported as success.
	    synth_call.then(
	        fn=_completion_status,
	        inputs=[audio_output],
	        outputs=[status_text],
	    )

	    # Step 4: Show the rating form only when there is audio to rate
	    synth_call.then(
	        fn=lambda audio: gr.update(visible=audio is not None),
	        inputs=[audio_output],
	        outputs=[rating_section],
	    )

	    # Step 5: Connect Rating Button
	    submit_rating_btn.click(
	        fn=save_rating,
	        inputs=[mos_rating, comment_box, sinhala_text, speaker],
	        outputs=[rating_status, mos_rating],
	    )


	# Launch the app only when this file is executed as a script (not when imported),
	# using demo.launch() with its default arguments.
	if __name__ == "__main__":
	demo.launch()