Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| import time | |
| import logging | |
| from pydub import AudioSegment | |
| from phonemizer.backend.espeak.wrapper import EspeakWrapper | |
| from models import build_model | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Hugging Face Spaces setup: directory the Kokoro checkpoint and voice files
# are read from. It is created at import time if it does not exist yet.
MODEL_DIR = "./kokoro"
os.makedirs(name=MODEL_DIR, exist_ok=True)

# Configure espeak-ng for the Hugging Face environment by pointing phonemizer
# at the shared library explicitly.
_ESPEAK_NG_LIBRARY = '/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1'
EspeakWrapper.set_library(_ESPEAK_NG_LIBRARY)
class TTSEngine:
    """Kokoro text-to-speech engine that writes synthesized speech to WAV files.

    Construction selects CUDA when available (CPU otherwise), verifies that the
    required files exist under ``MODEL_DIR`` and loads the model checkpoint and
    the ``af_bella`` voice.
    """

    # Longest text (in characters, truncation marker included) that is
    # handed to the synthesizer.
    MAX_TEXT_LENGTH = 500
    # Suffix appended to text that had to be shortened.
    TRUNCATION_MARKER = "[TRUNCATED]"
    # Directory the generated WAV files are written to.
    OUTPUT_DIR = "temp/outputs"
    # Frame rate (Hz) declared for the exported audio.
    SAMPLE_RATE = 24000

    def __init__(self):
        """Pick a device, check the model files and load model and voice.

        Raises:
            FileNotFoundError: If a required model file is missing.
        """
        logger.info("Initializing TTS Engine")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Using device: %s", self.device)

        # Fail early with setup instructions instead of a loader error.
        self._verify_model_files()

        logger.info("Loading Kokoro model")
        self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)

        logger.info("Loading voice model")
        # NOTE(review): torch.load unpickles the file; only load voice files
        # obtained from a trusted source.
        self.voice = torch.load(
            f"{MODEL_DIR}/voices/af_bella.pt",
            map_location=self.device,
        )
        logger.info("TTS engine initialized")

    def _verify_model_files(self):
        """Ensure the required model files exist.

        Raises:
            FileNotFoundError: Listing every missing file together with the
                clone command that provides them.
        """
        required_files = [
            f"{MODEL_DIR}/kokoro-v0_19.pth",
            f"{MODEL_DIR}/voices/af_bella.pt",
        ]
        missing = [f for f in required_files if not os.path.exists(f)]
        if missing:
            logger.error("Missing model files: %s", missing)
            raise FileNotFoundError(
                f"Missing model files: {missing}\n"
                "Add this to your Hugging Face Space settings:\n"
                "App setup -> Clone Kokoro repository: "
                "git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro"
            )

    @classmethod
    def _truncate_text(cls, text: str) -> str:
        """Return *text* limited to ``MAX_TEXT_LENGTH`` characters.

        Text within the limit is returned unchanged. Longer text is cut so
        that the result, ``TRUNCATION_MARKER`` included, is exactly
        ``MAX_TEXT_LENGTH`` characters long.
        """
        if len(text) <= cls.MAX_TEXT_LENGTH:
            return text
        keep = cls.MAX_TEXT_LENGTH - len(cls.TRUNCATION_MARKER)
        return text[:keep] + cls.TRUNCATION_MARKER

    @staticmethod
    def _to_pcm16_bytes(audio) -> bytes:
        """Convert synthesized samples to raw 16-bit PCM bytes.

        Accepts anything ``torch.as_tensor`` understands (a tensor on any
        device, a NumPy array, a sequence of numbers). Floating-point samples
        are clamped to [-1.0, 1.0] and scaled to the int16 range; int16
        samples pass through unchanged.
        """
        samples = torch.as_tensor(audio).detach().cpu().flatten()
        if samples.is_floating_point():
            # assumes float samples are normalised to [-1.0, 1.0] — TODO confirm
            samples = samples.clamp(-1.0, 1.0) * 32767.0
        return samples.to(torch.int16).contiguous().numpy().tobytes()

    def generate_speech(self, text: str, language: str = "zh") -> str:
        """Synthesize *text* and return the path of the WAV file written.

        Args:
            text: Text to speak. Input longer than ``MAX_TEXT_LENGTH`` is
                truncated and suffixed with ``TRUNCATION_MARKER``.
            language: Accepted for interface compatibility.
                NOTE(review): the value is not forwarded; synthesis always
                runs with ``lang='en-us'``. Confirm whether that is intended
                given the "zh" default.

        Returns:
            Path of the generated WAV file inside ``OUTPUT_DIR``.

        Raises:
            Exception: Any failure is logged with its traceback and re-raised.
        """
        logger.info("Generating speech for text length: %d", len(text))
        try:
            # Imported on first use rather than at module import time.
            from kokoro import generate_full

            if len(text) > self.MAX_TEXT_LENGTH:
                logger.warning("Truncating long text (%d characters)", len(text))
                text = self._truncate_text(text)

            logger.info("Starting audio generation")
            audio, _ = generate_full(
                self.model,
                text,
                self.voice,
                lang='en-us',
                # A smaller max_len is passed when running on CPU.
                max_len=200 if self.device == "cpu" else 500,
            )

            # The export below does not create missing directories.
            os.makedirs(self.OUTPUT_DIR, exist_ok=True)
            output_path = f"{self.OUTPUT_DIR}/output_{int(time.time())}.wav"
            logger.info("Saving audio to %s", output_path)
            AudioSegment(
                self._to_pcm16_bytes(audio),
                frame_rate=self.SAMPLE_RATE,
                sample_width=2,  # bytes per sample, i.e. 16-bit PCM
                channels=1,
            ).export(output_path, format="wav")

            logger.info("Audio generation complete: %s", output_path)
            return output_path
        except Exception as e:
            logger.exception("TTS generation failed: %s", e)
            raise
# Initialize TTS engine once: the instance is created on first use and reused
# afterwards, so the model and voice are not reloaded on every call.
_tts_engine = None


def get_tts_engine():
    """Return the shared ``TTSEngine``, creating it on the first call.

    Returns:
        The single ``TTSEngine`` instance cached at module level.

    Raises:
        Whatever ``TTSEngine()`` raises while loading. Nothing is cached in
        that case, so the next call retries.
    """
    global _tts_engine
    if _tts_engine is None:
        _tts_engine = TTSEngine()
    return _tts_engine
def generate_speech(text: str, language: str = "zh") -> str:
    """Public interface for TTS generation.

    Obtains an engine from ``get_tts_engine()`` and delegates to its
    ``generate_speech`` method, passing both arguments through and returning
    its result.
    """
    engine = get_tts_engine()
    result = engine.generate_speech(text, language)
    return result