Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| import warnings | |
| from pathlib import Path | |
| from kanjiconv import KanjiConv | |
| from pypinyin import lazy_pinyin | |
| from .resources.pinyin_dict import PINYIN_DICT | |
# Module-level KanjiConv instance, created once at import time with default
# settings; to_kana() calls its to_hiragana() method.
kanji_to_kana = KanjiConv()
# Small kana -> full-size counterpart. Besides the small ya/yu/yo, the table
# also holds the small vowels and small wa. The two strings are aligned
# position by position.
yoon_map = dict(
    zip(
        "ぁぃぅぇぉゃゅょゎ",
        "あいうえおやゆよわ",
    )
)
# ACE_phonemes
# Load every phoneme plan once at import time and keep the Mandarin ("zh")
# one, which pinyin_to_phonemes_opencpop() reads.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's default locale encoding.
with open(
    Path(__file__).parent / "resources" / "all_plans.json",
    "r",
    encoding="utf-8",
) as f:
    ace_phonemes_all_plans = json.load(f)

# Keep the first plan whose language is "zh". If no such plan exists,
# ace_phonemes_zh_plan is left undefined.
for plan in ace_phonemes_all_plans["plans"]:
    if plan["language"] == "zh":
        ace_phonemes_zh_plan = plan
        break
def preprocess_text(text: str, language: str) -> list[str]:
    """Convert text into a list of units for the given language.

    Args:
        text: Input text. ASCII spaces are removed before conversion.
        language: ``"mandarin"`` (handled by ``to_pinyin``) or
            ``"japanese"`` (handled by ``to_kana``).

    Returns:
        The list of strings produced by the language-specific converter.

    Raises:
        ValueError: If ``language`` is neither ``"mandarin"`` nor
            ``"japanese"``. The message names the rejected value.
    """
    text = text.replace(" ", "")
    if language == "mandarin":
        return to_pinyin(text)
    if language == "japanese":
        return to_kana(text)
    # Keep the original wording as a prefix, but say which value was rejected.
    raise ValueError(
        f"Other languages are not supported: got {language!r}, "
        "expected 'mandarin' or 'japanese'"
    )
def to_pinyin(text: str) -> list[str]:
    """Convert text to a flat list of pinyin syllables and special markers.

    The output of ``lazy_pinyin`` is walked chunk by chunk. A chunk whose
    first character is "S", "A" or "-" is treated as a run of special
    markers: every "-", "AP" and "SP" found in it becomes its own item, and
    anything else in that chunk is discarded. Every other chunk is kept
    unchanged.
    """
    tokens: list[str] = []
    for chunk in lazy_pinyin(text):
        if chunk[0] in ("S", "A", "-"):
            # Split a fused marker run such as "APSP-" into single markers.
            tokens.extend(re.findall(r"-|AP|SP", chunk))
        else:
            tokens.append(chunk)
    return tokens
# Kana grouped by the vowel kana that a following "ー" is rewritten to.
# Covers plain, voiced (dakuten) and semi-voiced (handakuten) kana as well as
# the small kana, so no separate small-kana normalisation step is needed.
_CHOUONPU_ROWS = (
    ("あ", "あかさたなはまやらわがざだばぱぁゃゎ"),
    ("い", "いきしちにひみりぎじぢびぴぃ"),
    ("う", "うくすつぬふむゆるぐずづぶぷゔぅゅ"),
    ("え", "えけせてねへめれげぜでべぺぇ"),
    # The o-row is prolonged with "う".
    ("う", "おこそとのほもよろをごぞどぼぽぉょ"),
)
_CHOUONPU_VOWELS = {
    kana: vowel for vowel, row in _CHOUONPU_ROWS for kana in row
}


def replace_chouonpu(hiragana_text: str) -> str:
    """Replace each long-vowel mark "ー" with an explicit vowel kana.

    The replacement vowel is chosen from the character emitted just before
    the mark, so consecutive marks repeat the same vowel. A mark at the very
    start of the text has nothing to prolong and is kept as is. If the
    preceding character is not in the vowel table (for example "ん"), that
    character is repeated.

    Args:
        hiragana_text: Text in hiragana, possibly containing "ー".

    Returns:
        The text with every non-leading "ー" replaced by a kana.
    """
    out: list[str] = []
    for char in hiragana_text:
        if char == "ー" and out:
            prev_char = out[-1]
            out.append(_CHOUONPU_VOWELS.get(prev_char, prev_char))
        else:
            out.append(char)
    return "".join(out)
def to_kana(text: str) -> list[str]:
    """Convert Japanese text to a list of kana units.

    ASCII spaces are stripped from the input, which is then converted to
    hiragana with the module-level ``kanji_to_kana`` converter. Long-vowel
    marks are expanded by ``replace_chouonpu``, the result is split on ASCII
    spaces, and each segment is cut into units. A unit is a single character,
    or a character together with the small kana (a key of ``yoon_map``) that
    directly follows it.

    Empty segments contribute nothing. A segment that ends in a
    kana + small-kana pair is handled like any other; previously both cases
    raised ``IndexError``.

    Args:
        text: Japanese text (kanji and/or kana).

    Returns:
        The kana units in reading order.
    """
    hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
    segments = replace_chouonpu(hiragana_text).split(" ")
    final_ls: list[str] = []
    for subword in segments:
        pos = 0
        length = len(subword)
        while pos < length:
            nxt = pos + 1
            if nxt < length and subword[nxt] in yoon_map:
                # Keep the small kana attached to the character before it.
                final_ls.append(subword[pos : nxt + 1])
                pos += 2
            else:
                final_ls.append(subword[pos])
                pos += 1
    return final_ls
def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
    """Convert a kana string to phonemes using ``pyopenjtalk.g2p``.

    The characters of ``kana`` are joined with single spaces before being
    passed to ``g2p``. Warnings raised during the call are recorded; if any
    of them mentions "No phoneme", a ``ValueError`` is raised instead of
    returning a result.

    Args:
        kana: Kana text to convert.

    Returns:
        The phoneme string returned by ``g2p``, split on single spaces.

    Raises:
        ValueError: If a recorded warning contains "No phoneme".
    """
    # Imported lazily, so pyopenjtalk is only required when this is called.
    import pyopenjtalk

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # add space between each character
        kana = " ".join(kana)
        # g2p returns one str with phonemes separated by spaces
        phoneme_str = pyopenjtalk.g2p(kana, kana=False)
        for item in caught:
            if "No phoneme" in str(item.message):
                raise ValueError(f"No phoneme found for {kana}. {item.message}")
    return phoneme_str.split(" ")
def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
    """Look up the phonemes for a pinyin syllable in the "zh" plan.

    The syllable is lower-cased and looked up in the plan's ``"dict"`` table.
    If it is not there, it is resolved through ``"syllable_alias"`` and the
    aliased entry of ``"dict"`` is returned.

    Args:
        pinyin: Pinyin syllable; matching is case-insensitive.

    Returns:
        The entry stored in the plan's ``"dict"`` table for the syllable.

    Raises:
        ValueError: If the syllable is neither a dict key nor an alias.
    """
    key = pinyin.lower()
    phoneme_table = ace_phonemes_zh_plan["dict"]
    if key in phoneme_table:
        return phoneme_table[key]

    aliases = ace_phonemes_zh_plan["syllable_alias"]
    if key in aliases:
        return phoneme_table[aliases[key]]

    raise ValueError(f"{key} not registered in Opencpop phoneme dict")
def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
    """Look up the phonemes for a pinyin syllable in ``PINYIN_DICT``.

    Args:
        pinyin: Pinyin syllable; it is lower-cased before the lookup.

    Returns:
        The ``PINYIN_DICT`` entry for the lower-cased syllable.

    Raises:
        ValueError: If the lower-cased syllable is not a key of
            ``PINYIN_DICT``.
    """
    syllable = pinyin.lower()
    if syllable not in PINYIN_DICT:
        raise ValueError(f"{syllable} not registered in ACE phoneme dict")
    return PINYIN_DICT[syllable]