jhansss's picture
bug fixed
7a23964
raw
history blame
4.86 kB
import json
import re
import warnings
from pathlib import Path
from kanjiconv import KanjiConv
from pypinyin import lazy_pinyin
from .resources.pinyin_dict import PINYIN_DICT
kanji_to_kana = KanjiConv()
yoon_map = {
"ぁ": "あ",
"ぃ": "い",
"ぅ": "う",
"ぇ": "え",
"ぉ": "お",
"ゃ": "や",
"ゅ": "ゆ",
"ょ": "よ",
"ゎ": "わ",
}
# ACE_phonemes
with open(Path(__file__).parent / "resources" / "all_plans.json", "r") as f:
ace_phonemes_all_plans = json.load(f)
for plan in ace_phonemes_all_plans["plans"]:
if plan["language"] == "zh":
ace_phonemes_zh_plan = plan
break
def preprocess_text(text: str, language: str) -> list[str]:
text = text.replace(" ", "")
if language == "mandarin":
text_list = to_pinyin(text)
elif language == "japanese":
text_list = to_kana(text)
else:
raise ValueError(f"Other languages are not supported")
return text_list
def to_pinyin(text: str) -> list[str]:
pinyin_list = lazy_pinyin(text)
text_list = []
for text in pinyin_list:
if text[0] == "S" or text[0] == "A" or text[0] == "-":
sp_strs = re.findall(r"-|AP|SP", text)
for phn in sp_strs:
text_list.append(phn)
else:
text_list.append(text)
return text_list
def replace_chouonpu(hiragana_text: str) -> str:
"""process「ー」since the previous packages didn't support"""
vowels = {
"あ": "あ",
"い": "い",
"う": "う",
"え": "え",
"お": "う",
"か": "あ",
"き": "い",
"く": "う",
"け": "え",
"こ": "う",
"さ": "あ",
"し": "い",
"す": "う",
"せ": "え",
"そ": "う",
"た": "あ",
"ち": "い",
"つ": "う",
"て": "え",
"と": "う",
"な": "あ",
"に": "い",
"ぬ": "う",
"ね": "え",
"の": "う",
"は": "あ",
"ひ": "い",
"ふ": "う",
"へ": "え",
"ほ": "う",
"ま": "あ",
"み": "い",
"む": "う",
"め": "え",
"も": "う",
"や": "あ",
"ゆ": "う",
"よ": "う",
"ら": "あ",
"り": "い",
"る": "う",
"れ": "え",
"ろ": "う",
"わ": "あ",
"を": "う",
}
new_text = []
for i, char in enumerate(hiragana_text):
if char == "ー" and i > 0:
prev_char = new_text[-1]
if prev_char in yoon_map:
prev_char = yoon_map[prev_char]
new_text.append(vowels.get(prev_char, prev_char))
else:
new_text.append(char)
return "".join(new_text)
def to_kana(text: str) -> list[str]:
hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ")
final_ls = []
for subword in hiragana_text_wl:
sl_prev = 0
for i in range(len(subword) - 1):
if sl_prev >= len(subword) - 1:
break
sl = sl_prev + 1
if subword[sl] in yoon_map:
final_ls.append(subword[sl_prev : sl + 1])
sl_prev += 2
else:
final_ls.append(subword[sl_prev])
sl_prev += 1
final_ls.append(subword[sl_prev])
return final_ls
def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
import pyopenjtalk
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
# add space between each character
kana = " ".join(list(kana))
# phones is a str object separated by space
phones = pyopenjtalk.g2p(kana, kana=False)
if len(w) > 0:
for warning in w:
if "No phoneme" in str(warning.message):
raise ValueError(f"No phoneme found for {kana}. {warning.message}")
phones = phones.split(" ")
return phones
def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
pinyin = pinyin.lower()
if pinyin in ace_phonemes_zh_plan["dict"]:
phns = ace_phonemes_zh_plan["dict"][pinyin]
return phns
elif pinyin in ace_phonemes_zh_plan["syllable_alias"]:
phns = ace_phonemes_zh_plan["dict"][
ace_phonemes_zh_plan["syllable_alias"][pinyin]
]
return phns
else:
raise ValueError(f"{pinyin} not registered in Opencpop phoneme dict")
def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
pinyin = pinyin.lower()
if pinyin in PINYIN_DICT:
phns = PINYIN_DICT[pinyin]
return phns
else:
raise ValueError(f"{pinyin} not registered in ACE phoneme dict")