SohomToom committed on
Commit caf3576 · verified · 1 Parent(s): 6711545

Update MeloTTS/melo/text/korean.py

Files changed (1)
  1. MeloTTS/melo/text/korean.py +191 -191
MeloTTS/melo/text/korean.py CHANGED
@@ -1,192 +1,192 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata

from transformers import AutoTokenizer

from . import punctuation, symbols


from num2words import num2words
- from melo.text.ko_dictionary import english_dictionary, etc_dictionary
+ from MeloTTS.melo.text.ko_dictionary import english_dictionary, etc_dictionary
from anyascii import anyascii
from jamo import hangul_to_jamo

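# Strip CJK ideographs, expand etc_dictionary entries and known English words, then lowercase.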
def normalize(text):
    text = text.strip()
    text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    text = text.lower()
    return text

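# Replace every dictionary key found in the text with its mapped reading.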
def normalize_with_dictionary(text, dic):
    if any(key in text for key in dic.keys()):
        pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
        return pattern.sub(lambda x: dic[x.group()], text)
    return text

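# Swap whole English words for their readings in english_dictionary; unknown words pass through.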
def normalize_english(text):
    def fn(m):
        word = m.group()
        if word in english_dictionary:
            return english_dictionary.get(word)
        return word

    text = re.sub("([A-Za-z]+)", fn, text)
    return text

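# g2pkk converter, created lazily on first use.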
g2p_kr = None
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
    """

    The input and output values look the same, but they are different in Unicode.

    example :

    input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
    output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)

    """
    global g2p_kr  # pylint: disable=global-statement
    if g2p_kr is None:
        from g2pkk import G2p

        g2p_kr = G2p()

    if character == "english":
        from anyascii import anyascii
        text = normalize(text)
        text = g2p_kr(text)
        text = anyascii(text)
        return text

    text = normalize(text)
    text = g2p_kr(text)
    text = list(hangul_to_jamo(text))  # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
    return "".join(text)

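# Module-level entry point matching the other MeloTTS language front-ends.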
def text_normalize(text):
    # res = unicodedata.normalize("NFKC", text)
    # res = japanese_convert_numbers_to_words(res)
    # # res = "".join([i for i in res if is_japanese_character(i)])
    # res = replace_punctuation(res)
    text = normalize(text)
    return text

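# Spread n_phone phonemes across n_word tokens as evenly as possible,
# e.g. distribute_phone(5, 2) -> [3, 2].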
def distribute_phone(n_phone, n_word):
    phones_per_word = [0] * n_word
    for task in range(n_phone):
        min_tasks = min(phones_per_word)
        min_index = phones_per_word.index(min_tasks)
        phones_per_word[min_index] += 1
    return phones_per_word


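# Korean BERT tokenizer; the same model_id is reused for BERT feature extraction below.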
# tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')

model_id = 'kykim/bert-kor-base'
tokenizer = AutoTokenizer.from_pretrained(model_id)

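# Group BERT sub-word pieces back into words, phonemize each word to jamo,
# and spread the phoneme counts over the pieces to build word2ph.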
def g2p(norm_text):
    tokenized = tokenizer.tokenize(norm_text)
    phs = []
    ph_groups = []
    for t in tokenized:
        if not t.startswith("#"):
            ph_groups.append([t])
        else:
            ph_groups[-1].append(t.replace("#", ""))
    word2ph = []
    for group in ph_groups:
        text = ""
        for ch in group:
            text += ch
        if text == '[UNK]':
            phs += ['_']
            word2ph += [1]
            continue
        elif text in punctuation:
            phs += [text]
            word2ph += [1]
            continue
        # import pdb; pdb.set_trace()
        # phonemes = japanese_text_to_phonemes(text)
        # text = g2p_kr(text)
        phonemes = korean_text_to_phonemes(text)
        # import pdb; pdb.set_trace()
        # # phonemes = [i for i in phonemes if i in symbols]
        # for i in phonemes:
        #     assert i in symbols, (group, norm_text, tokenized, i)
        phone_len = len(phonemes)
        word_len = len(group)

        aaa = distribute_phone(phone_len, word_len)
        assert len(aaa) == word_len
        word2ph += aaa

        phs += phonemes
    phones = ["_"] + phs + ["_"]
    tones = [0 for i in phones]
    word2ph = [1] + word2ph + [1]
    assert len(word2ph) == len(tokenized) + 2
    return phones, tones, word2ph

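# Reuses the japanese_bert helper, passing the Korean model_id defined above.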
def get_bert_feature(text, word2ph, device='cuda'):
    from . import japanese_bert
    return japanese_bert.get_bert_feature(text, word2ph, device=device, model_id=model_id)


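# Ad-hoc check: run the pipeline over a Genshin voice-line index and record any
# phonemes missing from `symbols`. A minimal usage sketch (hypothetical input string):
#   norm = text_normalize("안녕하세요")
#   phones, tones, word2ph = g2p(norm)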
if __name__ == "__main__":
    # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
    from text.symbols import symbols
    text = "전 제 일의 가치와 폰타인 대중들이 한 일의 의미를 잘 압니다. 앞으로도 전 제 일에 자부심을 갖고 살아갈 겁니다"
    import json

    # genshin_data = json.load(open('/data/zwl/workspace/StarRail_Datasets/Index & Scripts/Index/1.3/Korean.json'))
    genshin_data = json.load(open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json'))
    from tqdm import tqdm
    new_symbols = []
    for key, item in tqdm(genshin_data.items()):
        texts = item.get('voiceContent', '')
        if isinstance(texts, list):
            texts = ','.join(texts)
        if texts is None:
            continue
        if len(texts) == 0:
            continue

        # Normalize and phonemize each voice line from the index
        text = text_normalize(texts)
        phones, tones, word2ph = g2p(text)
        bert = get_bert_feature(text, word2ph)
        import pdb; pdb.set_trace()
        for ph in phones:
            if ph not in symbols and ph not in new_symbols:
                new_symbols.append(ph)
                print('update!, now symbols:')
                print(new_symbols)
                with open('korean_symbol.txt', 'w') as f:
                    f.write(f'{new_symbols}')


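# Commented-out pykakasi experiment carried over from the Japanese front-end: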
# if __name__ == '__main__':
#     from pykakasi import kakasi
#     # Initialize kakasi object
#     kakasi = kakasi()

#     # Set options for converting Chinese characters to Katakana
#     kakasi.setMode("J", "H")  # Chinese to Katakana
#     kakasi.setMode("K", "H")  # Hiragana to Katakana

#     # Convert Chinese characters to Katakana
#     conv = kakasi.getConverter()
#     katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?')  # Replace with your Chinese text

#     print(katakana_text)  # Output: ニーハオセカイ