Skip to content

Commit d6403e7

Browse files
authored
[fix] fix segmentation error, empty sentence & prosody mapping (#250)
1 parent 79cb146 commit d6403e7

3 files changed

Lines changed: 64 additions & 24 deletions

File tree

runtime/core/frontend/g2p_prosody.cc

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <fstream>
1919
#include <memory>
2020
#include <string>
21+
#include <unordered_map>
2122
#include <utility>
2223
#include <vector>
2324

@@ -31,6 +32,16 @@
3132

3233
namespace wetts {
3334

35+
// 标点到韵律值的映射规则
36+
// 逗号、冒号映射为 #3,顿号映射为 #2
37+
static const std::unordered_map<std::string, std::string> kPunctProsodyMap = {
38+
{",", "#3"}, // 英文逗号
39+
{",", "#3"}, // 中文逗号
40+
{":", "#3"}, // 英文冒号
41+
{":", "#3"}, // 中文冒号
42+
{"、", "#2"}, // 中文顿号
43+
};
44+
3445
G2pProsody::G2pProsody(const std::string& g2p_prosody_model,
3546
const std::string& g2p_prosody_vocab,
3647
const std::string& lexicon_file,
@@ -200,15 +211,23 @@ void G2pProsody::Compute(const std::string& str,
200211
phonemes->insert(phonemes->end(), pinyin.begin(), pinyin.end());
201212
phonemes->emplace_back(prosody[0]);
202213
} else {
203-
// Not English, Not in Lexicon, ignore now
204-
// TODO(Binbin Zhang): Deal punct
205-
LOG(WARNING) << "Ignore word " << word;
214+
// Deal with OOV & punctuation
215+
auto it = kPunctProsodyMap.find(word);
216+
if (it != kPunctProsodyMap.end()) {
217+
if (!phonemes->empty()) {
218+
phonemes->back() = it->second;
219+
}
220+
} else {
221+
LOG(WARNING) << "Ignore word " << word;
222+
}
206223
}
207224
VLOG(2) << "Word, g2p & prosody: " << word << " "
208225
<< pinyins[idx] << " " << prosodys[idx];
209226
}
210227
// Last token should be "#4"
211-
phonemes->back() = "#4";
228+
if (phonemes->size() > 0) {
229+
phonemes->back() = "#4";
230+
}
212231
}
213232

214233

runtime/core/model/tts.cc

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -44,27 +44,33 @@ TTS::TTS(const std::string& encoder_model_path,
4444
ReadTableFile(speaker2id, &speaker2id_);
4545
}
4646

47-
void TTS::Text2PhoneIds(const std::string& text,
47+
bool TTS::Text2PhoneIds(const std::string& text,
4848
std::vector<int64_t>* phone_ids) {
4949
phone_ids->clear();
5050
// 1. TN
5151
std::string norm_text = tn_->Normalize(text);
52+
LOG(INFO) << text << " --TN--> " << norm_text;
5253
// 2. G2P: char => pinyin => phones => ids
5354
std::vector<std::string> phonemes;
5455
g2p_prosody_->Compute(norm_text, &phonemes);
5556
// 3. Convert to phone id
56-
std::stringstream ss;
57-
phone_ids->emplace_back(phone2id_["sil"]);
58-
ss << "sil";
59-
for (const auto& phone : phonemes) {
60-
if (phone2id_.count(phone) == 0) {
61-
LOG(ERROR) << "Can't find `" << phone << "` in phone2id.";
62-
continue;
57+
if (phonemes.size() > 0) {
58+
std::stringstream ss;
59+
phone_ids->emplace_back(phone2id_["sil"]);
60+
ss << "sil";
61+
for (const auto& phone : phonemes) {
62+
if (phone2id_.count(phone) == 0) {
63+
LOG(ERROR) << "Can't find `" << phone << "` in phone2id.";
64+
continue;
65+
}
66+
ss << " " << phone;
67+
phone_ids->emplace_back(phone2id_[phone]);
6368
}
64-
ss << " " << phone;
65-
phone_ids->emplace_back(phone2id_[phone]);
69+
LOG(INFO) << "phone sequence " << ss.str();
70+
return true;
71+
} else {
72+
return false;
6673
}
67-
LOG(INFO) << "phone sequence " << ss.str();
6874
}
6975

7076
void TTS::Synthesis(const std::string& text, const int sid,
@@ -73,10 +79,12 @@ void TTS::Synthesis(const std::string& text, const int sid,
7379
SentenceSegement(text, &text_arrs);
7480
for (const auto& text : text_arrs) {
7581
std::vector<int64_t> phonemes;
76-
Text2PhoneIds(text, &phonemes);
77-
std::vector<float> sub_audio;
78-
vits_.Forward(phonemes, sid, &sub_audio);
79-
audio->insert(audio->end(), sub_audio.begin(), sub_audio.end());
82+
bool ok = Text2PhoneIds(text, &phonemes);
83+
if (ok) {
84+
std::vector<float> sub_audio;
85+
vits_.Forward(phonemes, sid, &sub_audio);
86+
audio->insert(audio->end(), sub_audio.begin(), sub_audio.end());
87+
}
8088
}
8189
}
8290

@@ -93,10 +101,23 @@ bool TTS::StreamSynthesis(std::vector<float>* audio) {
93101
return true; // all text done
94102
}
95103
if (new_text_) {
96-
std::vector<int64_t> phonemes;
97-
Text2PhoneIds(text_arrs_[cur_text_idx_], &phonemes);
98-
vits_.SetInput(phonemes, sid_);
99-
new_text_ = false;
104+
// 连续跳过转换失败的文本片段,直到找到有效的片段
105+
while (cur_text_idx_ < text_arrs_.size()) {
106+
std::vector<int64_t> phonemes;
107+
bool ok = Text2PhoneIds(text_arrs_[cur_text_idx_], &phonemes);
108+
if (ok && !phonemes.empty()) {
109+
// 找到有效的片段,设置输入并退出循环
110+
vits_.SetInput(phonemes, sid_);
111+
new_text_ = false;
112+
break;
113+
}
114+
// 转换失败或结果为空,跳过当前文本片段,继续下一个
115+
cur_text_idx_++;
116+
}
117+
// 如果所有片段都处理完了,返回 true
118+
if (cur_text_idx_ >= text_arrs_.size()) {
119+
return true;
120+
}
100121
}
101122
bool done = vits_.StreamDecode(audio);
102123
if (done) {

runtime/core/model/tts.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class TTS {
4848
// true: synthesis done, stop call
4949
// false: synthesis work in progress, you should continue to call me
5050
bool StreamSynthesis(std::vector<float>* audio);
51-
void Text2PhoneIds(const std::string& text, std::vector<int64_t>* phone_ids);
51+
bool Text2PhoneIds(const std::string& text, std::vector<int64_t>* phone_ids);
5252
int GetSid(const std::string& name);
5353
int sampling_rate() const { return sampling_rate_; }
5454

0 commit comments

Comments
 (0)