@@ -44,27 +44,33 @@ TTS::TTS(const std::string& encoder_model_path,
4444 ReadTableFile (speaker2id, &speaker2id_);
4545}
4646
47- void TTS::Text2PhoneIds (const std::string& text,
47+ bool TTS::Text2PhoneIds (const std::string& text,
4848 std::vector<int64_t >* phone_ids) {
4949 phone_ids->clear ();
5050 // 1. TN
5151 std::string norm_text = tn_->Normalize (text);
52+ LOG (INFO) << text << " --TN--> " << norm_text;
5253 // 2. G2P: char => pinyin => phones => ids
5354 std::vector<std::string> phonemes;
5455 g2p_prosody_->Compute (norm_text, &phonemes);
5556 // 3. Convert to phone id
56- std::stringstream ss;
57- phone_ids->emplace_back (phone2id_[" sil" ]);
58- ss << " sil" ;
59- for (const auto & phone : phonemes) {
60- if (phone2id_.count (phone) == 0 ) {
61- LOG (ERROR) << " Can't find `" << phone << " ` in phone2id." ;
62- continue ;
57+ if (phonemes.size () > 0 ) {
58+ std::stringstream ss;
59+ phone_ids->emplace_back (phone2id_[" sil" ]);
60+ ss << " sil" ;
61+ for (const auto & phone : phonemes) {
62+ if (phone2id_.count (phone) == 0 ) {
63+ LOG (ERROR) << " Can't find `" << phone << " ` in phone2id." ;
64+ continue ;
65+ }
66+ ss << " " << phone;
67+ phone_ids->emplace_back (phone2id_[phone]);
6368 }
64- ss << " " << phone;
65- phone_ids->emplace_back (phone2id_[phone]);
69+ LOG (INFO) << " phone sequence " << ss.str ();
70+ return true ;
71+ } else {
72+ return false ;
6673 }
67- LOG (INFO) << " phone sequence " << ss.str ();
6874}
6975
7076void TTS::Synthesis (const std::string& text, const int sid,
@@ -73,10 +79,12 @@ void TTS::Synthesis(const std::string& text, const int sid,
7379 SentenceSegement (text, &text_arrs);
7480 for (const auto & text : text_arrs) {
7581 std::vector<int64_t > phonemes;
76- Text2PhoneIds (text, &phonemes);
77- std::vector<float > sub_audio;
78- vits_.Forward (phonemes, sid, &sub_audio);
79- audio->insert (audio->end (), sub_audio.begin (), sub_audio.end ());
82+ bool ok = Text2PhoneIds (text, &phonemes);
83+ if (ok) {
84+ std::vector<float > sub_audio;
85+ vits_.Forward (phonemes, sid, &sub_audio);
86+ audio->insert (audio->end (), sub_audio.begin (), sub_audio.end ());
87+ }
8088 }
8189}
8290
@@ -93,10 +101,23 @@ bool TTS::StreamSynthesis(std::vector<float>* audio) {
93101 return true ; // all text done
94102 }
95103 if (new_text_) {
96- std::vector<int64_t > phonemes;
97- Text2PhoneIds (text_arrs_[cur_text_idx_], &phonemes);
98- vits_.SetInput (phonemes, sid_);
99- new_text_ = false ;
104+ // 连续跳过转换失败的文本片段,直到找到有效的片段
105+ while (cur_text_idx_ < text_arrs_.size ()) {
106+ std::vector<int64_t > phonemes;
107+ bool ok = Text2PhoneIds (text_arrs_[cur_text_idx_], &phonemes);
108+ if (ok && !phonemes.empty ()) {
109+ // 找到有效的片段,设置输入并退出循环
110+ vits_.SetInput (phonemes, sid_);
111+ new_text_ = false ;
112+ break ;
113+ }
114+ // 转换失败或结果为空,跳过当前文本片段,继续下一个
115+ cur_text_idx_++;
116+ }
117+ // 如果所有片段都处理完了,返回 true
118+ if (cur_text_idx_ >= text_arrs_.size ()) {
119+ return true ;
120+ }
100121 }
101122 bool done = vits_.StreamDecode (audio);
102123 if (done) {
0 commit comments