Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions runtime/core/frontend/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ add_library(frontend STATIC
g2p_en.cc
g2p_prosody.cc
word_break.cc
sentence_break.cc
)

target_link_libraries(frontend PUBLIC onnx_model)
113 changes: 113 additions & 0 deletions runtime/core/frontend/sentence_break.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright (c) 2025 GPT(binbzha@qq.com)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "frontend/sentence_break.h"

#include <cctype>
#include <string>
#include <unordered_set>
#include <vector>

#include "glog/logging.h"

#include "utils/string.h"

namespace wetts {

// Returns true when `ch` is a character that terminates a sentence or clause.
//
// `ch` is expected to hold exactly one UTF-8 encoded character (one or more
// bytes). Recognised delimiters are the ASCII and the full-width (Chinese)
// forms of the full stop, semicolon, exclamation mark and question mark, plus
// the line-break characters "\n" and "\r". Commas are deliberately not
// delimiters. Any other string, including the empty string, yields false.
static bool IsSentenceDelimiterChar(const std::string& ch) {
  static const std::unordered_set<std::string> kDelims = {
      // ASCII punctuation.
      ".", ";", "!", "?",
      // Full-width (CJK) punctuation.
      "。", ";", "!", "?",
      // Line breaks.
      "\n", "\r"};
  return kDelims.find(ch) != kDelims.end();
}

// Splits `text` into sentences/clauses and stores them in `*sentences`.
//
// text:           input text; it is iterated character by character using
//                 SplitUTF8StringToChars (presumably one UTF-8 code point per
//                 element - confirm against utils/string.h).
// sentences:      output vector. It is cleared first; if it is nullptr the
//                 function returns without doing anything.
// max_clause_len: maximum number of characters per emitted piece; 0 disables
//                 the length limit.
//
// A piece ends at every delimiter accepted by IsSentenceDelimiterChar; the
// delimiter itself is dropped. Each piece is passed through Trim before being
// stored, and pieces that are empty after trimming are skipped. When a piece
// reaches `max_clause_len` characters without a delimiter, it is cut at the
// most recent "safe" position (a space/tab or an ASCII word boundary) so that
// ASCII words are not broken; if no such position exists the whole buffer is
// emitted as-is.
void Segement(const std::string& text, std::vector<std::string>* sentences,
              size_t max_clause_len) {
  if (sentences == nullptr) {
    return;
  }
  sentences->clear();
  if (text.empty()) {
    return;
  }

  std::string current;
  current.reserve(text.size());
  size_t current_chars = 0;
  // Byte offset inside `current` of the most recent safe cut position.
  // 0 means "no safe position recorded".
  size_t last_safe_index = 0;
  bool in_ascii_word = false;

  std::vector<std::string> chars;
  SplitUTF8StringToChars(text, &chars);
  for (const auto& ch : chars) {
    if (IsSentenceDelimiterChar(ch)) {
      // Hard boundary: flush the buffer and drop the delimiter itself.
      std::string trimmed = Trim(current);
      if (!trimmed.empty()) {
        sentences->emplace_back(trimmed);
      }
      current.clear();
      current_chars = 0;
      last_safe_index = 0;
      in_ascii_word = false;
      continue;
    }

    // Record safe cut positions so that ASCII words are never split.
    const bool is_ascii_alnum =
        (ch.size() == 1) && std::isalnum(static_cast<unsigned char>(ch[0]));
    const bool is_space = (ch == " " || ch == "\t");
    if (is_space) {
      last_safe_index = current.size();
      in_ascii_word = false;
    } else if (!in_ascii_word && is_ascii_alnum) {
      // A new ASCII word starts here.
      last_safe_index = current.size();
      in_ascii_word = true;
    } else if (in_ascii_word && !is_ascii_alnum) {
      // The ASCII word ended just before this character.
      last_safe_index = current.size();
      in_ascii_word = false;
    }

    current += ch;
    ++current_chars;

    if (max_clause_len > 0 && current_chars >= max_clause_len) {
      if (last_safe_index > 0) {
        std::string left = Trim(current.substr(0, last_safe_index));
        if (!left.empty()) {
          sentences->emplace_back(left);
        }
        // Keep everything after the cut as the start of the next piece.
        current.erase(0, last_safe_index);
        current_chars = UTF8StringLength(current);
        last_safe_index = 0;
        // The remainder always ends with `ch`, so it is inside an ASCII word
        // exactly when `ch` is an ASCII letter/digit. Resetting this to false
        // would make the next letter register a cut position in the middle of
        // the carried-over word.
        in_ascii_word = is_ascii_alnum;
      } else {
        // No safe position (e.g. an over-long English word): force a cut.
        std::string left = Trim(current);
        if (!left.empty()) {
          sentences->emplace_back(left);
        }
        current.clear();
        current_chars = 0;
        in_ascii_word = false;
      }
    }
  }

  // Flush whatever is left after the last delimiter.
  std::string trimmed = Trim(current);
  if (!trimmed.empty()) {
    sentences->emplace_back(trimmed);
  }
}

} // namespace wetts
30 changes: 30 additions & 0 deletions runtime/core/frontend/sentence_break.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright (c) 2025 Binbin Zhang(binbzha@qq.com)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef FRONTEND_SENTENCE_BREAK_H_
#define FRONTEND_SENTENCE_BREAK_H_

#include <string>
#include <vector>

namespace wetts {

// Splits the input text at sentence boundaries and stores the resulting
// pieces in `sentences`.
// max_clause_len: maximum clause length, counted in UTF-8 characters;
// defaults to 64, and 0 means no limit.
void Segement(const std::string& text, std::vector<std::string>* sentences,
size_t max_clause_len = 64);

} // namespace wetts

#endif // FRONTEND_SENTENCE_BREAK_H_
13 changes: 13 additions & 0 deletions runtime/core/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,16 @@ target_link_libraries(word_break_test
)

add_test(NAME word_break_test COMMAND word_break_test)

# Unit test executable for the sentence splitter in the frontend library,
# registered with CTest under the same name.
add_executable(sentence_break_test sentence_break_test.cc)

target_link_libraries(sentence_break_test PRIVATE frontend gtest gtest_main)

add_test(NAME sentence_break_test COMMAND sentence_break_test)
60 changes: 60 additions & 0 deletions runtime/core/test/sentence_break_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Copyright (c) 2025 Binbin Zhang(binbzha@qq.com)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "frontend/sentence_break.h"

#include <gtest/gtest.h>

#include <string>
#include <vector>

using wetts::Segement;

// Chinese text: the full stop, question mark and exclamation mark each end a
// sentence, while the comma does not cause a split.
TEST(SentenceBreakTest, ChinesePunctuations) {
  const std::vector<std::string> expected{"我爱编程,但是今天下雨了", "明天呢",
                                          "好吧"};
  std::vector<std::string> actual;
  Segement("我爱编程,但是今天下雨了。明天呢?好吧!", &actual);
  ASSERT_EQ(actual, expected);
}

// English text: '!', '?', ';' and '.' end a piece and are dropped, while the
// comma does not cause a split.
TEST(SentenceBreakTest, EnglishPunctuations) {
  const std::vector<std::string> expected{"Hello, world", "Are you OK", "Yes",
                                          "good"};
  std::vector<std::string> actual;
  Segement("Hello, world! Are you OK? Yes; good.", &actual);
  ASSERT_EQ(actual, expected);
}

// No punctuation at all: splitting relies purely on the maximum length, and
// English words must stay intact.
TEST(SentenceBreakTest, MaxLengthSplit) {
  const std::vector<std::string> expected{"abc", "def", "ghi", "jkl"};
  std::vector<std::string> actual;
  Segement("abc def ghi jkl", &actual, 4);
  ASSERT_EQ(actual, expected);
}

// Chinese text has no spaces, so once the maximum length is reached a forced
// split is allowed (no English words are involved).
TEST(SentenceBreakTest, ChineseMaxLengthSplit) {
  // The input consists of 6 Chinese characters.
  const std::vector<std::string> expected{"我爱编", "程学习"};
  std::vector<std::string> actual;
  Segement("我爱编程学习", &actual, 3);
  ASSERT_EQ(actual, expected);
}