wenet-e2e · robin1001 · Nov 4, 2025 · Nov 4, 2025
diff --git a/runtime/core/frontend/CMakeLists.txt b/runtime/core/frontend/CMakeLists.txt
@@ -3,6 +3,7 @@ add_library(frontend STATIC
   g2p_en.cc
   g2p_prosody.cc
   word_break.cc
+  sentence_break.cc
 )
 
 target_link_libraries(frontend PUBLIC onnx_model)
diff --git a/runtime/core/frontend/sentence_break.cc b/runtime/core/frontend/sentence_break.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2025 GPT(binbzha@qq.com)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/sentence_break.h"
+
+#include <cctype>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "utils/string.h"
+
+namespace wetts {
+
+static bool IsSentenceDelimiterChar(const std::string& ch) {
+  // English and Chinese (full-width) sentence/clause delimiters, plus newlines.
+  static const std::unordered_set<std::string> kTerminators = {
+      ".", ";", "!", "?", "。", "；", "！", "？", "\n", "\r"};
+  return kTerminators.count(ch) > 0;
+}
+
+// Splits |text| into clauses and stores them in |sentences|, which is cleared
+// first (a null |sentences| makes the call a no-op). A clause ends at every
+// sentence delimiter and, when |max_clause_len| > 0, as soon as it holds that
+// many UTF-8 characters. A length-based cut is made at the latest recorded
+// safe point (whitespace or an ASCII word edge) so that ASCII words stay
+// intact; without a safe point the clause is cut where it stands. Every
+// clause is passed through Trim() and dropped if nothing remains.
+void Segement(const std::string& text, std::vector<std::string>* sentences,
+              size_t max_clause_len) {
+  if (sentences == nullptr) return;
+  sentences->clear();
+  if (text.empty()) return;
+
+  // Appends the Trim()-ed clause to the output unless nothing is left of it.
+  const auto emit = [sentences](const std::string& clause) {
+    std::string trimmed = Trim(clause);
+    if (!trimmed.empty()) sentences->emplace_back(trimmed);
+  };
+
+  std::string current;  // clause being accumulated
+  current.reserve(text.size());
+  size_t current_chars = 0;    // UTF-8 characters currently held in |current|
+  size_t last_safe_index = 0;  // byte offset of the latest safe cut; 0 = none
+  bool in_ascii_word = false;  // true iff |current| ends with an ASCII alnum
+
+  std::vector<std::string> chars;
+  SplitUTF8StringToChars(text, &chars);
+  for (const auto& ch : chars) {
+    if (IsSentenceDelimiterChar(ch)) {
+      // Hard boundary: flush the clause; the delimiter itself is discarded.
+      emit(current);
+      current.clear();
+      current_chars = 0;
+      last_safe_index = 0;
+      in_ascii_word = false;
+      continue;
+    }
+    // Remember safe cut points: before whitespace and wherever an ASCII
+    // letter/digit run starts or ends. The offset is taken before |ch| is
+    // appended, so |ch| always lands on the right-hand side of such a cut.
+    const bool is_ascii_alnum =
+        (ch.size() == 1) && std::isalnum(static_cast<unsigned char>(ch[0]));
+    const bool is_space = (ch == " " || ch == "\t");
+    if (is_space || is_ascii_alnum != in_ascii_word) {
+      last_safe_index = current.size();
+      in_ascii_word = is_ascii_alnum;
+    }
+
+    current += ch;
+    ++current_chars;
+
+    if (max_clause_len == 0 || current_chars < max_clause_len) {
+      continue;  // limit disabled or not reached yet
+    }
+    if (last_safe_index > 0) {
+      emit(current.substr(0, last_safe_index));
+      current.erase(0, last_safe_index);
+      current_chars = UTF8StringLength(current);
+      last_safe_index = 0;
+      // The kept tail may end inside an ASCII word. Clearing the flag here
+      // would make the next letter record a cut point in the middle of that
+      // word (e.g. "abcd efgh" with a limit of 4 came out as "abcd", "efg",
+      // "h"), so derive the flag from the tail's last byte instead. Bytes
+      // >= 0x80 belong to multi-byte UTF-8 sequences and are never ASCII.
+      const unsigned char tail = current.empty() ? ' ' : current.back();
+      in_ascii_word = tail < 0x80 && std::isalnum(tail);
+    } else {
+      // No safe point (e.g. one over-long ASCII word): cut right here.
+      emit(current);
+      current.clear();
+      current_chars = 0;
+      in_ascii_word = false;
+    }
+  }
+
+  emit(current);
+}
+
+}  // namespace wetts
diff --git a/runtime/core/frontend/sentence_break.h b/runtime/core/frontend/sentence_break.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2025 Binbin Zhang(binbzha@qq.com)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FRONTEND_SENTENCE_BREAK_H_
+#define FRONTEND_SENTENCE_BREAK_H_
+
+#include <string>
+#include <vector>
+
+namespace wetts {
+
+// Splits the input text into sentences at sentence boundaries.
+// max_clause_len: clause length cap in UTF-8 chars (default 64); 0 = no limit.
+void Segement(const std::string& text, std::vector<std::string>* sentences,
+              size_t max_clause_len = 64);
+
+}  // namespace wetts
+
+#endif  // FRONTEND_SENTENCE_BREAK_H_
diff --git a/runtime/core/test/CMakeLists.txt b/runtime/core/test/CMakeLists.txt
@@ -12,3 +12,16 @@ target_link_libraries(word_break_test
 )
 
 add_test(NAME word_break_test COMMAND word_break_test)
+
+add_executable(sentence_break_test  # gtest binary for frontend/sentence_break
+  sentence_break_test.cc
+)
+
+target_link_libraries(sentence_break_test
+  PRIVATE
+    frontend    # static library that compiles sentence_break.cc
+    gtest
+    gtest_main  # supplies main(); the test source defines none
+)
+
+add_test(NAME sentence_break_test COMMAND sentence_break_test)  # run by ctest
diff --git a/runtime/core/test/sentence_break_test.cc b/runtime/core/test/sentence_break_test.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2025 Binbin Zhang(binbzha@qq.com)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/sentence_break.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <vector>
+
+using wetts::Segement;
+
+TEST(SentenceBreakTest, ChinesePunctuations) {
+  const std::string input = "我爱编程，但是今天下雨了。明天呢？好吧！";
+  std::vector<std::string> actual;
+  Segement(input, &actual);
+  // The full-width comma is not a sentence delimiter, so it stays in place.
+  const std::vector<std::string> expected = {"我爱编程，但是今天下雨了",
+                                             "明天呢", "好吧"};
+  ASSERT_EQ(actual, expected);
+}
+
+TEST(SentenceBreakTest, EnglishPunctuations) {
+  const std::string input = "Hello, world! Are you OK? Yes; good.";
+  std::vector<std::string> actual;
+  Segement(input, &actual);
+  // The comma is not a sentence delimiter, so "Hello, world" stays together.
+  const std::vector<std::string> expected = {"Hello, world", "Are you OK",
+                                             "Yes", "good"};
+  ASSERT_EQ(actual, expected);
+}
+
+TEST(SentenceBreakTest, MaxLengthSplit) {
+  // No punctuation: cuts come from the length limit and must not split words.
+  const std::string input = "abc def ghi jkl";
+  std::vector<std::string> actual;
+  Segement(input, &actual, 4);
+  const std::vector<std::string> expected = {"abc", "def", "ghi", "jkl"};
+  ASSERT_EQ(actual, expected);
+}
+
+TEST(SentenceBreakTest, ChineseMaxLengthSplit) {
+  // Chinese text has no spaces, so reaching the limit forces a cut in place.
+  const std::string input = "我爱编程学习";  // 6 Chinese characters
+  std::vector<std::string> actual;
+  Segement(input, &actual, 3);
+  const std::vector<std::string> expected = {"我爱编", "程学习"};
+  ASSERT_EQ(actual, expected);
+}