arc53 · Divyansh1552005 · Apr 4, 2026 · Apr 4, 2026
diff --git a/tests/stt/test_live_session_utils.py b/tests/stt/test_live_session_utils.py
@@ -0,0 +1,230 @@
+"""
+Unit tests for the internal utility functions of the live Speech-to-Text (STT) session logic.
+These tests ensure that text normalization, word splitting, prefix matching, and overlap
+calculations function correctly to stabilize live transcription output.
+"""
+
+from application.stt.live_session import (
+    _calculate_commit_count,
+    _common_prefix_length,
+    _find_suffix_prefix_overlap,
+    _normalize_word,
+    _split_words,
+)
+
+# _normalize_word
+
+
+def test_normalize_word_basic_lowercase():
+    """Verify that a capitalized plain word ("Hello") normalizes to lowercase "hello"."""
+    assert _normalize_word("Hello") == "hello"
+
+
+def test_normalize_word_removes_special_characters():
+    """Verify that trailing punctuation is dropped: "Hello!!!" normalizes to "hello"."""
+    assert _normalize_word("Hello!!!") == "hello"
+
+
+def test_normalize_word_keeps_alphanumeric():
+    """Verify that digits survive normalization: "Hello123" becomes "hello123"."""
+    assert _normalize_word("Hello123") == "hello123"
+
+
+def test_normalize_word_mixed_case_and_symbols():
+    """Verify mixed case, symbols, and digits together: "HeLLo@#123" becomes "hello123"."""
+    assert _normalize_word("HeLLo@#123") == "hello123"
+
+
+def test_normalize_word_only_symbols_fallback():
+    """
+    Check normalization of a word made up solely of symbols.
+
+    The expected value is the casefolded input, so "!!!" must not be
+    reduced to an empty string.
+    """
+    symbols_only = "!!!"
+    assert _normalize_word(symbols_only) == symbols_only.casefold()
+
+
+# _split_words
+
+
+def test_split_words_basic():
+    """Verify that a space-separated sentence splits into its individual words."""
+    assert _split_words("hello world") == ["hello", "world"]
+
+
+def test_split_words_extra_spaces():
+    """Verify that leading, trailing, and repeated spaces never yield empty-string elements."""
+    assert _split_words("  hello   world  ") == ["hello", "world"]
+
+
+def test_split_words_tabs_and_newlines():
+    """Verify that a tab followed by a newline separates words just like a space does."""
+    assert _split_words("hello\t\nworld") == ["hello", "world"]
+
+
+def test_split_words_empty_string():
+    """Verify that an empty string splits to an empty list rather than ['']."""
+    assert _split_words("") == []
+
+
+def test_split_words_only_spaces():
+    """Verify that a whitespace-only string splits to an empty list."""
+    assert _split_words("     ") == []
+
+
+# _common_prefix_length
+
+
+def test_common_prefix_full_match():
+    """Two equal two-word lists share a common prefix of length 2."""
+    words = ["hello", "world"]
+    same_words = list(words)
+    assert _common_prefix_length(words, same_words) == 2
+
+
+def test_common_prefix_partial_match():
+    """Only the leading run of equal words counts toward the prefix length."""
+    first = ["hello", "world"]
+    second = ["hello", "there"]
+    assert _common_prefix_length(first, second) == 1
+
+
+def test_common_prefix_no_match():
+    """Word lists that differ at the first word have a common prefix of 0."""
+    first = ["hi"]
+    second = ["bye"]
+    assert _common_prefix_length(first, second) == 0
+
+
+def test_common_prefix_case_insensitive():
+    """Words differing only in letter case still count as a matching prefix."""
+    capitalized = ["Hello"]
+    lowercase = ["hello"]
+    assert _common_prefix_length(capitalized, lowercase) == 1
+
+
+def test_common_prefix_with_special_characters():
+    """Trailing punctuation on a word does not break prefix matching."""
+    punctuated = ["hello!!!"]
+    plain = ["hello"]
+    assert _common_prefix_length(punctuated, plain) == 1
+
+
+def test_common_prefix_empty_lists():
+    """An empty list on either side yields a common prefix length of 0."""
+    for first, second in (([], ["hello"]), (["hello"], [])):
+        assert _common_prefix_length(first, second) == 0
+
+
+# _find_suffix_prefix_overlap
+
+
+def test_overlap_single_word():
+    """One shared word at the end of the left list and start of the right gives overlap 1."""
+    tail_side = ["i", "love", "python"]
+    head_side = ["python", "is", "great"]
+    assert _find_suffix_prefix_overlap(tail_side, head_side, 1) == 1
+
+
+def test_overlap_multiple_words():
+    """A two-word suffix of the left list that also starts the right list gives overlap 2."""
+    tail_side = ["i", "love", "machine", "learning"]
+    head_side = ["machine", "learning", "is", "cool"]
+    assert _find_suffix_prefix_overlap(tail_side, head_side, 2) == 2
+
+
+def test_overlap_no_match():
+    """Lists sharing no boundary words report an overlap of 0."""
+    tail_side = ["hello"]
+    head_side = ["world"]
+    assert _find_suffix_prefix_overlap(tail_side, head_side, 1) == 0
+
+
+def test_overlap_respects_minimum_threshold():
+    """A one-word overlap is reported as 0 when the threshold argument is 2."""
+    tail_side = ["a", "b"]
+    head_side = ["b"]
+    assert _find_suffix_prefix_overlap(tail_side, head_side, 2) == 0
+
+
+def test_overlap_full_match():
+    """Identical three-word lists overlap across their whole length."""
+    words = ["a", "b", "c"]
+    same_words = list(words)
+    assert _find_suffix_prefix_overlap(words, same_words, 1) == 3
+
+
+def test_overlap_case_and_symbol_insensitive():
+    """A word with trailing punctuation still overlaps its plain form."""
+    punctuated = ["hello!!!"]
+    plain = ["hello"]
+    assert _find_suffix_prefix_overlap(punctuated, plain, 1) == 1
+
+
+# _calculate_commit_count
+
+
+def test_commit_count_no_previous_words():
+    """With an empty previous hypothesis and no silence, nothing is committed."""
+    committed = _calculate_commit_count("", "hello world", is_silence=False)
+    assert committed == 0
+
+
+def test_commit_count_with_silence_no_previous():
+    """
+    Tests the commit count for a fresh session (empty previous text) under silence.
+
+    The exact count depends on the silence tail size, so only the valid range is
+    pinned: never negative and never more than the words actually available.
+    """
+    current = "hello world"
+    result = _calculate_commit_count("", current, is_silence=True)
+    assert 0 <= result <= len(current.split())
+
+
+def test_commit_count_no_common_prefix():
+    """Hypotheses that differ at the very first word commit 0 words."""
+    committed = _calculate_commit_count("hello world", "bye world", is_silence=False)
+    assert committed == 0
+
+
+def test_commit_count_partial_prefix():
+    """A ten-word prefix shared by both hypotheses commits at least one word."""
+    # Must be longer than LIVE_STT_MUTABLE_TAIL_WORDS (8 words) to commit anything when not silent
+    previous_text = "one two three four five six seven eight nine ten"
+    current_text = previous_text + " eleven"
+    committed = _calculate_commit_count(previous_text, current_text, is_silence=False)
+    assert committed >= 1
+
+
+def test_commit_count_full_prefix_but_tail_limited():
+    """
+    Check that continuous speech never commits the whole hypothesis.
+
+    The previous text is a full prefix of the current one, yet the commit
+    count must stay below the current word count (the held-back mutable tail).
+    """
+    previous_text = "one two three four five six seven eight"
+    current_text = previous_text + " nine ten"
+    word_total = len(current_text.split())
+
+    # A matching prefix alone must not be enough to commit every word.
+    assert _calculate_commit_count(previous_text, current_text, is_silence=False) < word_total
+
+
+def test_commit_count_more_aggressive_on_silence():
+    """
+    Compare commit counts for the same hypotheses with and without silence.
+
+    A pause is expected to commit at least as many words as continuous
+    speech does for identical previous and current text.
+    """
+    previous_text = "one two three four five six seven eight"
+    current_text = previous_text + " nine ten"
+
+    speaking = _calculate_commit_count(previous_text, current_text, is_silence=False)
+    paused = _calculate_commit_count(previous_text, current_text, is_silence=True)
+
+    assert paused >= speaking