# Provenance: reconstructed from a two-commit patch series by Divyansh
# (4bc2d4d834b7 "PATCH 1/2" and 9ba5f3c6c862 "PATCH 2/2", both dated 5 Apr 2026)
# that creates tests/stt/test_live_session_utils.py. The module below is the
# file as it stands after the second commit is applied.
"""
Unit tests for the internal utility functions of the live Speech-to-Text (STT) session logic.
These tests ensure that text normalization, word splitting, prefix matching, and overlap
calculations function correctly to stabilize live transcription output.
"""

from application.stt.live_session import (
    _calculate_commit_count,
    _common_prefix_length,
    _find_suffix_prefix_overlap,
    _normalize_word,
    _split_words,
)

# _normalize_word


def test_normalize_word_basic_lowercase():
    """Tests that standard text is simply converted to lowercase."""
    assert _normalize_word("Hello") == "hello"


def test_normalize_word_removes_special_characters():
    """Tests that punctuation and special characters are stripped from words."""
    assert _normalize_word("Hello!!!") == "hello"


def test_normalize_word_keeps_alphanumeric():
    """Tests that numbers mixed with text are preserved during normalization."""
    assert _normalize_word("Hello123") == "hello123"


def test_normalize_word_mixed_case_and_symbols():
    """Tests a complex string containing mixed cases, text, numbers, and symbols."""
    assert _normalize_word("HeLLo@#123") == "hello123"


def test_normalize_word_only_symbols_fallback():
    """
    Tests the fallback behavior when a word is entirely symbols.

    If stripping symbols leaves an empty string, it should fall back
    to simply casefolding the original string so information isn't lost.
    """
    # When everything is removed, fallback to casefold
    assert _normalize_word("!!!") == "!!!".casefold()


# _split_words


def test_split_words_basic():
    """Tests basic splitting of a typical sentence into a list of words."""
    assert _split_words("hello world") == ["hello", "world"]


def test_split_words_extra_spaces():
    """Tests that multiple contiguous spaces do not result in empty string elements."""
    # Leading, trailing and inner runs of spaces are all deliberate: a single
    # space between words would not exercise the "contiguous spaces" case.
    assert _split_words("  hello   world  ") == ["hello", "world"]


def test_split_words_tabs_and_newlines():
    """Tests that tabs and newline characters are treated as standard word separators."""
    assert _split_words("hello\t\nworld") == ["hello", "world"]


def test_split_words_empty_string():
    """Tests that an empty string returns an empty list instead of ['']."""
    assert _split_words("") == []


def test_split_words_only_spaces():
    """Tests that a string composed entirely of whitespace returns an empty list."""
    assert _split_words("   ") == []


# _common_prefix_length


def test_common_prefix_full_match():
    """Tests that identical word lists return a prefix length equal to their total length."""
    left = ["hello", "world"]
    right = ["hello", "world"]
    assert _common_prefix_length(left, right) == 2


def test_common_prefix_partial_match():
    """Tests that the function accurately identifies the length of a matching start."""
    left = ["hello", "world"]
    right = ["hello", "there"]
    assert _common_prefix_length(left, right) == 1


def test_common_prefix_no_match():
    """Tests that lists with completely different starting words return 0."""
    left = ["hi"]
    right = ["bye"]
    assert _common_prefix_length(left, right) == 0


def test_common_prefix_case_insensitive():
    """Tests that prefix matching correctly ignores case differences."""
    left = ["Hello"]
    right = ["hello"]
    assert _common_prefix_length(left, right) == 1


def test_common_prefix_with_special_characters():
    """Tests that prefix matching ignores punctuation anomalies generated by STT."""
    left = ["hello!!!"]
    right = ["hello"]
    assert _common_prefix_length(left, right) == 1


def test_common_prefix_empty_lists():
    """Tests that comparing with one or more empty lists safely returns 0."""
    assert _common_prefix_length([], ["hello"]) == 0
    assert _common_prefix_length(["hello"], []) == 0


# _find_suffix_prefix_overlap


def test_overlap_single_word():
    """
    Tests detection of a 1-word overlap between the end of the left list
    and the start of the right.
    """
    left = ["i", "love", "python"]
    right = ["python", "is", "great"]
    assert _find_suffix_prefix_overlap(left, right, 1) == 1


def test_overlap_multiple_words():
    """Tests detection of multi-word overlaps across the boundary of two lists."""
    left = ["i", "love", "machine", "learning"]
    right = ["machine", "learning", "is", "cool"]
    assert _find_suffix_prefix_overlap(left, right, 2) == 2


def test_overlap_no_match():
    """Tests that distinct lists with no boundary overlap return 0."""
    left = ["hello"]
    right = ["world"]
    assert _find_suffix_prefix_overlap(left, right, 1) == 0


def test_overlap_respects_minimum_threshold():
    """
    Tests that an overlap smaller than the specified minimum threshold
    is ignored (returns 0).
    """
    left = ["a", "b"]
    right = ["b"]
    assert _find_suffix_prefix_overlap(left, right, 2) == 0


def test_overlap_full_match():
    """Tests that identical lists identify an overlap equal to their entire length."""
    left = ["a", "b", "c"]
    right = ["a", "b", "c"]
    assert _find_suffix_prefix_overlap(left, right, 1) == 3


def test_overlap_case_and_symbol_insensitive():
    """Tests that overlap detection utilizes normalization to ignore punctuation and casing."""
    left = ["hello!!!"]
    right = ["hello"]
    assert _find_suffix_prefix_overlap(left, right, 1) == 1


# _calculate_commit_count


def test_commit_count_no_previous_words():
    """Tests that a fresh session with no prior hypothesis does not commit words immediately."""
    result = _calculate_commit_count("", "hello world", is_silence=False)
    assert result == 0


def test_commit_count_with_silence_no_previous():
    """
    Tests that a fresh session encountering silence might commit words early.

    Silence aggressively drops the mutable tail size, allowing words to commit
    even if there's no previous text to stabilize against.
    """
    result = _calculate_commit_count("", "hello world", is_silence=True)
    # silence allows committing more
    # NOTE(review): `>= 0` only pins a lower bound; tighten this to the exact
    # count once the silence tail size is confirmed against live_session.
    assert result >= 0


def test_commit_count_no_common_prefix():
    """Tests that a complete shift in hypothesis resets stabilization, committing 0 words."""
    result = _calculate_commit_count("hello world", "bye world", is_silence=False)
    assert result == 0


def test_commit_count_partial_prefix():
    """Tests that a stable prefix across hypotheses results in committed words."""
    # Must be longer than LIVE_STT_MUTABLE_TAIL_WORDS (8 words) to commit anything when not silent
    prev = "one two three four five six seven eight nine ten"
    curr = "one two three four five six seven eight nine ten eleven"
    result = _calculate_commit_count(prev, curr, is_silence=False)
    assert result >= 1


def test_commit_count_full_prefix_but_tail_limited():
    """
    Tests the mutable tail constraint during continuous speech.

    Even if the prefix perfectly matches, the engine should hold back the last
    few words (the mutable tail) from committing to allow for STT corrections.
    """
    prev = "one two three four five six seven eight"
    curr = "one two three four five six seven eight nine ten"
    result = _calculate_commit_count(prev, curr, is_silence=False)

    # Should not commit everything due to mutable tail constraint
    assert result < len(curr.split())


def test_commit_count_more_aggressive_on_silence():
    """
    Tests that silence triggers a smaller mutable tail, committing more words.

    When the user pauses (silence), the STT is less likely to correct older words,
    so the system commits a larger portion of the stable prefix.
    """
    prev = "one two three four five six seven eight"
    curr = "one two three four five six seven eight nine ten"

    normal = _calculate_commit_count(prev, curr, is_silence=False)
    silence = _calculate_commit_count(prev, curr, is_silence=True)

    assert silence >= normal