From cf87839b46f8072e18f92deb5c867cbc8e1ec9ec Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Sat, 28 Mar 2026 07:37:42 -0400 Subject: [PATCH 1/4] Replace wall-clock limit with activity-based idle timeout The wall-clock timer killed interactions after timeout_seconds * 5 of total elapsed time, regardless of whether Claude was actively working. This caused healthy long-running tasks (multi-step research, detailed planning) to be killed mid-work. Replace with an idle timeout that resets every time the process emits a line of output. Same duration (10 min at defaults), different measurement: time since last output instead of time since start. Increase bot lock acquire timeout from 660s to 3600s since interactions can now legitimately run much longer than 10 minutes. Fixes #206 --- src/kai/bot.py | 14 +++--- src/kai/claude.py | 34 ++++++++------- tests/test_bot.py | 2 +- tests/test_claude.py | 100 ++++++++++++++++++++++++++++++++----------- 4 files changed, 100 insertions(+), 50 deletions(-) diff --git a/src/kai/bot.py b/src/kai/bot.py index cc80d43..df385e3 100644 --- a/src/kai/bot.py +++ b/src/kai/bot.py @@ -150,14 +150,12 @@ async def _notify_if_queued(update: Update, chat_id: int) -> bool: ) # Safety-net timeout for acquiring the per-chat lock (seconds). If the -# wall-clock timeout in claude.py doesn't fire for some reason, this -# prevents a stuck interaction from blocking all future messages. Set -# longer than claude.py's wall-clock limit (timeout_seconds * 5) so the -# inner timeout fires first when the process is emitting frequent lines. -# Note: the wall-clock check runs between readline calls, so it can -# overshoot by up to one readline timeout (timeout_seconds * 3 = 6 min -# at defaults). Effective worst case is ~16 min, not 10. -_LOCK_ACQUIRE_TIMEOUT = 660 # 11 minutes +# idle timeout in claude.py doesn't fire for some reason, this prevents +# a stuck interaction from blocking all future messages indefinitely. +# Set generously: the idle timer in claude.py is the real safety net +# (fires after timeout_seconds * 5 of silence); this is a last-resort +# backstop for interactions that run legitimately long with active output. +_LOCK_ACQUIRE_TIMEOUT = 3600 # 1 hour async def _acquire_lock_or_kill( diff --git a/src/kai/claude.py b/src/kai/claude.py index 3a8bbbb..ed5ff58 100644 --- a/src/kai/claude.py +++ b/src/kai/claude.py @@ -575,23 +575,22 @@ async def _send_locked(self, prompt: str | list, chat_id: int | None = None) -> return accumulated_text = "" - # Wall-clock limit for the entire interaction. The per-readline - # timeout below catches dead processes (no output for 6 min), but - # a process stuck in a tool-use loop emits progress events that - # reset the readline timer indefinitely. This outer limit catches - # that case: if the total interaction exceeds N minutes regardless - # of output, kill the process. - interaction_start = time.monotonic() - max_interaction_seconds = self.timeout_seconds * 5 # 10 min at default 120s + # Idle-activity timeout for the interaction. Resets every time the + # process emits a line of output. If the process goes silent for + # this long, it is likely dead or wedged. The per-readline timeout + # below (timeout_seconds * 3) is the primary dead-process detector; + # this is a secondary safety net measured across the whole interaction. + last_activity = time.monotonic() + max_idle_seconds = self.timeout_seconds * 5 # 10 min of silence at default 120s try: while True: - # Check wall-clock limit before each readline - elapsed = time.monotonic() - interaction_start - if elapsed > max_interaction_seconds: + # Check idle timeout before each readline + idle_seconds = time.monotonic() - last_activity + if idle_seconds > max_idle_seconds: log.error( - "Interaction exceeded wall-clock limit (%.0fs > %ds)", - elapsed, - max_interaction_seconds, + "Interaction idle timeout (%.0fs with no output, limit %ds)", + idle_seconds, + max_idle_seconds, ) await self._kill() yield StreamEvent( @@ -600,7 +599,7 @@ async def _send_locked(self, prompt: str | list, chat_id: int | None = None) -> response=ClaudeResponse( success=False, text=accumulated_text, - error="Claude interaction timed out (too long)", + error="Claude interaction timed out (no output)", ), ) return @@ -619,6 +618,11 @@ async def _send_locked(self, prompt: str | list, chat_id: int | None = None) -> ) return + # Reset idle timer - process is alive and producing output. + # Do NOT reset on empty line (EOF); that means the process died. + if line: + last_activity = time.monotonic() + if not line: # Process died unexpectedly log.error("Claude process EOF") diff --git a/tests/test_bot.py b/tests/test_bot.py index 790c342..c3252de 100644 --- a/tests/test_bot.py +++ b/tests/test_bot.py @@ -2458,7 +2458,7 @@ async def test_timeout_kills_claude(self): await held_lock.acquire() try: - # Patch the timeout to something tiny so the test doesn't wait 11 min + # Patch the timeout to something tiny so the test doesn't wait 1 hour with patch("kai.bot._LOCK_ACQUIRE_TIMEOUT", 0.05): result = await _acquire_lock_or_kill(chat_id, claude, update) finally: diff --git a/tests/test_claude.py b/tests/test_claude.py index 1e7f10f..db3dc64 100644 --- a/tests/test_claude.py +++ b/tests/test_claude.py @@ -863,44 +863,57 @@ async def test_timeout(self): assert "timed out" in events[-1].response.error.lower() @pytest.mark.asyncio - async def test_wall_clock_timeout(self): - """Interaction killed when total elapsed time exceeds wall-clock limit. - - Simulates a tool-use loop: the process keeps emitting assistant - events (resetting the per-readline timeout) but the overall - interaction exceeds timeout_seconds * 5. The wall-clock guard - should fire and kill the process. + async def test_idle_timeout(self): + """Interaction killed when process goes silent longer than idle limit. + + The idle timeout (timeout_seconds * 5) is a secondary safety net + behind the per-readline timeout (timeout_seconds * 3). Because the + readline timeout is shorter, it normally fires first for a truly + silent process. This test uses a mocked time source to force the + idle check to trigger, verifying the error path works correctly. """ - claude = _make_claude(timeout_seconds=1) # wall-clock limit = 5s - - # Feed assistant events indefinitely (simulating a chatty tool loop) - call_count = 0 - - async def slow_readline(): - nonlocal call_count - call_count += 1 - # Each readline returns an assistant event after a short delay. - # After enough calls, the wall-clock limit is exceeded. - await asyncio.sleep(0.3) - return _assistant_event(f"Tool output {call_count}") + claude = _make_claude(timeout_seconds=1) # idle limit = 5s + + # Control time progression in kai.claude without affecting asyncio. + # After two successful readlines, jump time forward past the idle + # limit so the check at the top of the next loop iteration fires. + call_count = [0] + + def fake_monotonic(): + call_count[0] += 1 + # First several calls: normal progression (init, checks, resets) + if call_count[0] <= 5: + return call_count[0] * 0.1 + # After that: jump way past the idle limit + return 100.0 + + readline_count = 0 + + async def readline_with_output(): + nonlocal readline_count + readline_count += 1 + if readline_count <= 2: + return _assistant_event(f"Output {readline_count}") + # Should not reach here - idle timeout fires first + return _assistant_event("Should not reach this") proc = _make_mock_proc([]) - proc.stdout.readline = slow_readline + proc.stdout.readline = readline_with_output claude._proc = proc claude._fresh_session = False - # Patch wait_for to actually respect the real timeout (not mock it) - events = await _collect_events(claude) + with patch("kai.claude.time.monotonic", side_effect=fake_monotonic): + events = await _collect_events(claude) - # Should get the wall-clock timeout error, not a readline timeout + # Should get the idle timeout error done_event = events[-1] assert done_event.done is True assert done_event.response.success is False - assert "too long" in done_event.response.error.lower() + assert "no output" in done_event.response.error.lower() @pytest.mark.asyncio - async def test_wall_clock_normal_completion_unaffected(self): - """Normal interactions that complete quickly are not affected by wall-clock limit.""" + async def test_idle_timeout_normal_completion_unaffected(self): + """Normal interactions that complete quickly are not affected by idle timeout.""" proc = _make_mock_proc( [ _system_event(), @@ -919,6 +932,41 @@ async def test_wall_clock_normal_completion_unaffected(self): assert done_event.response.success is True assert done_event.response.error is None + @pytest.mark.asyncio + async def test_active_process_survives_past_old_wall_clock(self): + """A process that keeps producing output is not killed by the idle timer. + + This is the core behavioral change: under the old wall-clock limit, + any interaction exceeding timeout_seconds * 5 was killed regardless + of activity. The idle timer only fires after prolonged silence. + """ + claude = _make_claude(timeout_seconds=1) # idle limit = 5s, old wall-clock = 5s + + call_count = 0 + + async def slow_but_active_readline(): + nonlocal call_count + call_count += 1 + if call_count <= 20: + # Emit events every 0.4s - total time ~8s, well past the 5s limit + await asyncio.sleep(0.4) + return _assistant_event(f"Working... step {call_count}") + # Finish with a result event + return _result_event("All done") + + proc = _make_mock_proc([]) + proc.stdout.readline = slow_but_active_readline + claude._proc = proc + claude._fresh_session = False + + events = await _collect_events(claude) + + # Should complete successfully despite total time > timeout_seconds * 5 + done_event = events[-1] + assert done_event.done is True + assert done_event.response.success is True + assert done_event.response.text # Has content + @pytest.mark.asyncio async def test_eof_with_text(self): """EOF with accumulated text yields success=True, error=None.""" From 110848cddaea96853277a94ded1eb59d7ec7f288 Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Sat, 28 Mar 2026 08:02:28 -0400 Subject: [PATCH 2/4] Fix tests: mock time instead of using real sleeps Use mocked time.monotonic for both test_idle_timeout and test_active_process_survives_past_old_wall_clock. No test should ever wait for a real timeout to fire. --- tests/test_claude.py | 63 ++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/tests/test_claude.py b/tests/test_claude.py index db3dc64..3db2be6 100644 --- a/tests/test_claude.py +++ b/tests/test_claude.py @@ -875,27 +875,31 @@ async def test_idle_timeout(self): claude = _make_claude(timeout_seconds=1) # idle limit = 5s # Control time progression in kai.claude without affecting asyncio. - # After two successful readlines, jump time forward past the idle - # limit so the check at the top of the next loop iteration fires. - call_count = [0] + # The streaming loop calls time.monotonic() in a fixed pattern: + # 1. Init: last_activity = time.monotonic() + # 2. Idle check (iter 1): time.monotonic() - last_activity + # 3. Reset after readline 1: last_activity = time.monotonic() + # 4. Idle check (iter 2): time.monotonic() - last_activity + # 5. Reset after readline 2: last_activity = time.monotonic() + # 6. Idle check (iter 3): time.monotonic() - last_activity <- jump here + # Calls 1-5 return small values; call 6+ returns 100.0 so the + # idle check sees (100.0 - 0.5) > 5s and fires. + mono_call = [0] def fake_monotonic(): - call_count[0] += 1 - # First several calls: normal progression (init, checks, resets) - if call_count[0] <= 5: - return call_count[0] * 0.1 - # After that: jump way past the idle limit + mono_call[0] += 1 + if mono_call[0] <= 5: + return mono_call[0] * 0.1 return 100.0 - readline_count = 0 + readline_count = [0] async def readline_with_output(): - nonlocal readline_count - readline_count += 1 - if readline_count <= 2: - return _assistant_event(f"Output {readline_count}") + readline_count[0] += 1 + if readline_count[0] <= 2: + return _assistant_event(f"Output {readline_count[0]}") # Should not reach here - idle timeout fires first - return _assistant_event("Should not reach this") + return _assistant_event("unreachable") proc = _make_mock_proc([]) proc.stdout.readline = readline_with_output @@ -942,24 +946,33 @@ async def test_active_process_survives_past_old_wall_clock(self): """ claude = _make_claude(timeout_seconds=1) # idle limit = 5s, old wall-clock = 5s - call_count = 0 + # Simulate a process that takes longer than timeout_seconds * 5 total + # but keeps producing output (resetting the idle timer each time). + # Uses mocked time so the test runs instantly. Each readline advances + # time by 0.5s; after 20 calls that is 10s total, well past the old + # 5s wall-clock limit. The idle timer never fires because each + # readline resets it to within 0.5s. + sim_time = [0.0] - async def slow_but_active_readline(): - nonlocal call_count - call_count += 1 - if call_count <= 20: - # Emit events every 0.4s - total time ~8s, well past the 5s limit - await asyncio.sleep(0.4) - return _assistant_event(f"Working... step {call_count}") - # Finish with a result event + def advancing_monotonic(): + return sim_time[0] + + call_count = [0] + + async def active_readline(): + call_count[0] += 1 + if call_count[0] <= 20: + sim_time[0] += 0.5 # advance 0.5s per event + return _assistant_event(f"Working... step {call_count[0]}") return _result_event("All done") proc = _make_mock_proc([]) - proc.stdout.readline = slow_but_active_readline + proc.stdout.readline = active_readline claude._proc = proc claude._fresh_session = False - events = await _collect_events(claude) + with patch("kai.claude.time.monotonic", side_effect=advancing_monotonic): + events = await _collect_events(claude) # Should complete successfully despite total time > timeout_seconds * 5 done_event = events[-1] From 8bb640e2f13ca3f3864db490082f107c89d5c563 Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Sat, 28 Mar 2026 08:07:07 -0400 Subject: [PATCH 3/4] Replace unreachable return with pytest.fail for drift detection --- tests/test_claude.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_claude.py b/tests/test_claude.py index 3db2be6..38dafae 100644 --- a/tests/test_claude.py +++ b/tests/test_claude.py @@ -898,8 +898,8 @@ async def readline_with_output(): readline_count[0] += 1 if readline_count[0] <= 2: return _assistant_event(f"Output {readline_count[0]}") - # Should not reach here - idle timeout fires first - return _assistant_event("unreachable") + # If we get here, the idle timeout didn't fire as expected + pytest.fail("readline reached call 3 - idle timeout did not fire") proc = _make_mock_proc([]) proc.stdout.readline = readline_with_output From 1b02ccc4671c247cbcaeee9f18cd3d8b1a3a63e6 Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Sat, 28 Mar 2026 08:11:43 -0400 Subject: [PATCH 4/4] Fix stale wall-clock comment in test_idle_timeout_normal_completion_unaffected --- tests/test_claude.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_claude.py b/tests/test_claude.py index 38dafae..4503ec5 100644 --- a/tests/test_claude.py +++ b/tests/test_claude.py @@ -925,7 +925,7 @@ async def test_idle_timeout_normal_completion_unaffected(self): _result_event("Done"), ] ) - claude = _make_claude(timeout_seconds=1) # wall-clock = 5s + claude = _make_claude(timeout_seconds=1) # idle limit = 5s claude._proc = proc claude._fresh_session = False