From cf87839b46f8072e18f92deb5c867cbc8e1ec9ec Mon Sep 17 00:00:00 2001
From: Daniel Ellison <daniel@syrinx.net>
Date: Sat, 28 Mar 2026 07:37:42 -0400
Subject: [PATCH 1/4] Replace wall-clock limit with activity-based idle timeout

The wall-clock timer killed interactions after timeout_seconds * 5 of
total elapsed time, regardless of whether Claude was actively working.
This caused healthy long-running tasks (multi-step research, detailed
planning) to be killed mid-work.

Replace with an idle timeout that resets every time the process emits
a line of output. Same duration (10 min at defaults), different
measurement: time since last output instead of time since start.

Increase bot lock acquire timeout from 660s to 3600s since interactions
can now legitimately run much longer than 10 minutes.

Fixes #206
---
 src/kai/bot.py       |  14 +++---
 src/kai/claude.py    |  34 ++++++++-------
 tests/test_bot.py    |   2 +-
 tests/test_claude.py | 100 ++++++++++++++++++++++++++++++++-----------
 4 files changed, 100 insertions(+), 50 deletions(-)

diff --git a/src/kai/bot.py b/src/kai/bot.py
index cc80d43..df385e3 100644
--- a/src/kai/bot.py
+++ b/src/kai/bot.py
@@ -150,14 +150,12 @@ async def _notify_if_queued(update: Update, chat_id: int) -> bool:
 )
 
 # Safety-net timeout for acquiring the per-chat lock (seconds). If the
-# wall-clock timeout in claude.py doesn't fire for some reason, this
-# prevents a stuck interaction from blocking all future messages. Set
-# longer than claude.py's wall-clock limit (timeout_seconds * 5) so the
-# inner timeout fires first when the process is emitting frequent lines.
-# Note: the wall-clock check runs between readline calls, so it can
-# overshoot by up to one readline timeout (timeout_seconds * 3 = 6 min
-# at defaults). Effective worst case is ~16 min, not 10.
-_LOCK_ACQUIRE_TIMEOUT = 660  # 11 minutes
+# idle timeout in claude.py doesn't fire for some reason, this prevents
+# a stuck interaction from blocking all future messages indefinitely.
+# Set generously: the idle timer in claude.py is the real safety net
+# (fires after timeout_seconds * 5 of silence); this is a last-resort
+# backstop for interactions that run legitimately long with active output.
+_LOCK_ACQUIRE_TIMEOUT = 3600  # 1 hour
 
 
 async def _acquire_lock_or_kill(
diff --git a/src/kai/claude.py b/src/kai/claude.py
index 3a8bbbb..ed5ff58 100644
--- a/src/kai/claude.py
+++ b/src/kai/claude.py
@@ -575,23 +575,22 @@ async def _send_locked(self, prompt: str | list, chat_id: int | None = None) ->
             return
 
         accumulated_text = ""
-        # Wall-clock limit for the entire interaction. The per-readline
-        # timeout below catches dead processes (no output for 6 min), but
-        # a process stuck in a tool-use loop emits progress events that
-        # reset the readline timer indefinitely. This outer limit catches
-        # that case: if the total interaction exceeds N minutes regardless
-        # of output, kill the process.
-        interaction_start = time.monotonic()
-        max_interaction_seconds = self.timeout_seconds * 5  # 10 min at default 120s
+        # Idle-activity timeout for the interaction. Resets every time the
+        # process emits a line of output. If the process goes silent for
+        # this long, it is likely dead or wedged. The per-readline timeout
+        # below (timeout_seconds * 3) is the primary dead-process detector;
+        # this is a secondary safety net measured across the whole interaction.
+        last_activity = time.monotonic()
+        max_idle_seconds = self.timeout_seconds * 5  # 10 min of silence at default 120s
         try:
             while True:
-                # Check wall-clock limit before each readline
-                elapsed = time.monotonic() - interaction_start
-                if elapsed > max_interaction_seconds:
+                # Check idle timeout before each readline
+                idle_seconds = time.monotonic() - last_activity
+                if idle_seconds > max_idle_seconds:
                     log.error(
-                        "Interaction exceeded wall-clock limit (%.0fs > %ds)",
-                        elapsed,
-                        max_interaction_seconds,
+                        "Interaction idle timeout (%.0fs with no output, limit %ds)",
+                        idle_seconds,
+                        max_idle_seconds,
                     )
                     await self._kill()
                     yield StreamEvent(
@@ -600,7 +599,7 @@ async def _send_locked(self, prompt: str | list, chat_id: int | None = None) ->
                         response=ClaudeResponse(
                             success=False,
                             text=accumulated_text,
-                            error="Claude interaction timed out (too long)",
+                            error="Claude interaction timed out (no output)",
                         ),
                     )
                     return
@@ -619,6 +618,11 @@ async def _send_locked(self, prompt: str | list, chat_id: int | None = None) ->
                     )
                     return
 
+                # Reset idle timer - process is alive and producing output.
+                # Do NOT reset on empty line (EOF); that means the process died.
+                if line:
+                    last_activity = time.monotonic()
+
                 if not line:
                     # Process died unexpectedly
                     log.error("Claude process EOF")
diff --git a/tests/test_bot.py b/tests/test_bot.py
index 790c342..c3252de 100644
--- a/tests/test_bot.py
+++ b/tests/test_bot.py
@@ -2458,7 +2458,7 @@ async def test_timeout_kills_claude(self):
         await held_lock.acquire()
 
         try:
-            # Patch the timeout to something tiny so the test doesn't wait 11 min
+            # Patch the timeout to something tiny so the test doesn't wait 1 hour
             with patch("kai.bot._LOCK_ACQUIRE_TIMEOUT", 0.05):
                 result = await _acquire_lock_or_kill(chat_id, claude, update)
         finally:
diff --git a/tests/test_claude.py b/tests/test_claude.py
index 1e7f10f..db3dc64 100644
--- a/tests/test_claude.py
+++ b/tests/test_claude.py
@@ -863,44 +863,57 @@ async def test_timeout(self):
         assert "timed out" in events[-1].response.error.lower()
 
     @pytest.mark.asyncio
-    async def test_wall_clock_timeout(self):
-        """Interaction killed when total elapsed time exceeds wall-clock limit.
-
-        Simulates a tool-use loop: the process keeps emitting assistant
-        events (resetting the per-readline timeout) but the overall
-        interaction exceeds timeout_seconds * 5. The wall-clock guard
-        should fire and kill the process.
+    async def test_idle_timeout(self):
+        """Interaction killed when process goes silent longer than idle limit.
+
+        The idle timeout (timeout_seconds * 5) is a secondary safety net
+        behind the per-readline timeout (timeout_seconds * 3). Because the
+        readline timeout is shorter, it normally fires first for a truly
+        silent process. This test uses a mocked time source to force the
+        idle check to trigger, verifying the error path works correctly.
         """
-        claude = _make_claude(timeout_seconds=1)  # wall-clock limit = 5s
-
-        # Feed assistant events indefinitely (simulating a chatty tool loop)
-        call_count = 0
-
-        async def slow_readline():
-            nonlocal call_count
-            call_count += 1
-            # Each readline returns an assistant event after a short delay.
-            # After enough calls, the wall-clock limit is exceeded.
-            await asyncio.sleep(0.3)
-            return _assistant_event(f"Tool output {call_count}")
+        claude = _make_claude(timeout_seconds=1)  # idle limit = 5s
+
+        # Control time progression in kai.claude without affecting asyncio.
+        # After two successful readlines, jump time forward past the idle
+        # limit so the check at the top of the next loop iteration fires.
+        call_count = [0]
+
+        def fake_monotonic():
+            call_count[0] += 1
+            # First several calls: normal progression (init, checks, resets)
+            if call_count[0] <= 5:
+                return call_count[0] * 0.1
+            # After that: jump way past the idle limit
+            return 100.0
+
+        readline_count = 0
+
+        async def readline_with_output():
+            nonlocal readline_count
+            readline_count += 1
+            if readline_count <= 2:
+                return _assistant_event(f"Output {readline_count}")
+            # Should not reach here - idle timeout fires first
+            return _assistant_event("Should not reach this")
 
         proc = _make_mock_proc([])
-        proc.stdout.readline = slow_readline
+        proc.stdout.readline = readline_with_output
         claude._proc = proc
         claude._fresh_session = False
 
-        # Patch wait_for to actually respect the real timeout (not mock it)
-        events = await _collect_events(claude)
+        with patch("kai.claude.time.monotonic", side_effect=fake_monotonic):
+            events = await _collect_events(claude)
 
-        # Should get the wall-clock timeout error, not a readline timeout
+        # Should get the idle timeout error
         done_event = events[-1]
         assert done_event.done is True
         assert done_event.response.success is False
-        assert "too long" in done_event.response.error.lower()
+        assert "no output" in done_event.response.error.lower()
 
     @pytest.mark.asyncio
-    async def test_wall_clock_normal_completion_unaffected(self):
-        """Normal interactions that complete quickly are not affected by wall-clock limit."""
+    async def test_idle_timeout_normal_completion_unaffected(self):
+        """Normal interactions that complete quickly are not affected by idle timeout."""
         proc = _make_mock_proc(
             [
                 _system_event(),
@@ -919,6 +932,41 @@ async def test_wall_clock_normal_completion_unaffected(self):
         assert done_event.response.success is True
         assert done_event.response.error is None
 
+    @pytest.mark.asyncio
+    async def test_active_process_survives_past_old_wall_clock(self):
+        """A process that keeps producing output is not killed by the idle timer.
+
+        This is the core behavioral change: under the old wall-clock limit,
+        any interaction exceeding timeout_seconds * 5 was killed regardless
+        of activity. The idle timer only fires after prolonged silence.
+        """
+        claude = _make_claude(timeout_seconds=1)  # idle limit = 5s, old wall-clock = 5s
+
+        call_count = 0
+
+        async def slow_but_active_readline():
+            nonlocal call_count
+            call_count += 1
+            if call_count <= 20:
+                # Emit events every 0.4s - total time ~8s, well past the 5s limit
+                await asyncio.sleep(0.4)
+                return _assistant_event(f"Working... step {call_count}")
+            # Finish with a result event
+            return _result_event("All done")
+
+        proc = _make_mock_proc([])
+        proc.stdout.readline = slow_but_active_readline
+        claude._proc = proc
+        claude._fresh_session = False
+
+        events = await _collect_events(claude)
+
+        # Should complete successfully despite total time > timeout_seconds * 5
+        done_event = events[-1]
+        assert done_event.done is True
+        assert done_event.response.success is True
+        assert done_event.response.text  # Has content
+
     @pytest.mark.asyncio
     async def test_eof_with_text(self):
         """EOF with accumulated text yields success=True, error=None."""

From 110848cddaea96853277a94ded1eb59d7ec7f288 Mon Sep 17 00:00:00 2001
From: Daniel Ellison <daniel@syrinx.net>
Date: Sat, 28 Mar 2026 08:02:28 -0400
Subject: [PATCH 2/4] Fix tests: mock time instead of using real sleeps

Use mocked time.monotonic for both test_idle_timeout and
test_active_process_survives_past_old_wall_clock. No test should
ever wait for a real timeout to fire.
---
 tests/test_claude.py | 63 ++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 25 deletions(-)

diff --git a/tests/test_claude.py b/tests/test_claude.py
index db3dc64..3db2be6 100644
--- a/tests/test_claude.py
+++ b/tests/test_claude.py
@@ -875,27 +875,31 @@ async def test_idle_timeout(self):
         claude = _make_claude(timeout_seconds=1)  # idle limit = 5s
 
         # Control time progression in kai.claude without affecting asyncio.
-        # After two successful readlines, jump time forward past the idle
-        # limit so the check at the top of the next loop iteration fires.
-        call_count = [0]
+        # The streaming loop calls time.monotonic() in a fixed pattern:
+        #   1. Init: last_activity = time.monotonic()
+        #   2. Idle check (iter 1): time.monotonic() - last_activity
+        #   3. Reset after readline 1: last_activity = time.monotonic()
+        #   4. Idle check (iter 2): time.monotonic() - last_activity
+        #   5. Reset after readline 2: last_activity = time.monotonic()
+        #   6. Idle check (iter 3): time.monotonic() - last_activity <- jump here
+        # Calls 1-5 return small values; call 6+ returns 100.0 so the
+        # idle check sees (100.0 - 0.5) > 5s and fires.
+        mono_call = [0]
 
         def fake_monotonic():
-            call_count[0] += 1
-            # First several calls: normal progression (init, checks, resets)
-            if call_count[0] <= 5:
-                return call_count[0] * 0.1
-            # After that: jump way past the idle limit
+            mono_call[0] += 1
+            if mono_call[0] <= 5:
+                return mono_call[0] * 0.1
             return 100.0
 
-        readline_count = 0
+        readline_count = [0]
 
         async def readline_with_output():
-            nonlocal readline_count
-            readline_count += 1
-            if readline_count <= 2:
-                return _assistant_event(f"Output {readline_count}")
+            readline_count[0] += 1
+            if readline_count[0] <= 2:
+                return _assistant_event(f"Output {readline_count[0]}")
             # Should not reach here - idle timeout fires first
-            return _assistant_event("Should not reach this")
+            return _assistant_event("unreachable")
 
         proc = _make_mock_proc([])
         proc.stdout.readline = readline_with_output
@@ -942,24 +946,33 @@ async def test_active_process_survives_past_old_wall_clock(self):
         """
         claude = _make_claude(timeout_seconds=1)  # idle limit = 5s, old wall-clock = 5s
 
-        call_count = 0
+        # Simulate a process that takes longer than timeout_seconds * 5 total
+        # but keeps producing output (resetting the idle timer each time).
+        # Uses mocked time so the test runs instantly. Each readline advances
+        # time by 0.5s; after 20 calls that is 10s total, well past the old
+        # 5s wall-clock limit. The idle timer never fires because each
+        # readline resets it to within 0.5s.
+        sim_time = [0.0]
 
-        async def slow_but_active_readline():
-            nonlocal call_count
-            call_count += 1
-            if call_count <= 20:
-                # Emit events every 0.4s - total time ~8s, well past the 5s limit
-                await asyncio.sleep(0.4)
-                return _assistant_event(f"Working... step {call_count}")
-            # Finish with a result event
+        def advancing_monotonic():
+            return sim_time[0]
+
+        call_count = [0]
+
+        async def active_readline():
+            call_count[0] += 1
+            if call_count[0] <= 20:
+                sim_time[0] += 0.5  # advance 0.5s per event
+                return _assistant_event(f"Working... step {call_count[0]}")
             return _result_event("All done")
 
         proc = _make_mock_proc([])
-        proc.stdout.readline = slow_but_active_readline
+        proc.stdout.readline = active_readline
         claude._proc = proc
         claude._fresh_session = False
 
-        events = await _collect_events(claude)
+        with patch("kai.claude.time.monotonic", side_effect=advancing_monotonic):
+            events = await _collect_events(claude)
 
         # Should complete successfully despite total time > timeout_seconds * 5
         done_event = events[-1]

From 8bb640e2f13ca3f3864db490082f107c89d5c563 Mon Sep 17 00:00:00 2001
From: Daniel Ellison <daniel@syrinx.net>
Date: Sat, 28 Mar 2026 08:07:07 -0400
Subject: [PATCH 3/4] Replace unreachable return with pytest.fail for drift
 detection

---
 tests/test_claude.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_claude.py b/tests/test_claude.py
index 3db2be6..38dafae 100644
--- a/tests/test_claude.py
+++ b/tests/test_claude.py
@@ -898,8 +898,8 @@ async def readline_with_output():
             readline_count[0] += 1
             if readline_count[0] <= 2:
                 return _assistant_event(f"Output {readline_count[0]}")
-            # Should not reach here - idle timeout fires first
-            return _assistant_event("unreachable")
+            # If we get here, the idle timeout didn't fire as expected
+            pytest.fail("readline reached call 3 - idle timeout did not fire")
 
         proc = _make_mock_proc([])
         proc.stdout.readline = readline_with_output

From 1b02ccc4671c247cbcaeee9f18cd3d8b1a3a63e6 Mon Sep 17 00:00:00 2001
From: Daniel Ellison <daniel@syrinx.net>
Date: Sat, 28 Mar 2026 08:11:43 -0400
Subject: [PATCH 4/4] Fix stale wall-clock comment in
 test_idle_timeout_normal_completion_unaffected

---
 tests/test_claude.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_claude.py b/tests/test_claude.py
index 38dafae..4503ec5 100644
--- a/tests/test_claude.py
+++ b/tests/test_claude.py
@@ -925,7 +925,7 @@ async def test_idle_timeout_normal_completion_unaffected(self):
                 _result_event("Done"),
             ]
         )
-        claude = _make_claude(timeout_seconds=1)  # wall-clock = 5s
+        claude = _make_claude(timeout_seconds=1)  # idle limit = 5s
         claude._proc = proc
         claude._fresh_session = False