From e6201b8c25d42cd576283bcf5e8c6bdd7b3f7b64 Mon Sep 17 00:00:00 2001 From: Ben Horowitz Date: Sat, 25 Apr 2026 14:48:40 -0700 Subject: [PATCH 1/2] Fix race in device_initialized causing KeyError on rapid rejoin (#748) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a device sends a second join announcement before the first initialization task completes, the lambda done-callback unconditionally popped the dict entry by IEEE — but by that point the entry belonged to the replacement task. The replacement's own done-callback then raised KeyError, leaving ZHA's device tracking corrupted: devices appeared as "Interview complete → Configuring" in the UI but immediately went unavailable and looped indefinitely. Replace the lambda with a named callback that removes the dict entry only if it still points at this task, so a cancelled task's callback cannot clobber the replacement's entry. Add a regression test that captures asyncio loop exceptions to verify no KeyError surfaces when device_initialized is invoked twice in rapid succession for the same IEEE. --- tests/test_gateway.py | 40 ++++++++++++++++++++++++++++++++++++++ zha/application/gateway.py | 9 ++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/tests/test_gateway.py b/tests/test_gateway.py index 677955e71..6d5a85732 100644 --- a/tests/test_gateway.py +++ b/tests/test_gateway.py @@ -557,6 +557,46 @@ async def test_gateway_device_initialized( ) +async def test_gateway_device_initialized_no_keyerror_on_rapid_rejoin( + zha_gateway: Gateway, + caplog: pytest.LogCaptureFixture, +) -> None: + """Regression test for #748. + + When a device sends a second join announcement before the first + initialization task completes, the cancelled task's done-callback + must not pop the dict entry that now belongs to the replacement + task — otherwise the replacement's own done-callback raises KeyError. + """ + zigpy_dev_basic = create_mock_zigpy_device(zha_gateway, ZIGPY_DEVICE_BASIC) + + loop = asyncio.get_running_loop() + captured_exceptions: list[BaseException] = [] + original_handler = loop.get_exception_handler() + + def capture_exception(loop_, context): + if (exc := context.get("exception")) is not None: + captured_exceptions.append(exc) + if original_handler is not None: + original_handler(loop_, context) + + loop.set_exception_handler(capture_exception) + try: + zha_gateway.device_initialized(zigpy_dev_basic) + zha_gateway.device_initialized(zigpy_dev_basic) + await zha_gateway.async_block_till_done() + finally: + loop.set_exception_handler(original_handler) + + assert ( + f"Cancelling previous initialization task for device {zigpy_dev_basic.ieee}" + in caplog.text + ) + keyerrors = [e for e in captured_exceptions if isinstance(e, KeyError)] + assert not keyerrors, f"done-callback raised KeyError(s): {keyerrors}" + assert zigpy_dev_basic.ieee not in zha_gateway._device_init_tasks + + def test_gateway_raw_device_initialized( zha_gateway: Gateway, ) -> None: diff --git a/zha/application/gateway.py b/zha/application/gateway.py index 64434f499..18ee30c94 100644 --- a/zha/application/gateway.py +++ b/zha/application/gateway.py @@ -443,7 +443,14 @@ def device_initialized(self, device: zigpy.device.Device) -> None: name=f"device_initialized_task_{str(device.ieee)}:0x{device.nwk:04x}", eager_start=True, ) - init_task.add_done_callback(lambda _: self._device_init_tasks.pop(device.ieee)) + + def _remove_init_task(task: asyncio.Task, ieee: EUI64 = device.ieee) -> None: + # Only remove the entry if it still points at this task; a cancelled + # task's done-callback must not pop the replacement task's entry. + if self._device_init_tasks.get(ieee) is task: + del self._device_init_tasks[ieee] + + init_task.add_done_callback(_remove_init_task) def device_left(self, device: zigpy.device.Device) -> None: """Handle device leaving the network.""" From af5870e7aa5c1ac0904b9b591917fcdef2b9aed3 Mon Sep 17 00:00:00 2001 From: TheJulianJES Date: Wed, 29 Apr 2026 10:17:08 +0200 Subject: [PATCH 2/2] Drop redundant ieee default arg in _remove_init_task The closure already captures `device` from the enclosing function parameter (never reassigned), so the late-binding workaround isn't needed here. --- zha/application/gateway.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zha/application/gateway.py b/zha/application/gateway.py index 18ee30c94..72d026d8a 100644 --- a/zha/application/gateway.py +++ b/zha/application/gateway.py @@ -444,11 +444,11 @@ def device_initialized(self, device: zigpy.device.Device) -> None: eager_start=True, ) - def _remove_init_task(task: asyncio.Task, ieee: EUI64 = device.ieee) -> None: + def _remove_init_task(task: asyncio.Task) -> None: # Only remove the entry if it still points at this task; a cancelled # task's done-callback must not pop the replacement task's entry. - if self._device_init_tasks.get(ieee) is task: - del self._device_init_tasks[ieee] + if self._device_init_tasks.get(device.ieee) is task: + del self._device_init_tasks[device.ieee] init_task.add_done_callback(_remove_init_task)