diff --git a/docs/resiliency.md b/docs/resiliency.md new file mode 100644 index 00000000..86936454 --- /dev/null +++ b/docs/resiliency.md @@ -0,0 +1,129 @@ +# Resiliency + +pyOverkiz retries transient failures automatically and raises typed exceptions for +everything else. This page explains what the library handles for you, so you can +decide what (if anything) to handle yourself. + +## What the library retries + +Every request is wrapped with [`backoff`](https://github.com/litl/backoff) decorators +using exponential delays with full jitter. Each retry policy is capped by both a +maximum number of attempts (`max_tries`) and a maximum wall-clock budget (`max_time`), +so retries always give up rather than looping indefinitely. + +| Failure | Retried | Budget | On retry | +| --- | --- | --- | --- | +| Connection errors — `TimeoutError`, `ClientConnectorError`, `ServerDisconnectedError` | yes | 3 tries / ~30s | reopen the connection | +| `NotAuthenticatedError` (session expired) | yes | 2 tries / ~60s | call `login()` | +| `TooManyConcurrentRequestsError` | yes | 5 tries / ~120s | — | +| `TooManyExecutionsError` | yes | 5 tries / ~300s | — | +| `ExecutionQueueFullError` | yes | 5 tries / ~120s | — | +| Event-listener errors — `InvalidEventListenerIdError`, `NoRegisteredEventListenerError` | yes (on `fetch_events`) | 2 tries / ~30s | re-register the listener | + +Everything else — `BadCredentialsError`, `TooManyRequestsError`, `MaintenanceError`, +`UnsupportedOperationError`, and so on — is **not** retried and is raised directly. + +### Backoff timing + +Delays grow exponentially — the ceilings are 1s, 2s, 4s, 8s, … — but **full jitter** +is applied on top, so each actual wait is a random value between 0 and that ceiling +(e.g. the first retry waits a random `0–1s`, the second a random `0–2s`). The +randomization is deliberate: it spreads retries out so many clients do not all retry in +lockstep. Because of the jitter, the per-attempt delays are not a fixed sequence; the +`max_time` budget is the upper bound that matters. + +### Connection errors fail fast + +Transient transport failures are retried quickly and then given up on (3 attempts +within roughly 30 seconds). This is deliberate: it is better to surface a failure to +the caller than to keep a request hanging for minutes. For a polling loop such as +`fetch_events()`, letting a call fail and retrying on the next poll tick is usually the +right behaviour. + +A dropped keep-alive socket (`ServerDisconnectedError`) is treated as a connection +blip and simply retried — it does **not** trigger a re-login. A genuine session expiry +is reported by the server as a `Not authenticated` response, which raises +`NotAuthenticatedError` and escalates to a re-login through the separate auth policy. + +!!! note "`max_time` does not interrupt a hung request" + The `max_time` budget only bounds the scheduling of retries *between* attempts; it + cannot cancel a single in-flight request. A request that hangs is bounded by the + session timeout (see below), which surfaces as a `TimeoutError` and is then retried. + +## Request timeout + +When pyOverkiz creates its own session, it applies a default per-request timeout +(`total=15s`, `sock_connect=10s`). This is kept below the connection-retry budget so a +hung socket fails fast and is retried, instead of blocking on aiohttp's 300-second +default. + +If you pass your own `ClientSession`, **you own its timeout** — pyOverkiz does not +override it. Configure one explicitly: + +```python +from aiohttp import ClientSession, ClientTimeout + +from pyoverkiz.auth.credentials import UsernamePasswordCredentials +from pyoverkiz.client import OverkizClient +from pyoverkiz.enums import Server + +session = ClientSession(timeout=ClientTimeout(total=15, sock_connect=10)) + +client = OverkizClient( + server=Server.SOMFY_EUROPE, + credentials=UsernamePasswordCredentials("you@example.com", "password"), + session=session, +) +``` + +## What you should handle + +Because the library already retries transient failures, most consumers only need to +handle the terminal cases: + +- **`BadCredentialsError`** — the credentials are wrong; retrying will not help. Prompt + for new credentials. +- **`TooManyRequestsError`** — you are being rate limited. Reduce your polling + frequency. This is intentionally not retried so you do not make the situation worse. +- **`MaintenanceError` / `ServiceUnavailableError`** — the backend is temporarily + unavailable. Back off and try again later. +- **`NotAuthenticatedError`** — only reaches you if the automatic re-login also failed. + Call `login()` and retry. + +```python +import asyncio + +from pyoverkiz.auth.credentials import UsernamePasswordCredentials +from pyoverkiz.client import OverkizClient +from pyoverkiz.enums import Server +from pyoverkiz.exceptions import ( + MaintenanceError, + NotAuthenticatedError, + TooManyRequestsError, +) + + +async def fetch_devices() -> None: + async with OverkizClient( + server=Server.SOMFY_EUROPE, + credentials=UsernamePasswordCredentials("you@example.com", "password"), + ) as client: + await client.login() + try: + print(await client.get_devices()) + except NotAuthenticatedError: + await client.login() + print(await client.get_devices()) + except TooManyRequestsError: + # Rate limited — slow down your polling. + await asyncio.sleep(60) + except MaintenanceError: + # Backend under maintenance — try again later. + pass + + +asyncio.run(fetch_devices()) +``` + +All exceptions derive from `BaseOverkizError`, so you can catch that as a catch-all for +anything the library raises. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 4034a7ca..2965db03 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -82,9 +82,11 @@ asyncio.run(fetch_devices_with_retry()) - Refresh setup with `get_setup()` and re-fetch devices. - Confirm you are using the correct gateway and server. -## Timeouts +## Timeouts and retries -For long-running operations, prefer shorter request timeouts with retries rather than a single long timeout. +The library retries transient connection and rate-limit failures automatically and +applies a default request timeout. See [Resiliency](resiliency.md) for the full retry +strategy and for configuring timeouts on a custom session. ## Logging diff --git a/mkdocs.yml b/mkdocs.yml index 02ea10b8..97405bd7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -49,6 +49,7 @@ nav: - Device control: device-control.md - Action queue: action-queue.md - Event handling: event-handling.md + - Resiliency: resiliency.md - Troubleshooting: troubleshooting.md - Migrating from v1: migration-v2.md - Reference: diff --git a/pyoverkiz/client.py b/pyoverkiz/client.py index 836761a9..93b5f973 100644 --- a/pyoverkiz/client.py +++ b/pyoverkiz/client.py @@ -17,6 +17,7 @@ ClientConnectorError, ClientResponse, ClientSession, + ClientTimeout, ServerDisconnectedError, ) from backoff.types import Details @@ -72,6 +73,8 @@ _LOGGER = logging.getLogger(__name__) +DEFAULT_TIMEOUT = ClientTimeout(total=15, sock_connect=10) + def _get_client_from_invocation(invocation: Details) -> OverkizClient: """Return the `OverkizClient` instance from a backoff invocation.""" @@ -88,12 +91,11 @@ async def refresh_listener(invocation: Details) -> None: await _get_client_from_invocation(invocation).register_event_listener() -# Reusable backoff decorators with max_tries and max_time to cap total retry duration. retry_on_auth_error = backoff.on_exception( backoff.expo, - (NotAuthenticatedError, ServerDisconnectedError), + NotAuthenticatedError, max_tries=2, - max_time=60, # safety net for hung requests + max_time=60, jitter=backoff.full_jitter, on_backoff=relogin, logger=_LOGGER, @@ -101,9 +103,9 @@ async def refresh_listener(invocation: Details) -> None: retry_on_connection_failure = backoff.on_exception( backoff.expo, - (TimeoutError, ClientConnectorError), - max_tries=5, - max_time=120, + (TimeoutError, ClientConnectorError, ServerDisconnectedError), + max_tries=3, + max_time=30, jitter=backoff.full_jitter, logger=_LOGGER, ) @@ -226,7 +228,9 @@ def __init__( self.gateways: list[Gateway] = [] self._event_listener_id: str | None = None - self.session = session or ClientSession(headers={"User-Agent": USER_AGENT}) + self.session = session or ClientSession( + headers={"User-Agent": USER_AGENT}, timeout=DEFAULT_TIMEOUT + ) self._ssl = verify_ssl if self.server_config.api_type == APIType.LOCAL and verify_ssl: @@ -476,7 +480,6 @@ async def register_event_listener(self) -> str: @retry_on_concurrent_requests @retry_on_auth_error @retry_on_listener_error - @retry_on_connection_failure async def fetch_events(self) -> list[Event]: """Fetch new events from a registered event listener. Fetched events are removed. @@ -1021,6 +1024,7 @@ async def deactivate_developer_mode(self, gateway_id: str) -> None: """ await self._delete(f"setup/gateways/{gateway_id}/developerMode") + @retry_on_connection_failure async def _get(self, path: str) -> Any: """Make a GET request to the OverKiz API.""" await self._refresh_token_if_expired() @@ -1032,6 +1036,7 @@ async def _get(self, path: str) -> Any: ) as response: return await self._parse_response(response) + @retry_on_connection_failure async def _post( self, path: str, @@ -1050,6 +1055,7 @@ async def _post( ) as response: return await self._parse_response(response) + @retry_on_connection_failure async def _put(self, path: str, payload: dict[str, Any] | None = None) -> Any: """Make a PUT request to the OverKiz API.""" await self._refresh_token_if_expired() @@ -1062,6 +1068,7 @@ async def _put(self, path: str, payload: dict[str, Any] | None = None) -> Any: ) as response: return await self._parse_response(response) + @retry_on_connection_failure async def _delete(self, path: str) -> None: """Make a DELETE request to the OverKiz API.""" await self._refresh_token_if_expired() diff --git a/tests/test_client.py b/tests/test_client.py index 4956b713..0a818346 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -58,6 +58,19 @@ async def test_get_api_type_local(self, local_client: OverkizClient): """Verify that a local-configured client reports APIType.LOCAL.""" assert local_client.server_config.api_type == APIType.LOCAL + @pytest.mark.asyncio + async def test_default_session_has_request_timeout( + self, client: OverkizClient + ) -> None: + """A client-owned session applies a default per-request timeout.""" + from pyoverkiz.client import DEFAULT_TIMEOUT + + assert client.session.timeout.total == DEFAULT_TIMEOUT.total + # Must stay below the connection-retry budget so a hung socket fails fast. + assert client.session.timeout.total is not None + assert client.session.timeout.total <= 30 + await client.session.close() + @pytest.mark.asyncio async def test_get_devices_basic(self, client: OverkizClient): """Ensure the client can fetch and parse the basic devices fixture.""" @@ -68,6 +81,108 @@ async def test_get_devices_basic(self, client: OverkizClient): devices = await client.get_devices() assert len(devices) == 23 + @pytest.mark.asyncio + async def test_backoff_retries_command_on_connection_failure( + self, client: OverkizClient + ) -> None: + """Ensure the command path retries a transient connection failure.""" + resp = MockResponse(json.dumps({"execId": "exec-1"})) + + with ( + patch("backoff._async.asyncio.sleep", new=AsyncMock()) as sleep_mock, + patch.object( + aiohttp.ClientSession, + "post", + side_effect=[TimeoutError("timed out"), resp], + ) as post_mock, + ): + exec_id = await client.execute_action_group( + actions=[Action(device_url="io://1234-5678-9012/12345678")], + ) + + assert exec_id == "exec-1" + assert post_mock.call_count == 2 + assert sleep_mock.await_count == 1 + + @pytest.mark.asyncio + async def test_backoff_gives_up_after_max_tries_on_connection_failure( + self, client: OverkizClient + ) -> None: + """Ensure a persistent connection failure is re-raised after 3 attempts.""" + with ( + patch("backoff._async.asyncio.sleep", new=AsyncMock()) as sleep_mock, + patch.object( + aiohttp.ClientSession, + "get", + side_effect=TimeoutError("timed out"), + ) as get_mock, + pytest.raises(TimeoutError), + ): + await client.get_api_version() + + assert get_mock.call_count == 3 + assert sleep_mock.await_count == 2 + + @pytest.mark.asyncio + async def test_backoff_retries_on_server_disconnected_without_relogin( + self, client: OverkizClient + ) -> None: + """A dropped keep-alive socket is retried as a connection blip, not a relogin.""" + client.login = AsyncMock() + resp = MockResponse(json.dumps({"protocolVersion": "1"})) + + with ( + patch("backoff._async.asyncio.sleep", new=AsyncMock()) as sleep_mock, + patch.object( + aiohttp.ClientSession, + "get", + side_effect=[aiohttp.ServerDisconnectedError(), resp], + ) as get_mock, + ): + result = await client.get_api_version() + + assert result == "1" + assert get_mock.call_count == 2 + assert sleep_mock.await_count == 1 + # A transport blip must not trigger a (heavy) relogin. + assert client.login.await_count == 0 + + @pytest.mark.asyncio + async def test_server_disconnected_escalates_to_relogin_when_auth_expired( + self, client: OverkizClient + ) -> None: + """If retrying a disconnect surfaces a real auth expiry, relogin still kicks in.""" + client.login = AsyncMock() + + with ( + patch("backoff._async.asyncio.sleep", new=AsyncMock()) as sleep_mock, + patch.object( + aiohttp.ClientSession, + "get", + side_effect=[ + # Transport blip, retried by the connection decorator. + aiohttp.ServerDisconnectedError(), + # Retry reaches the server, which reports the session expired. + MockResponse( + json.dumps( + { + "errorCode": "RESOURCE_ACCESS_DENIED", + "error": "Not authenticated", + } + ), + status=401, + ), + # After relogin, the auth decorator retries and succeeds. + MockResponse(json.dumps({"protocolVersion": "1"})), + ], + ), + ): + result = await client.get_api_version() + + assert result == "1" + assert client.login.await_count == 1 + assert sleep_mock.await_count >= 1 + @pytest.mark.parametrize( ("fixture_name", "event_length"), [