Commit 9d6bbb8

Author: Project Team
Create fresh ollama.Client per request to avoid stale connection hangs
A persistent httpx client reuses connections. After a streaming response completes, the underlying HTTP/1.1 connection can be left in a state where the server has closed it but the client hasn't detected that yet. The next request then hangs silently until the read timeout fires, holding the flock the entire time and starving every subsequent request.

Create a new ollama.Client (and therefore a fresh httpx connection) for each inference call. The per-request overhead is negligible compared to the 10s inference time.
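In isolation, the per-request pattern looks like this: a minimal, self-contained sketch assuming the `ollama` Python library and a reachable server. The function name, prompt handling, and 120s default timeout here are illustrative, not taken from the actual codebase.

```python
import httpx
import ollama

def extract_once(host: str, model: str, prompt: str, timeout: float = 120.0) -> str:
    # A fresh Client means a fresh httpx connection pool, so a connection
    # the server half-closed after an earlier streaming response can never
    # be picked up and silently reused here.
    client = ollama.Client(
        host=host,
        timeout=httpx.Timeout(timeout=timeout, connect=10.0),
    )
    chunks = []
    for chunk in client.chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        stream=True,
    ):
        chunks.append(chunk['message']['content'])
    return ''.join(chunks)
```

Each call pays only connection setup, which is dwarfed by the inference time itself.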
1 parent 1ebec5c

1 file changed: app/ocr_backends.py (10 additions, 5 deletions)
```diff
@@ -93,10 +93,11 @@ def __init__(self, model: str = "llama3.2-vision", host: str = "http://localhost
             import httpx
             import ollama
             self.ollama = ollama
-            self._client = ollama.Client(
-                host=host,
-                timeout=httpx.Timeout(timeout=float(timeout), connect=10.0),
-            )
+            self._httpx_timeout = httpx.Timeout(timeout=float(timeout), connect=10.0)
+            # Do NOT create a persistent _client here. A long-lived httpx client
+            # can end up with a stale/broken connection after a streaming response
+            # completes, causing subsequent requests to hang silently. We create
+            # a fresh client per request in _do_extract() instead.
         except ImportError:
             self._is_available = False
             self._availability_error = "ollama Python library not installed. Install with: pip install ollama"
@@ -271,7 +272,11 @@ def _do_extract(self, image_path: str, start_time: float) -> Dict[str, Any]:
         # streaming because the runner will still abort cleanly on pipe
         # breaks regardless of the keep_alive setting.
         chunks = []
-        for chunk in self._client.chat(
+        client = self.ollama.Client(
+            host=self.host,
+            timeout=self._httpx_timeout,
+        )
+        for chunk in client.chat(
             model=self.model,
             messages=[{
                 'role': 'user',
```
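A note on the design choice: a similar effect could in principle be achieved by keeping one long-lived client and disabling connection reuse, since the ollama Python client forwards extra keyword arguments to the underlying httpx.Client. A hedged sketch, assuming that forwarding behavior holds in the installed version:

```python
import httpx
import ollama

# Alternative sketch (not what this commit does): keep one persistent
# client but forbid keep-alive, so every request opens a fresh TCP
# connection anyway. Assumes ollama.Client passes `limits` through to
# httpx.Client, and uses the conventional default host; verify both
# against the installed ollama library version.
client = ollama.Client(
    host="http://localhost:11434",
    timeout=httpx.Timeout(timeout=120.0, connect=10.0),
    limits=httpx.Limits(max_keepalive_connections=0),
)
```

The commit's per-request client gets there without relying on that forwarding, and the extra connection setup is negligible next to the 10s inference time.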
