Skip to content

Commit 751587e

Browse files
author
Project Team
committed
Fix Ollama queue saturation with semaphore, timeout classification, and retry
- Add module-level threading.Semaphore(1) in OllamaOCR so only one inference runs at a time; concurrent requests get an immediate 503 (retry_after: 5) instead of queuing and hitting the CloudFront 60s origin timeout, which was the root cause of cascading failures - Replace bare except Exception with explicit httpx timeout/connection detection; each failure gets a typed error_type field (busy/timeout/connection/error) for accurate HTTP status codes downstream - Add one automatic retry on timeout in _do_extract() to absorb transient GPU memory stalls without user impact - Add OllamaBusyError and OllamaTimeoutError exception classes so api.py can return 503 with appropriate retry_after hints per case - Fix silent bug in ui_routes.py: pass timeout at LabelValidator() construction instead of mutating validator.ocr.timeout after the fact (post-construction mutation never reached the httpx.Client) - Handle busy/timeout errors per-image in batch jobs so one slow image does not abort the entire batch
1 parent db67db3 commit 751587e

File tree

4 files changed

+207
-18
lines changed

4 files changed

+207
-18
lines changed

app/api.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from pydantic import BaseModel, Field, ValidationError
2727

2828
from config import get_settings
29-
from label_validator import LabelValidator
29+
from label_validator import LabelValidator, OllamaBusyError, OllamaTimeoutError
3030
from auth import get_current_user
3131
from middleware import HostCheckMiddleware
3232
from job_manager import JobManager, JobStatus
@@ -529,6 +529,24 @@ def process_batch_job(
529529
f"Completed {image_path.name} - Status: {result['status']}"
530530
)
531531

532+
except (OllamaBusyError, OllamaTimeoutError) as e:
533+
logger.warning(
534+
f"[{correlation_id}] [{i}/{len(image_files)}] "
535+
f"Ollama transient error for {image_path.name}: {e}"
536+
)
537+
error_result = {
538+
"status": "ERROR",
539+
"validation_level": "STRUCTURAL_ONLY",
540+
"extracted_fields": {},
541+
"validation_results": {"structural": [], "accuracy": []},
542+
"violations": [],
543+
"warnings": [],
544+
"processing_time_seconds": 0.0,
545+
"image_path": image_path.name,
546+
"error": str(e)
547+
}
548+
job_manager.append_result(job_id, error_result)
549+
532550
except Exception as e:
533551
logger.error(
534552
f"[{correlation_id}] Failed to process {image_path.name}: {e}",
@@ -667,6 +685,28 @@ async def verify_label(
667685

668686
return VerifyResponse(**result)
669687

688+
except OllamaBusyError as e:
689+
logger.warning(f"[{correlation_id}] Ollama busy — shedding request: {e}")
690+
raise HTTPException(
691+
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
692+
detail={
693+
"message": str(e),
694+
"suggestion": "Ollama is processing another request. Retry in a few seconds.",
695+
"retry_after": 5
696+
}
697+
)
698+
699+
except OllamaTimeoutError as e:
700+
logger.warning(f"[{correlation_id}] Ollama timed out: {e}")
701+
raise HTTPException(
702+
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
703+
detail={
704+
"message": str(e),
705+
"suggestion": "Ollama inference took too long. Retry shortly.",
706+
"retry_after": 10
707+
}
708+
)
709+
670710
except RuntimeError as e:
671711
# Handle Ollama unavailability
672712
error_msg = str(e)

app/label_validator.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@
1818
from field_validators import FieldValidator
1919

2020

21+
class OllamaBusyError(RuntimeError):
22+
"""Raised when the Ollama semaphore is held and the request was shed."""
23+
24+
25+
class OllamaTimeoutError(RuntimeError):
26+
"""Raised when the Ollama request timed out (after any automatic retries)."""
27+
28+
2129
class ValidationStatus(Enum):
2230
"""Overall validation status."""
2331
COMPLIANT = "COMPLIANT"
@@ -89,9 +97,19 @@ def validate_label(self,
8997

9098
# Check if OCR was successful
9199
if not ocr_result.get('success', False):
100+
error_type = ocr_result.get('error_type', 'error')
101+
error_msg = ocr_result.get('error', 'OCR extraction failed')
102+
103+
# Raise a typed exception so callers (API layer) can return the
104+
# correct HTTP status code instead of a generic 500.
105+
if error_type == 'busy':
106+
raise OllamaBusyError(error_msg)
107+
if error_type == 'timeout':
108+
raise OllamaTimeoutError(error_msg)
109+
92110
return {
93111
"status": "ERROR",
94-
"error": ocr_result.get('error', 'OCR extraction failed'),
112+
"error": error_msg,
95113
"validation_level": "STRUCTURAL_ONLY",
96114
"extracted_fields": {},
97115
"validation_results": {

app/ocr_backends.py

Lines changed: 120 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,15 @@
55
for accurate text extraction from alcohol beverage labels.
66
"""
77

8+
import logging
9+
import threading
810
import time
911
from abc import ABC, abstractmethod
1012
from pathlib import Path
1113
from typing import Dict, Any, Optional
1214

15+
logger = logging.getLogger(__name__)
16+
1317

1418
class OCRBackend(ABC):
1519
"""Abstract base class for OCR backends."""
@@ -38,6 +42,23 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
3842
pass
3943

4044

45+
46+
# ---------------------------------------------------------------------------
47+
# Module-level semaphore: Ollama is single-threaded and processes one vision
48+
# inference at a time. Without a limit here, concurrent requests pile up
49+
# inside Ollama's queue; each waits for all previous ones to finish, so the
50+
# N-th request waits N×inference_time seconds — quickly blowing past the
51+
# CloudFront 60-second origin timeout and causing a cascade of 504 errors.
52+
#
53+
# The semaphore allows exactly one in-flight Ollama call at a time. Any
54+
# request that cannot acquire it immediately gets a fast 503 + Retry-After
55+
# rather than a guaranteed timeout. Adjust _OLLAMA_MAX_CONCURRENCY if you
56+
# deploy a multi-GPU setup where Ollama can run parallel inferences.
57+
# ---------------------------------------------------------------------------
58+
_OLLAMA_MAX_CONCURRENCY = 1
59+
_ollama_semaphore = threading.Semaphore(_OLLAMA_MAX_CONCURRENCY)
60+
61+
4162
class OllamaOCR(OCRBackend):
4263
"""OCR backend using Ollama vision models with lazy initialization."""
4364

@@ -126,10 +147,21 @@ def _ensure_available(self):
126147
)
127148

128149
def extract_text(self, image_path: str) -> Dict[str, Any]:
129-
"""Extract text using Ollama vision model."""
150+
"""Extract text using Ollama vision model.
151+
152+
Acquires the module-level semaphore before calling Ollama so that only
153+
one inference runs at a time. If the semaphore is already held (another
154+
request is in progress) this call returns immediately with a transient
155+
error that callers can surface as HTTP 503 + Retry-After rather than
156+
letting the request queue up and eventually hit the CloudFront timeout.
157+
158+
A single automatic retry is attempted on httpx timeout errors because
159+
Ollama occasionally takes a few extra seconds when the model is in the
160+
middle of a memory operation; one retry absorbs that without user impact.
161+
"""
130162
start_time = time.time()
131-
132-
# Lazy availability check - only verify when actually used
163+
164+
# --- availability check (sentinel file) ---
133165
try:
134166
self._ensure_available()
135167
except RuntimeError as e:
@@ -142,7 +174,32 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
142174
'processing_time_seconds': time.time() - start_time
143175
}
144176
}
145-
177+
178+
# --- concurrency gate ---
179+
acquired = _ollama_semaphore.acquire(blocking=False)
180+
if not acquired:
181+
logger.warning(
182+
"Ollama semaphore busy — rejecting request to prevent queue buildup"
183+
)
184+
return {
185+
'success': False,
186+
'error': "Ollama is busy processing another request. Please retry shortly.",
187+
'error_type': 'busy',
188+
'metadata': {
189+
'backend': 'ollama',
190+
'model': self.model,
191+
'processing_time_seconds': time.time() - start_time
192+
}
193+
}
194+
195+
try:
196+
return self._do_extract(image_path, start_time)
197+
finally:
198+
_ollama_semaphore.release()
199+
200+
def _do_extract(self, image_path: str, start_time: float,
201+
_retry: bool = True) -> Dict[str, Any]:
202+
"""Inner extraction with timeout classification and one automatic retry."""
146203
try:
147204
# Verify image exists
148205
img_path = Path(image_path)
@@ -151,7 +208,7 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
151208
'success': False,
152209
'error': f"Image not found: {image_path}"
153210
}
154-
211+
155212
# Prepare prompt for structured extraction
156213
prompt = """Extract ALL text from this alcohol beverage label image EXACTLY as it appears.
157214
@@ -184,10 +241,10 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
184241
},
185242
keep_alive=-1 # Keep model loaded indefinitely to avoid 60s+ reload times
186243
)
187-
244+
188245
extracted_text = response['message']['content'].strip()
189246
processing_time = time.time() - start_time
190-
247+
191248
return {
192249
'success': True,
193250
'raw_text': extracted_text,
@@ -198,11 +255,65 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
198255
'confidence': 0.85 # Ollama doesn't provide confidence, use estimate
199256
}
200257
}
201-
258+
202259
except Exception as e:
260+
# Distinguish timeout/connection errors from logic errors so callers
261+
# can return the right HTTP status and decide whether to retry.
262+
err_str = str(e)
263+
err_type_name = type(e).__name__
264+
265+
is_timeout = (
266+
"ReadTimeout" in err_type_name
267+
or "ConnectTimeout" in err_type_name
268+
or "TimeoutException" in err_type_name
269+
or "timeout" in err_str.lower()
270+
)
271+
is_connection = (
272+
"ConnectError" in err_type_name
273+
or "RemoteProtocolError" in err_type_name
274+
or "Cannot connect" in err_str
275+
)
276+
277+
if is_timeout:
278+
logger.warning(
279+
"Ollama request timed out after %.1fs (limit: %ds) — %s",
280+
time.time() - start_time, self.timeout, err_str
281+
)
282+
# One automatic retry on timeout; Ollama occasionally needs a
283+
# few extra seconds during memory operations.
284+
if _retry:
285+
logger.info("Retrying Ollama request once after timeout")
286+
return self._do_extract(image_path, start_time, _retry=False)
287+
288+
return {
289+
'success': False,
290+
'error': f"Ollama request timed out after {self.timeout}s. Please retry.",
291+
'error_type': 'timeout',
292+
'metadata': {
293+
'backend': 'ollama',
294+
'model': self.model,
295+
'processing_time_seconds': time.time() - start_time
296+
}
297+
}
298+
299+
if is_connection:
300+
logger.error("Ollama connection error: %s", err_str)
301+
return {
302+
'success': False,
303+
'error': f"Cannot connect to Ollama at {self.host}: {err_str}",
304+
'error_type': 'connection',
305+
'metadata': {
306+
'backend': 'ollama',
307+
'model': self.model,
308+
'processing_time_seconds': time.time() - start_time
309+
}
310+
}
311+
312+
logger.error("Ollama extraction error: %s", err_str, exc_info=True)
203313
return {
204314
'success': False,
205-
'error': f"Ollama extraction error: {str(e)}",
315+
'error': f"Ollama extraction error: {err_str}",
316+
'error_type': 'error',
206317
'metadata': {
207318
'backend': 'ollama',
208319
'model': self.model,

app/ui_routes.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
SESSION_COOKIE_NAME
3131
)
3232
from config import get_settings
33-
from label_validator import LabelValidator
33+
from label_validator import LabelValidator, OllamaBusyError, OllamaTimeoutError
3434
from ocr_backends import OllamaOCR
3535

3636
logger = logging.getLogger("ttb_ui")
@@ -258,12 +258,11 @@ async def ui_verify_submit(
258258
f.write(content)
259259

260260
try:
261-
# Initialize validator with Ollama
262-
validator = LabelValidator()
263-
264-
# Set timeout for Ollama
265-
if hasattr(validator.ocr, 'timeout'):
266-
validator.ocr.timeout = timeout
261+
# Initialize validator with the resolved timeout so the httpx client
262+
# inside OllamaOCR is constructed with the correct value. Mutating
263+
# validator.ocr.timeout after construction does NOT propagate to the
264+
# already-built httpx.Client, so timeout must be passed here.
265+
validator = LabelValidator(timeout=timeout)
267266

268267
# Validate label
269268
result = validator.validate_label(
@@ -286,6 +285,27 @@ async def ui_verify_submit(
286285
}
287286
)
288287

288+
except (OllamaBusyError, OllamaTimeoutError) as e:
289+
return templates.TemplateResponse(
290+
"index.html",
291+
{
292+
"request": request,
293+
"username": username,
294+
"error": f"Ollama is temporarily unavailable: {str(e)} Please retry in a few seconds.",
295+
"error_field": "image",
296+
"form_data": {
297+
"brand_name": brand_name,
298+
"product_type": product_type,
299+
"abv": abv,
300+
"net_contents": net_contents,
301+
"bottler": bottler,
302+
"ollama_timeout": ollama_timeout
303+
},
304+
"ollama_host": settings.ollama_host,
305+
"default_timeout": settings.ollama_timeout_seconds
306+
}
307+
)
308+
289309
except RuntimeError as e:
290310
error_msg = str(e)
291311
if "Cannot connect" in error_msg or "not found" in error_msg:

0 commit comments

Comments
 (0)