Skip to content

Commit 751587e

Browse files
author
Project Team
committed
Fix Ollama queue saturation with semaphore, timeout classification, and retry
- Add module-level threading.Semaphore(1) in OllamaOCR so only one inference runs at a time; concurrent requests get an immediate 503 (retry_after: 5) instead of queuing and hitting the CloudFront 60s origin timeout, which was the root cause of cascading failures - Replace bare except Exception with explicit httpx timeout/connection detection; each failure gets a typed error_type field (busy/timeout/connection/error) for accurate HTTP status codes downstream - Add one automatic retry on timeout in _do_extract() to absorb transient GPU memory stalls without user impact - Add OllamaBusyError and OllamaTimeoutError exception classes so api.py can return 503 with appropriate retry_after hints per case - Fix silent bug in ui_routes.py: pass timeout at LabelValidator() construction instead of mutating validator.ocr.timeout after the fact (post-construction mutation never reached the httpx.Client) - Handle busy/timeout errors per-image in batch jobs so one slow image does not abort the entire batch
1 parent db67db3 commit 751587e

File tree

4 files changed

+207
-18
lines changed

4 files changed

+207
-18
lines changed

app/api.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from pydantic import BaseModel, Field, ValidationError
2727

2828
from config import get_settings
29-
from label_validator import LabelValidator
29+
from label_validator import LabelValidator, OllamaBusyError, OllamaTimeoutError
3030
from auth import get_current_user
3131
from middleware import HostCheckMiddleware
3232
from job_manager import JobManager, JobStatus
@@ -529,6 +529,24 @@ def process_batch_job(
529529
f"Completed {image_path.name} - Status: {result['status']}"
530530
)
531531

532+
except (OllamaBusyError, OllamaTimeoutError) as e:
533+
logger.warning(
534+
f"[{correlation_id}] [{i}/{len(image_files)}] "
535+
f"Ollama transient error for {image_path.name}: {e}"
536+
)
537+
error_result = {
538+
"status": "ERROR",
539+
"validation_level": "STRUCTURAL_ONLY",
540+
"extracted_fields": {},
541+
"validation_results": {"structural": [], "accuracy": []},
542+
"violations": [],
543+
"warnings": [],
544+
"processing_time_seconds": 0.0,
545+
"image_path": image_path.name,
546+
"error": str(e)
547+
}
548+
job_manager.append_result(job_id, error_result)
549+
532550
except Exception as e:
533551
logger.error(
534552
f"[{correlation_id}] Failed to process {image_path.name}: {e}",
@@ -667,6 +685,28 @@ async def verify_label(
667685

668686
return VerifyResponse(**result)
669687

688+
except OllamaBusyError as e:
689+
logger.warning(f"[{correlation_id}] Ollama busy — shedding request: {e}")
690+
raise HTTPException(
691+
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
692+
detail={
693+
"message": str(e),
694+
"suggestion": "Ollama is processing another request. Retry in a few seconds.",
695+
"retry_after": 5
696+
}
697+
)
698+
699+
except OllamaTimeoutError as e:
700+
logger.warning(f"[{correlation_id}] Ollama timed out: {e}")
701+
raise HTTPException(
702+
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
703+
detail={
704+
"message": str(e),
705+
"suggestion": "Ollama inference took too long. Retry shortly.",
706+
"retry_after": 10
707+
}
708+
)
709+
670710
except RuntimeError as e:
671711
# Handle Ollama unavailability
672712
error_msg = str(e)

app/label_validator.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@
1818
from field_validators import FieldValidator
1919

2020

21+
class OllamaBusyError(RuntimeError):
22+
"""Raised when the Ollama semaphore is held and the request was shed."""
23+
24+
25+
class OllamaTimeoutError(RuntimeError):
26+
"""Raised when the Ollama request timed out (after any automatic retries)."""
27+
28+
2129
class ValidationStatus(Enum):
2230
"""Overall validation status."""
2331
COMPLIANT = "COMPLIANT"
@@ -89,9 +97,19 @@ def validate_label(self,
8997

9098
# Check if OCR was successful
9199
if not ocr_result.get('success', False):
100+
error_type = ocr_result.get('error_type', 'error')
101+
error_msg = ocr_result.get('error', 'OCR extraction failed')
102+
103+
# Raise a typed exception so callers (API layer) can return the
104+
# correct HTTP status code instead of a generic 500.
105+
if error_type == 'busy':
106+
raise OllamaBusyError(error_msg)
107+
if error_type == 'timeout':
108+
raise OllamaTimeoutError(error_msg)
109+
92110
return {
93111
"status": "ERROR",
94-
"error": ocr_result.get('error', 'OCR extraction failed'),
112+
"error": error_msg,
95113
"validation_level": "STRUCTURAL_ONLY",
96114
"extracted_fields": {},
97115
"validation_results": {

app/ocr_backends.py

Lines changed: 120 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,15 @@
55
for accurate text extraction from alcohol beverage labels.
66
"""
77

8+
import logging
9+
import threading
810
import time
911
from abc import ABC, abstractmethod
1012
from pathlib import Path
1113
from typing import Dict, Any, Optional
1214

15+
logger = logging.getLogger(__name__)
16+
1317

1418
class OCRBackend(ABC):
1519
"""Abstract base class for OCR backends."""
@@ -38,6 +42,23 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
3842
pass
3943

4044

45+
46+
# ---------------------------------------------------------------------------
47+
# Module-level semaphore: Ollama is single-threaded and processes one vision
48+
# inference at a time. Without a limit here, concurrent requests pile up
49+
# inside Ollama's queue; each waits for all previous ones to finish, so the
50+
# N-th request waits N×inference_time seconds — quickly blowing past the
51+
# CloudFront 60-second origin timeout and causing a cascade of 504 errors.
52+
#
53+
# The semaphore allows exactly one in-flight Ollama call at a time. Any
54+
# request that cannot acquire it immediately gets a fast 503 + Retry-After
55+
# rather than a guaranteed timeout. Adjust _OLLAMA_MAX_CONCURRENCY if you
56+
# deploy a multi-GPU setup where Ollama can run parallel inferences.
57+
# ---------------------------------------------------------------------------
58+
_OLLAMA_MAX_CONCURRENCY = 1
59+
_ollama_semaphore = threading.Semaphore(_OLLAMA_MAX_CONCURRENCY)
60+
61+
4162
class OllamaOCR(OCRBackend):
4263
"""OCR backend using Ollama vision models with lazy initialization."""
4364

@@ -126,10 +147,21 @@ def _ensure_available(self):
126147
)
127148

128149
def extract_text(self, image_path: str) -> Dict[str, Any]:
129-
"""Extract text using Ollama vision model."""
150+
"""Extract text using Ollama vision model.
151+
152+
Acquires the module-level semaphore before calling Ollama so that only
153+
one inference runs at a time. If the semaphore is already held (another
154+
request is in progress) this call returns immediately with a transient
155+
error that callers can surface as HTTP 503 + Retry-After rather than
156+
letting the request queue up and eventually hit the CloudFront timeout.
157+
158+
A single automatic retry is attempted on httpx timeout errors because
159+
Ollama occasionally takes a few extra seconds when the model is in the
160+
middle of a memory operation; one retry absorbs that without user impact.
161+
"""
130162
start_time = time.time()
131-
132-
# Lazy availability check - only verify when actually used
163+
164+
# --- availability check (sentinel file) ---
133165
try:
134166
self._ensure_available()
135167
except RuntimeError as e:
@@ -142,7 +174,32 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
142174
'processing_time_seconds': time.time() - start_time
143175
}
144176
}
145-
177+
178+
# --- concurrency gate ---
179+
acquired = _ollama_semaphore.acquire(blocking=False)
180+
if not acquired:
181+
logger.warning(
182+
"Ollama semaphore busy — rejecting request to prevent queue buildup"
183+
)
184+
return {
185+
'success': False,
186+
'error': "Ollama is busy processing another request. Please retry shortly.",
187+
'error_type': 'busy',
188+
'metadata': {
189+
'backend': 'ollama',
190+
'model': self.model,
191+
'processing_time_seconds': time.time() - start_time
192+
}
193+
}
194+
195+
try:
196+
return self._do_extract(image_path, start_time)
197+
finally:
198+
_ollama_semaphore.release()
199+
200+
def _do_extract(self, image_path: str, start_time: float,
201+
_retry: bool = True) -> Dict[str, Any]:
202+
"""Inner extraction with timeout classification and one automatic retry."""
146203
try:
147204
# Verify image exists
148205
img_path = Path(image_path)
@@ -151,7 +208,7 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
151208
'success': False,
152209
'error': f"Image not found: {image_path}"
153210
}
154-
211+
155212
# Prepare prompt for structured extraction
156213
prompt = """Extract ALL text from this alcohol beverage label image EXACTLY as it appears.
157214
@@ -184,10 +241,10 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
184241
},
185242
keep_alive=-1 # Keep model loaded indefinitely to avoid 60s+ reload times
186243
)
187-
244+
188245
extracted_text = response['message']['content'].strip()
189246
processing_time = time.time() - start_time
190-
247+
191248
return {
192249
'success': True,
193250
'raw_text': extracted_text,
@@ -198,11 +255,65 @@ def extract_text(self, image_path: str) -> Dict[str, Any]:
198255
'confidence': 0.85 # Ollama doesn't provide confidence, use estimate
199256
}
200257
}
201-
258+
202259
except Exception as e:
260+
# Distinguish timeout/connection errors from logic errors so callers
261+
# can return the right HTTP status and decide whether to retry.
262+
err_str = str(e)
263+
err_type_name = type(e).__name__
264+
265+
is_timeout = (
266+
"ReadTimeout" in err_type_name
267+
or "ConnectTimeout" in err_type_name
268+
or "TimeoutException" in err_type_name
269+
or "timeout" in err_str.lower()
270+
)
271+
is_connection = (
272+
"ConnectError" in err_type_name
273+
or "RemoteProtocolError" in err_type_name
274+
or "Cannot connect" in err_str
275+
)
276+
277+
if is_timeout:
278+
logger.warning(
279+
"Ollama request timed out after %.1fs (limit: %ds) — %s",
280+
time.time() - start_time, self.timeout, err_str
281+
)
282+
# One automatic retry on timeout; Ollama occasionally needs a
283+
# few extra seconds during memory operations.
284+
if _retry:
285+
logger.info("Retrying Ollama request once after timeout")
286+
return self._do_extract(image_path, start_time, _retry=False)
287+
288+
return {
289+
'success': False,
290+
'error': f"Ollama request timed out after {self.timeout}s. Please retry.",
291+
'error_type': 'timeout',
292+
'metadata': {
293+
'backend': 'ollama',
294+
'model': self.model,
295+
'processing_time_seconds': time.time() - start_time
296+
}
297+
}
298+
299+
if is_connection:
300+
logger.error("Ollama connection error: %s", err_str)
301+
return {
302+
'success': False,
303+
'error': f"Cannot connect to Ollama at {self.host}: {err_str}",
304+
'error_type': 'connection',
305+
'metadata': {
306+
'backend': 'ollama',
307+
'model': self.model,
308+
'processing_time_seconds': time.time() - start_time
309+
}
310+
}
311+
312+
logger.error("Ollama extraction error: %s", err_str, exc_info=True)
203313
return {
204314
'success': False,
205-
'error': f"Ollama extraction error: {str(e)}",
315+
'error': f"Ollama extraction error: {err_str}",
316+
'error_type': 'error',
206317
'metadata': {
207318
'backend': 'ollama',
208319
'model': self.model,

app/ui_routes.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
SESSION_COOKIE_NAME
3131
)
3232
from config import get_settings
33-
from label_validator import LabelValidator
33+
from label_validator import LabelValidator, OllamaBusyError, OllamaTimeoutError
3434
from ocr_backends import OllamaOCR
3535

3636
logger = logging.getLogger("ttb_ui")
@@ -258,12 +258,11 @@ async def ui_verify_submit(
258258
f.write(content)
259259

260260
try:
261-
# Initialize validator with Ollama
262-
validator = LabelValidator()
263-
264-
# Set timeout for Ollama
265-
if hasattr(validator.ocr, 'timeout'):
266-
validator.ocr.timeout = timeout
261+
# Initialize validator with the resolved timeout so the httpx client
262+
# inside OllamaOCR is constructed with the correct value. Mutating
263+
# validator.ocr.timeout after construction does NOT propagate to the
264+
# already-built httpx.Client, so timeout must be passed here.
265+
validator = LabelValidator(timeout=timeout)
267266

268267
# Validate label
269268
result = validator.validate_label(
@@ -286,6 +285,27 @@ async def ui_verify_submit(
286285
}
287286
)
288287

288+
except (OllamaBusyError, OllamaTimeoutError) as e:
289+
return templates.TemplateResponse(
290+
"index.html",
291+
{
292+
"request": request,
293+
"username": username,
294+
"error": f"Ollama is temporarily unavailable: {str(e)} Please retry in a few seconds.",
295+
"error_field": "image",
296+
"form_data": {
297+
"brand_name": brand_name,
298+
"product_type": product_type,
299+
"abv": abv,
300+
"net_contents": net_contents,
301+
"bottler": bottler,
302+
"ollama_timeout": ollama_timeout
303+
},
304+
"ollama_host": settings.ollama_host,
305+
"default_timeout": settings.ollama_timeout_seconds
306+
}
307+
)
308+
289309
except RuntimeError as e:
290310
error_msg = str(e)
291311
if "Cannot connect" in error_msg or "not found" in error_msg:

0 commit comments

Comments
 (0)