 # Initialize job manager for async batch processing
 job_manager = JobManager()

-# Track Ollama pre-warming state (per-process)
-_ollama_prewarmed = False
-_ollama_prewarm_lock = False
-

 # ============================================================================
 # Pydantic Models
@@ -979,227 +975,44 @@ async def root():
     from fastapi.responses import RedirectResponse
     return RedirectResponse(url="/ui/verify", status_code=status.HTTP_302_FOUND)

-# File-based lock path for cross-worker pre-warm coordination
-_PREWARM_LOCK_FILE = Path("/app/tmp/prewarm.lock")
-_PREWARM_DONE_FILE = Path("/app/tmp/prewarm.done")
-
-
-def prewarm_ollama_model(ollama_host: str, model: str) -> None:
-    """
-    Pre-warm Ollama model by loading it into GPU memory.
+# Path to the Ollama health sentinel file written by the host cron job.
+# scripts/ollama-health-cron.sh runs every 5 minutes and writes this file
+# when the model is in GPU RAM. The app never touches Ollama directly from
+# the health check — all the blocking network calls are gone.
+_OLLAMA_HEALTHY_FILE = Path("/etc/OLLAMA_HEALTHY")

-    Uses a file-based lock so only one of the 4 uvicorn workers triggers
-    the pre-warm. Also checks /api/ps first — if the model is already in
-    GPU RAM (e.g. from a previous worker), skips entirely.

-    Args:
-        ollama_host: Ollama server URL
-        model: Model name to pre-warm
+def get_health_status() -> Dict[str, Any]:
     """
-    global _ollama_prewarmed, _ollama_prewarm_lock
-
-    # In-process guard — already done or in progress in this worker
-    if _ollama_prewarmed or _ollama_prewarm_lock:
-        return
-
-    _ollama_prewarm_lock = True
-
-    try:
-        import requests
-        import threading
-        import fcntl
-
-        def _prewarm():
-            global _ollama_prewarmed, _ollama_prewarm_lock
-            lock_fd = None
-            acquired = False
-            try:
-                # --- Step 1: Check if model is already in GPU via /api/ps ---
-                try:
-                    ps_resp = requests.get(f"{ollama_host}/api/ps", timeout=2)
-                    if ps_resp.status_code == 200:
-                        loaded = [m.get('name', '').split(':')[0]
-                                  for m in ps_resp.json().get('models', [])]
-                        model_base = model.split(':')[0]
-                        if model_base in loaded:
-                            logger.info(f"Model '{model}' already in GPU, skipping pre-warm")
-                            _ollama_prewarmed = True
-                            _PREWARM_DONE_FILE.touch()
-                            return
-                except Exception:
-                    pass  # If /api/ps fails, proceed to try pre-warming anyway
-
-                # --- Step 2: Check done-file (another worker already succeeded) ---
-                if _PREWARM_DONE_FILE.exists():
-                    logger.info(f"Pre-warm done-file found, skipping pre-warm in this worker")
-                    _ollama_prewarmed = True
-                    return
-
-                # --- Step 3: Acquire file lock — only one worker does the pre-warm ---
-                _PREWARM_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
-                lock_fd = open(_PREWARM_LOCK_FILE, 'w')
-                try:
-                    fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
-                    acquired = True
-                except BlockingIOError:
-                    logger.info("Another worker is already pre-warming, skipping")
-                    _ollama_prewarmed = True  # Will be done soon, treat as in progress
-                    return
-
-                # Re-check done-file now that we hold the lock
-                if _PREWARM_DONE_FILE.exists():
-                    logger.info("Pre-warm done-file found after acquiring lock, skipping")
-                    _ollama_prewarmed = True
-                    return
-
-                # --- Step 4: Fire the pre-warm request ---
-                # Use /api/generate with an empty prompt and keep_alive=-1.
-                # This loads the model weights into GPU RAM without running
-                # real inference, so Ollama stays free to serve /verify
-                # requests immediately after. Using /api/chat with a real
-                # message runs full inference (20-60s) and blocks Ollama's
-                # single inference thread, causing /api/tags health-check
-                # calls to hang and cascading 503s from CloudFront.
-                logger.info(f"Pre-warming Ollama model '{model}' into GPU memory...")
-                response = requests.post(
-                    f"{ollama_host}/api/generate",
-                    json={
-                        "model": model,
-                        "prompt": "",
-                        "keep_alive": -1,
-                        "stream": False
-                    },
-                    timeout=120
-                )
+    Check health of the Ollama backend and return status.

-                if response.status_code == 200:
-                    logger.info(f"Model '{model}' pre-warmed and loaded in GPU")
-                    _ollama_prewarmed = True
-                    _PREWARM_DONE_FILE.touch()
-                else:
-                    logger.warning(f"Model pre-warm returned HTTP {response.status_code}")
-
-            except Exception as e:
-                logger.warning(f"Model pre-warm failed: {e}")
-            finally:
-                if lock_fd:
-                    if acquired:
-                        fcntl.flock(lock_fd.fileno(), fcntl.LOCK_UN)
-                    lock_fd.close()
-                _ollama_prewarm_lock = False
-
-        thread = threading.Thread(target=_prewarm, daemon=True)
-        thread.start()
-
-    except Exception as e:
-        logger.error(f"Failed to start pre-warm thread: {e}")
-        _ollama_prewarm_lock = False
+    Shared function used by both /health (JSON) and /ui/health (HTML) endpoints.

+    Ollama health is determined solely by the presence of /etc/OLLAMA_HEALTHY,
+    which is written by the host cron job (scripts/ollama-health-cron.sh) when
+    the model is confirmed to be in GPU RAM. This avoids any synchronous network
+    calls to Ollama inside the FastAPI event loop, which was the root cause of
+    the /health endpoint blocking and cascading 503s from CloudFront.

-def get_health_status() -> Dict[str, Any]:
-    """
-    Check health of OCR backends and return status.
-
-    Shared function used by both /health (JSON) and /ui/health (HTML) endpoints.
-
-    Ollama is marked as "available" only when ALL three conditions are met:
-    1. Ollama server is responsive (GET /api/tags succeeds)
-    2. Model is downloaded and available (appears in /api/tags response)
-    3. Model is loaded in GPU RAM (appears in /api/ps response)
-
     Returns:
-        Dictionary with health status:
         {
-            "status": "healthy" | "degraded",
+            "status": "healthy" | "initializing",
             "backends": {
-                "tesseract": {"available": bool, "error": str|null},
                 "ollama": {"available": bool, "error": str|null, "model": str}
             },
             "capabilities": {
-                "ocr_backends": ["tesseract", "ollama"],
-                "degraded_mode": bool
+                "ocr_backends": ["ollama"]
             }
         }
     """
-    from ocr_backends import OllamaOCR
     import os
-
-    # Check Ollama availability (lazy check - no exception raised)
-    # Ollama is considered "available" if:
-    # 1. Server is responsive
-    # 2. Model is downloaded (exists in /api/tags)
-    # 3. Model is loaded in GPU RAM (appears in /api/ps)
-    ollama_host = settings.ollama_host
     ollama_model = os.getenv("OLLAMA_MODEL", "llama3.2-vision")
-    ollama_available = False
-    ollama_error = None
-
-    try:
-        # Use short timeout for health check to avoid blocking
-        import requests
-
-        # Check 1: Is server responsive and model downloaded?
-        response = requests.get(f"{ollama_host}/api/tags", timeout=2)
-
-        if response.status_code == 200:
-            models_data = response.json()
-            available_models = [m.get('name', '').split(':')[0] for m in models_data.get('models', [])]
-            model_base = ollama_model.split(':')[0]
-
-            if model_base not in available_models:
-                ollama_error = f"Model '{ollama_model}' not downloaded"
-            else:
-                # Check 2: Is model loaded in GPU RAM?
-                ps_response = requests.get(f"{ollama_host}/api/ps", timeout=2)
-
-                if ps_response.status_code == 200:
-                    loaded_models = ps_response.json().get('models', [])
-                    loaded_model_names = [m.get('name', '').split(':')[0] for m in loaded_models]
-
-                    if model_base in loaded_model_names:
-                        # Model is loaded and ready!
-                        ollama_available = True
-                        # Mark as pre-warmed since model is loaded
-                        global _ollama_prewarmed
-                        _ollama_prewarmed = True
-                        # Ensure done-file exists so other workers skip pre-warm
-                        _PREWARM_DONE_FILE.touch()
-                    else:
-                        # Model not in GPU — reset flags so pre-warm fires again
-                        _ollama_prewarmed = False
-                        if _PREWARM_DONE_FILE.exists():
-                            _PREWARM_DONE_FILE.unlink(missing_ok=True)
-                            logger.info("Model evicted from GPU, cleared pre-warm done-file")
-                        # Trigger pre-warm (file lock ensures only one worker does it)
-                        global _ollama_prewarm_lock
-                        if not _ollama_prewarm_lock:
-                            logger.info(f"Model '{ollama_model}' not in GPU, triggering pre-warm")
-                            prewarm_ollama_model(ollama_host, ollama_model)
-
-                        # Model exists but not yet loaded - pre-warming may be in progress
-                        ollama_error = f"Model loading into GPU (pre-warming in progress)"
-                else:
-                    ollama_error = f"Cannot check GPU status: HTTP {ps_response.status_code}"
-        else:
-            ollama_error = f"Ollama not available: HTTP {response.status_code}"
-
-    except requests.exceptions.Timeout:
-        ollama_error = "Unreachable (timeout after 2s)"
-    except requests.exceptions.ConnectionError:
-        ollama_error = "Unreachable (connection failed)"
-    except Exception as e:
-        ollama_error = f"Unreachable ({str(e)})"
-
-    # Determine available backends (only Ollama)
-    available_backends = []
-    if ollama_available:
-        available_backends.append("ollama")
-
-    # Determine overall status
-    overall_status = "healthy" if ollama_available else "initializing"
-
+
+    ollama_available = _OLLAMA_HEALTHY_FILE.exists()
+    ollama_error = None if ollama_available else "Model not in GPU (cron pre-warm pending)"
+
     return {
-        "status": overall_status,
+        "status": "healthy" if ollama_available else "initializing",
         "backends": {
             "ollama": {
                 "available": ollama_available,
@@ -1208,7 +1021,7 @@ def get_health_status() -> Dict[str, Any]:
             }
         },
         "capabilities": {
-            "ocr_backends": available_backends
+            "ocr_backends": ["ollama"] if ollama_available else []
         }
     }

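For reference, the sentinel that get_health_status() reads is maintained by scripts/ollama-health-cron.sh, a shell script that does not appear in this commit. A rough Python sketch of what that host-side probe has to do, reusing the /api/ps check from the removed pre-warm code, might look like the following; the OLLAMA_HOST default and the refresh_sentinel name are illustrative assumptions, not the script's actual contents:

# Illustrative sketch only; the real probe is scripts/ollama-health-cron.sh.
import os
from pathlib import Path

import requests

SENTINEL = Path("/etc/OLLAMA_HEALTHY")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")  # assumed default
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.2-vision")

def refresh_sentinel() -> None:
    """Touch the sentinel when the model is in GPU RAM, remove it otherwise."""
    try:
        resp = requests.get(f"{OLLAMA_HOST}/api/ps", timeout=5)
        resp.raise_for_status()
        loaded = [m.get("name", "").split(":")[0] for m in resp.json().get("models", [])]
        healthy = OLLAMA_MODEL.split(":")[0] in loaded
    except requests.RequestException:
        healthy = False
    if healthy:
        SENTINEL.touch()
    else:
        SENTINEL.unlink(missing_ok=True)

if __name__ == "__main__":
    refresh_sentinel()

Because this runs from cron rather than inside the uvicorn workers, a slow or hung Ollama can no longer stall the event loop that serves /health.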
@@ -1217,26 +1030,20 @@ def get_health_status() -> Dict[str, Any]:
 async def health_check():
     """
     Health check endpoint that reports Ollama backend availability.
-
-    Returns service health status. This endpoint always returns HTTP 200 as long
-    as the API is running, even when Ollama is initializing. This allows the
-    load balancer to route traffic while the Ollama model is loading.
-
-    Ollama is marked as "available" only when ALL three conditions are met:
-    1. Ollama server is responsive (GET /api/tags succeeds)
-    2. Model is downloaded and available (appears in /api/tags response)
-    3. Model is loaded in GPU RAM (appears in /api/ps response)
-
-    The model will be automatically pre-warmed when detected as downloaded.
-
+
+    Returns HTTP 200 as long as the API process is running, even when Ollama
+    is initializing. Ollama is reported as available only when the sentinel
+    file /etc/OLLAMA_HEALTHY is present on the host; that file is maintained
+    by the scripts/ollama-health-cron.sh cron job running every 5 minutes.
+
     Returns:
         {
             "status": "healthy" | "initializing",
             "backends": {
                 "ollama": {"available": bool, "error": str|null, "model": str}
             },
             "capabilities": {
-                "ocr_backends": ["ollama"]  # Available backends
+                "ocr_backends": ["ollama"]
             }
         }
     """
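Since the rewritten get_health_status() touches no network at all, it can be exercised with a plain unit test by pointing the sentinel constant at a temporary path. A sketch, assuming pytest and that the function lives in a module importable as main (the module name is an assumption):

# Hypothetical test sketch; "main" stands in for the module that defines get_health_status().
import main

def test_health_initializing_without_sentinel(tmp_path, monkeypatch):
    monkeypatch.setattr(main, "_OLLAMA_HEALTHY_FILE", tmp_path / "OLLAMA_HEALTHY")
    status = main.get_health_status()
    assert status["status"] == "initializing"
    assert status["capabilities"]["ocr_backends"] == []

def test_health_healthy_with_sentinel(tmp_path, monkeypatch):
    sentinel = tmp_path / "OLLAMA_HEALTHY"
    sentinel.touch()
    monkeypatch.setattr(main, "_OLLAMA_HEALTHY_FILE", sentinel)
    status = main.get_health_status()
    assert status["status"] == "healthy"
    assert status["backends"]["ollama"]["available"] is True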