
Commit ea55be2

Project Team committed:
move prewarm code to shell script
1 parent 79e09ca commit ea55be2

4 files changed

Lines changed: 161 additions & 449 deletions

File tree

app/api.py

Lines changed: 28 additions & 221 deletions
@@ -48,10 +48,6 @@
 # Initialize job manager for async batch processing
 job_manager = JobManager()
 
-# Track Ollama pre-warming state (per-process)
-_ollama_prewarmed = False
-_ollama_prewarm_lock = False
-
 
 # ============================================================================
 # Pydantic Models
@@ -979,227 +975,44 @@ async def root():
     from fastapi.responses import RedirectResponse
     return RedirectResponse(url="/ui/verify", status_code=status.HTTP_302_FOUND)
 
-# File-based lock path for cross-worker pre-warm coordination
-_PREWARM_LOCK_FILE = Path("/app/tmp/prewarm.lock")
-_PREWARM_DONE_FILE = Path("/app/tmp/prewarm.done")
-
-
-def prewarm_ollama_model(ollama_host: str, model: str) -> None:
-    """
-    Pre-warm Ollama model by loading it into GPU memory.
+# Path to the Ollama health sentinel file written by the host cron job
+# scripts/ollama-health-cron.sh runs every 5 minutes and writes this file
+# when the model is in GPU RAM. The app never touches Ollama directly from
+# the health check — all the blocking network calls are gone.
+_OLLAMA_HEALTHY_FILE = Path("/etc/OLLAMA_HEALTHY")
 
-    Uses a file-based lock so only one of the 4 uvicorn workers triggers
-    the pre-warm. Also checks /api/ps first — if the model is already in
-    GPU RAM (e.g. from a previous worker), skips entirely.
 
-    Args:
-        ollama_host: Ollama server URL
-        model: Model name to pre-warm
+def get_health_status() -> Dict[str, Any]:
     """
-    global _ollama_prewarmed, _ollama_prewarm_lock
-
-    # In-process guard — already done or in progress in this worker
-    if _ollama_prewarmed or _ollama_prewarm_lock:
-        return
-
-    _ollama_prewarm_lock = True
-
-    try:
-        import requests
-        import threading
-        import fcntl
-
-        def _prewarm():
-            global _ollama_prewarmed, _ollama_prewarm_lock
-            lock_fd = None
-            acquired = False
-            try:
-                # --- Step 1: Check if model is already in GPU via /api/ps ---
-                try:
-                    ps_resp = requests.get(f"{ollama_host}/api/ps", timeout=2)
-                    if ps_resp.status_code == 200:
-                        loaded = [m.get('name', '').split(':')[0]
-                                  for m in ps_resp.json().get('models', [])]
-                        model_base = model.split(':')[0]
-                        if model_base in loaded:
-                            logger.info(f"Model '{model}' already in GPU, skipping pre-warm")
-                            _ollama_prewarmed = True
-                            _PREWARM_DONE_FILE.touch()
-                            return
-                except Exception:
-                    pass  # If /api/ps fails, proceed to try pre-warming anyway
-
-                # --- Step 2: Check done-file (another worker already succeeded) ---
-                if _PREWARM_DONE_FILE.exists():
-                    logger.info(f"Pre-warm done-file found, skipping pre-warm in this worker")
-                    _ollama_prewarmed = True
-                    return
-
-                # --- Step 3: Acquire file lock — only one worker does the pre-warm ---
-                _PREWARM_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
-                lock_fd = open(_PREWARM_LOCK_FILE, 'w')
-                try:
-                    fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
-                    acquired = True
-                except BlockingIOError:
-                    logger.info("Another worker is already pre-warming, skipping")
-                    _ollama_prewarmed = True  # Will be done soon, treat as in progress
-                    return
-
-                # Re-check done-file now that we hold the lock
-                if _PREWARM_DONE_FILE.exists():
-                    logger.info("Pre-warm done-file found after acquiring lock, skipping")
-                    _ollama_prewarmed = True
-                    return
-
-                # --- Step 4: Fire the pre-warm request ---
-                # Use /api/generate with an empty prompt and keep_alive=-1.
-                # This loads the model weights into GPU RAM without running
-                # real inference, so Ollama stays free to serve /verify
-                # requests immediately after. Using /api/chat with a real
-                # message runs full inference (20-60s) and blocks Ollama's
-                # single inference thread, causing /api/tags health-check
-                # calls to hang and cascading 503s from CloudFront.
-                logger.info(f"Pre-warming Ollama model '{model}' into GPU memory...")
-                response = requests.post(
-                    f"{ollama_host}/api/generate",
-                    json={
-                        "model": model,
-                        "prompt": "",
-                        "keep_alive": -1,
-                        "stream": False
-                    },
-                    timeout=120
-                )
+    Check health of the Ollama backend and return status.
 
-                if response.status_code == 200:
-                    logger.info(f"Model '{model}' pre-warmed and loaded in GPU")
-                    _ollama_prewarmed = True
-                    _PREWARM_DONE_FILE.touch()
-                else:
-                    logger.warning(f"Model pre-warm returned HTTP {response.status_code}")
-
-            except Exception as e:
-                logger.warning(f"Model pre-warm failed: {e}")
-            finally:
-                if lock_fd:
-                    if acquired:
-                        fcntl.flock(lock_fd.fileno(), fcntl.LOCK_UN)
-                    lock_fd.close()
-                _ollama_prewarm_lock = False
-
-        thread = threading.Thread(target=_prewarm, daemon=True)
-        thread.start()
-
-    except Exception as e:
-        logger.error(f"Failed to start pre-warm thread: {e}")
-        _ollama_prewarm_lock = False
+    Shared function used by both /health (JSON) and /ui/health (HTML) endpoints.
 
+    Ollama health is determined solely by the presence of /etc/OLLAMA_HEALTHY,
+    which is written by the host cron job (scripts/ollama-health-cron.sh) when
+    the model is confirmed to be in GPU RAM. This avoids any synchronous network
+    calls to Ollama inside the FastAPI event loop, which was the root cause of
+    the /health endpoint blocking and cascading 503s from CloudFront.
 
-def get_health_status() -> Dict[str, Any]:
-    """
-    Check health of OCR backends and return status.
-
-    Shared function used by both /health (JSON) and /ui/health (HTML) endpoints.
-
-    Ollama is marked as "available" only when ALL three conditions are met:
-    1. Ollama server is responsive (GET /api/tags succeeds)
-    2. Model is downloaded and available (appears in /api/tags response)
-    3. Model is loaded in GPU RAM (appears in /api/ps response)
-
     Returns:
-        Dictionary with health status:
         {
-            "status": "healthy" | "degraded",
+            "status": "healthy" | "initializing",
             "backends": {
-                "tesseract": {"available": bool, "error": str|null},
                 "ollama": {"available": bool, "error": str|null, "model": str}
             },
             "capabilities": {
-                "ocr_backends": ["tesseract", "ollama"],
-                "degraded_mode": bool
+                "ocr_backends": ["ollama"]
             }
         }
     """
-    from ocr_backends import OllamaOCR
     import os
-
-    # Check Ollama availability (lazy check - no exception raised)
-    # Ollama is considered "available" if:
-    # 1. Server is responsive
-    # 2. Model is downloaded (exists in /api/tags)
-    # 3. Model is loaded in GPU RAM (appears in /api/ps)
-    ollama_host = settings.ollama_host
     ollama_model = os.getenv("OLLAMA_MODEL", "llama3.2-vision")
-    ollama_available = False
-    ollama_error = None
-
-    try:
-        # Use short timeout for health check to avoid blocking
-        import requests
-
-        # Check 1: Is server responsive and model downloaded?
-        response = requests.get(f"{ollama_host}/api/tags", timeout=2)
-
-        if response.status_code == 200:
-            models_data = response.json()
-            available_models = [m.get('name', '').split(':')[0] for m in models_data.get('models', [])]
-            model_base = ollama_model.split(':')[0]
-
-            if model_base not in available_models:
-                ollama_error = f"Model '{ollama_model}' not downloaded"
-            else:
-                # Check 2: Is model loaded in GPU RAM?
-                ps_response = requests.get(f"{ollama_host}/api/ps", timeout=2)
-
-                if ps_response.status_code == 200:
-                    loaded_models = ps_response.json().get('models', [])
-                    loaded_model_names = [m.get('name', '').split(':')[0] for m in loaded_models]
-
-                    if model_base in loaded_model_names:
-                        # Model is loaded and ready!
-                        ollama_available = True
-                        # Mark as pre-warmed since model is loaded
-                        global _ollama_prewarmed
-                        _ollama_prewarmed = True
-                        # Ensure done-file exists so other workers skip pre-warm
-                        _PREWARM_DONE_FILE.touch()
-                    else:
-                        # Model not in GPU — reset flags so pre-warm fires again
-                        _ollama_prewarmed = False
-                        if _PREWARM_DONE_FILE.exists():
-                            _PREWARM_DONE_FILE.unlink(missing_ok=True)
-                            logger.info("Model evicted from GPU, cleared pre-warm done-file")
-                        # Trigger pre-warm (file lock ensures only one worker does it)
-                        global _ollama_prewarm_lock
-                        if not _ollama_prewarm_lock:
-                            logger.info(f"Model '{ollama_model}' not in GPU, triggering pre-warm")
-                            prewarm_ollama_model(ollama_host, ollama_model)
-
-                        # Model exists but not yet loaded - pre-warming may be in progress
-                        ollama_error = f"Model loading into GPU (pre-warming in progress)"
-                else:
-                    ollama_error = f"Cannot check GPU status: HTTP {ps_response.status_code}"
-        else:
-            ollama_error = f"Ollama not available: HTTP {response.status_code}"
-
-    except requests.exceptions.Timeout:
-        ollama_error = "Unreachable (timeout after 2s)"
-    except requests.exceptions.ConnectionError:
-        ollama_error = "Unreachable (connection failed)"
-    except Exception as e:
-        ollama_error = f"Unreachable ({str(e)})"
-
-    # Determine available backends (only Ollama)
-    available_backends = []
-    if ollama_available:
-        available_backends.append("ollama")
-
-    # Determine overall status
-    overall_status = "healthy" if ollama_available else "initializing"
-
+
+    ollama_available = _OLLAMA_HEALTHY_FILE.exists()
+    ollama_error = None if ollama_available else "Model not in GPU (cron pre-warm pending)"
+
     return {
-        "status": overall_status,
+        "status": "healthy" if ollama_available else "initializing",
         "backends": {
             "ollama": {
                 "available": ollama_available,
@@ -1208,7 +1021,7 @@ def get_health_status() -> Dict[str, Any]:
             }
         },
         "capabilities": {
-            "ocr_backends": available_backends
+            "ocr_backends": ["ollama"] if ollama_available else []
         }
     }

@@ -1217,26 +1030,20 @@ def get_health_status() -> Dict[str, Any]:
 async def health_check():
     """
     Health check endpoint that reports Ollama backend availability.
-
-    Returns service health status. This endpoint always returns HTTP 200 as long
-    as the API is running, even when Ollama is initializing. This allows the
-    load balancer to route traffic while the Ollama model is loading.
-
-    Ollama is marked as "available" only when ALL three conditions are met:
-    1. Ollama server is responsive (GET /api/tags succeeds)
-    2. Model is downloaded and available (appears in /api/tags response)
-    3. Model is loaded in GPU RAM (appears in /api/ps response)
-
-    The model will be automatically pre-warmed when detected as downloaded.
-
+
+    Returns HTTP 200 as long as the API process is running, even when Ollama
+    is initializing. Ollama is reported as available only when the sentinel
+    file /etc/OLLAMA_HEALTHY is present on the host; that file is maintained
+    by the scripts/ollama-health-cron.sh cron job running every 5 minutes.
+
     Returns:
         {
             "status": "healthy" | "initializing",
             "backends": {
                 "ollama": {"available": bool, "error": str|null, "model": str}
             },
             "capabilities": {
-                "ocr_backends": ["ollama"] # Available backends
+                "ocr_backends": ["ollama"]
             }
         }
     """

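How the script is scheduled is not shown in this view either. A plausible host crontab entry for the 5-minute cadence, plus a quick end-to-end check; the install path, log file, and API port are assumptions:

# Assumed install and log paths; only the 5-minute cadence comes from the diff.
*/5 * * * * /opt/app/scripts/ollama-health-cron.sh >> /var/log/ollama-health-cron.log 2>&1

# Spot-check from the host: sentinel present and /health reporting "healthy"
# (port 8000 is an assumed uvicorn default, not stated in this commit).
test -f /etc/OLLAMA_HEALTHY && curl -s http://localhost:8000/health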