Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ class Settings(BaseSettings):
instagram_navigation_timeout: int = 12
instagram_og_wait_timeout_ms: int = 3000
instagram_block_resource_types: str = "image,font,media"
instagram_image_fetch_timeout_ms: int = 8000
instagram_image_fetch_max_images: int = 10
instagram_image_fetch_max_next_clicks: int = 10
instagram_rate_limit_cooldown_seconds: int = 1800
instagram_cooldown_key: str = "processing:cooldown:instagram"

Expand Down Expand Up @@ -134,6 +137,12 @@ class Settings(BaseSettings):
hf_extraction_max_attempts: int = 3
hf_extraction_retry_base_seconds: float = 0.0
hf_extraction_retry_backoff_multiplier: float = 2.0
# --- Settings read by HFOCRClient (app/infra/llm/client.py) ---
# OCR endpoint URL; when empty, HFOCRClient falls back to hf_extraction_endpoint_url.
hf_ocr_endpoint_url: str = ""
# Bearer token for OCR requests; when empty, HFOCRClient falls back to
# hf_extraction_api_token.
hf_ocr_api_token: str = ""
# Sent as the "model" field of the OCR request payload.
hf_ocr_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
# Passed to httpx.Timeout for each OCR request.
hf_ocr_timeout_seconds: int = 30
# Sent as the "max_tokens" field of the OCR request payload.
hf_ocr_max_new_tokens: int = 1024
# Upper bound on OCR request attempts; HFOCRClient clamps it to at least 1.
hf_ocr_max_attempts: int = 2
extraction_failure_retry_enabled: bool = True

@field_validator("processing_schema")
Expand Down
4 changes: 4 additions & 0 deletions app/infra/llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from app.infra.llm.client import (
HFExtractionClient,
HFExtractionError,
HFOCRClient,
HFOCRError,
extract_json_object,
extract_text_from_hf_payload,
)

# Public names of app.infra.llm, all re-exported from app.infra.llm.client.
__all__ = [
    "HFExtractionClient",
    "HFExtractionError",
    "HFOCRClient",
    "HFOCRError",
    "extract_json_object",
    "extract_text_from_hf_payload",
]
115 changes: 115 additions & 0 deletions app/infra/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ class HFExtractionError(Exception):
pass


class HFOCRError(Exception):
    """Raised by ``HFOCRClient`` when an OCR request cannot be completed.

    Covers missing endpoint/token configuration, transport failures, HTTP
    error statuses, and responses that cannot be parsed.
    """


class HFExtractionClient:
def __init__(
self,
Expand Down Expand Up @@ -227,6 +231,117 @@ def _limit_places(self, result: ExtractionResult) -> ExtractionResult:
return result


# Instruction sent as the text part of every OCR request. The literal is split
# on sentence boundaries; the concatenated result is a single-line prompt.
OCR_PROMPT = (
    "You are an OCR engine. "
    "Extract all visible Korean and English text from this Instagram post image. "
    "Preserve useful line breaks. "
    "Return only the extracted text, with no preface, explanation, Markdown, or translation. "
    "If no text is visible, return an empty string."
)


class HFOCRClient:
    """Async client that extracts text from images via a Hugging Face endpoint.

    Each image URL is sent in a chat-style payload (``model`` / ``messages``)
    together with ``OCR_PROMPT``; the text found in the response is returned.
    Endpoint URL and API token fall back to the extraction client's settings
    when the OCR-specific ones are empty.
    """

    def __init__(
        self,
        settings: Settings,
        *,
        transport: httpx.AsyncBaseTransport | None = None,
    ) -> None:
        """Store configuration.

        Args:
            settings: Application settings providing the ``hf_ocr_*`` values and
                the ``hf_extraction_*`` fallbacks / retry parameters.
            transport: Optional httpx transport handed to every
                ``httpx.AsyncClient`` this client creates.
        """
        self._settings = settings
        self._transport = transport

    @staticmethod
    def _normalize_image_url(image_url: object) -> str:
        """Return ``image_url`` as a stripped string (``""`` for falsy input)."""
        return str(image_url or "").strip()

    async def extract_text_from_image_url(self, image_url: str) -> str:
        """Run OCR on a single image URL.

        Args:
            image_url: URL of the image. Blank / falsy values short-circuit to
                ``""`` without any network call or configuration check.

        Returns:
            The stripped text extracted from the endpoint response.

        Raises:
            HFOCRError: If the endpoint URL or token is empty, the request times
                out or hits a network error on the last attempt, the endpoint
                answers with a non-retryable (or final-attempt) status >= 400,
                or the response body cannot be decoded as JSON.
        """
        # Normalise once so the value that was validated is the value that is sent
        # (a whitespace-padded URL would otherwise be forwarded verbatim).
        normalized_url = self._normalize_image_url(image_url)
        if not normalized_url:
            return ""

        endpoint_url = self._endpoint_url()
        api_token = self._api_token()
        if not endpoint_url:
            raise HFOCRError("HF OCR endpoint URL is empty")
        if not api_token:
            raise HFOCRError("HF OCR API token is empty")

        payload = self._build_payload(image_url=normalized_url)
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        timeout = httpx.Timeout(self._settings.hf_ocr_timeout_seconds)
        max_attempts = max(1, self._settings.hf_ocr_max_attempts)

        async with httpx.AsyncClient(timeout=timeout, transport=self._transport) as client:
            for attempt in range(1, max_attempts + 1):
                is_last_attempt = attempt >= max_attempts
                try:
                    response = await client.post(
                        endpoint_url,
                        headers=headers,
                        json=payload,
                    )
                except (httpx.TimeoutException, httpx.NetworkError) as exc:
                    if is_last_attempt:
                        raise HFOCRError(str(exc) or exc.__class__.__name__) from exc
                    await self._sleep_before_retry(attempt)
                    continue

                if response.status_code >= 400:
                    if is_last_attempt or not _is_retryable_status(response.status_code):
                        raise HFOCRError(f"HF OCR request failed ({response.status_code})")
                    await self._sleep_before_retry(attempt)
                    continue

                return self._parse_response(response)

        # Not reachable: the final attempt always returns or raises above. Kept so
        # the function never falls through and implicitly returns None.
        raise HFOCRError("HF OCR failed")

    async def extract_texts_from_image_urls(self, image_urls: list[str]) -> list[str]:
        """Run OCR on several image URLs, one after another.

        Images that yield no text are dropped, so the result may be shorter than
        ``image_urls`` and positions do not line up with the input. The first
        ``HFOCRError`` raised for any image propagates to the caller.
        """
        texts: list[str] = []
        for image_url in image_urls:
            text = (await self.extract_text_from_image_url(image_url)).strip()
            if text:
                texts.append(text)
        return texts

    def _build_payload(self, *, image_url: str) -> dict[str, Any]:
        """Build the request body: one user message with the prompt and the image."""
        return {
            "model": self._settings.hf_ocr_model_name,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": OCR_PROMPT},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
            "temperature": 0.0,
            "max_tokens": self._settings.hf_ocr_max_new_tokens,
        }

    def _endpoint_url(self) -> str:
        """Return the OCR endpoint URL, or the extraction endpoint URL if unset."""
        return self._settings.hf_ocr_endpoint_url or self._settings.hf_extraction_endpoint_url

    def _api_token(self) -> str:
        """Return the OCR API token, or the extraction API token if unset."""
        return self._settings.hf_ocr_api_token or self._settings.hf_extraction_api_token

    def _parse_response(self, response: httpx.Response) -> str:
        """Decode the JSON response and return the stripped text it carries.

        Raises:
            HFOCRError: If the body is not decodable JSON.
        """
        try:
            response_payload = response.json()
        except ValueError as exc:
            # ValueError covers json.JSONDecodeError and also UnicodeDecodeError,
            # which is raised when the body is not valid UTF-8/16/32.
            raise HFOCRError("HF OCR response is not valid JSON") from exc
        return extract_text_from_hf_payload(response_payload).strip()

    async def _sleep_before_retry(self, attempt: int) -> None:
        """Sleep before the next attempt using the extraction retry settings.

        The delay is ``base * multiplier ** (attempt - 1)``; a non-positive base
        disables sleeping and the multiplier is clamped to at least 1.
        """
        base_seconds = max(0.0, self._settings.hf_extraction_retry_base_seconds)
        if base_seconds <= 0:
            return
        multiplier = max(1.0, self._settings.hf_extraction_retry_backoff_multiplier)
        await asyncio.sleep(base_seconds * (multiplier ** max(0, attempt - 1)))

def extract_text_from_hf_payload(payload: Any) -> str:
if isinstance(payload, str):
return payload
Expand Down
Loading
Loading