diff --git a/CLAUDE.md b/CLAUDE.md index ddf392f..35e6725 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -127,7 +127,7 @@ LLM이 결정하지만, 사용자 질의의 입력 신호에 대해 **첫 worker | 사용자 질의 신호 | 첫 sub-agent | 첫 worker | prompt 위치 | |:---|:---|:---|:---| | 데이터 첨부(csv/xlsx/json/pdf/docx) + 분석/차트 요청 | `data_science_team` | **`data_engineer`** (ONE-pass inspect) → `data_analyst`(python_repl + 차트) | `SYSTEM_SUPERVISOR_PROMPT` `# TEAM SELECTION HINTS` + `TEAM_SUPERVISOR_PROMPT` `# DATA SCIENCE TEAM HANDOFF` | -| 이미지 첨부 | `vision_team` | `image_inspector` → `image_editor` | `SYSTEM_SUPERVISOR_PROMPT` `# TEAM SELECTION HINTS` | +| 이미지 첨부 | `vision_team` | `vision_analyst` (tools: `get_image_metadata`, `resize_image`) | `SYSTEM_SUPERVISOR_PROMPT` `# REQUIRED FIRST ROUTES` + `TEAM_SUPERVISOR_PROMPT` `# VISION TEAM HANDOFF` | | 최신 정보·뉴스·"latest" 요청 | `research_team` | `search` → 필요 시 `web_scraper` | `RESEARCH_TEAM_SUPERVISOR_PROMPT` | | repo 바인딩 + 코드 수정/실행 | `coding_team` | `codebase_explorer` → `implementation_engineer` → (선택) `runtime_verifier` | `SYSTEM_SUPERVISOR_PROMPT` `# CRITICAL GUIDELINES 2a/2b` | | 명시적 보고서/슬라이드/문서 작성 | `writing_team` | `note_taker` → `doc_writer` | `SYSTEM_SUPERVISOR_PROMPT` `# CRITICAL GUIDELINES 6a` | diff --git a/apps/backend/tests/test_agent_tools.py b/apps/backend/tests/test_agent_tools.py index e96985a..d37487c 100644 --- a/apps/backend/tests/test_agent_tools.py +++ b/apps/backend/tests/test_agent_tools.py @@ -82,11 +82,52 @@ def test_vision_tools_with_dummy_image(): meta_result = get_image_metadata.invoke({"base64_image": dummy_base64}) assert "JPEG" in meta_result assert "100, 100" in meta_result + # New metadata fields — locks the structured payload so prompt drift + # doesn't silently shrink what vision_analyst sees. + assert "FileSize:" in meta_result + assert "EXIF:" in meta_result + assert "Alpha:" in meta_result resize_result = resize_image.invoke( {"base64_image": dummy_base64, "max_width": 50, "max_height": 50} ) assert "successfully resized to (50, 50)" in resize_result + # The summary must also report the original size and the file-size delta + # so the analyst can reason about whether the resize actually saved bytes. + assert "from (100, 100)" in resize_result + assert "->" in resize_result + + +def test_resize_image_applies_exif_orientation_correction(): + """EXIF Orientation=6 (rotate 90 CW) must swap width/height after resize. + + Without ``ImageOps.exif_transpose`` a portrait phone photo (stored as + landscape pixels with an EXIF rotate-90 tag) is fed to the LLM rotated, + which silently degrades vision-analyst accuracy. This test pins the fix. + """ + import base64 + import io + from PIL import Image + from agent_tools.vision import resize_image + + # 100x200 stored pixels; EXIF says "rotate 90 CW for display" → after + # transpose the dimensions become (200, 100). + img = Image.new("RGB", (100, 200), color="green") + exif = img.getexif() + exif[0x0112] = 6 # Orientation tag: rotate 90 CW + buffered = io.BytesIO() + img.save(buffered, format="JPEG", exif=exif) + b64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + + # Resize box larger than both axes so thumbnail() is a no-op and we + # observe purely the EXIF correction. + result = resize_image.invoke( + {"base64_image": b64, "max_width": 400, "max_height": 400} + ) + + assert "(200, 100)" in result, ( + f"EXIF orientation correction missing — expected (200, 100), got: {result}" + ) def _make_runtime(tmp_path, attachments): diff --git a/packages/agent-tools/src/agent_tools/vision.py b/packages/agent-tools/src/agent_tools/vision.py index 8a28095..82cbd91 100644 --- a/packages/agent-tools/src/agent_tools/vision.py +++ b/packages/agent-tools/src/agent_tools/vision.py @@ -1,19 +1,39 @@ import base64 import io from typing import Annotated, Optional -from PIL import Image + +from PIL import Image, ImageOps from langchain_core.tools import tool +def _exif_corrected(img: Image.Image) -> Image.Image: + """Rotate/flip per the EXIF Orientation tag so portrait photos read upright. + + Without this, a phone photo whose EXIF says "rotate 90 CW for display" + stays in its stored orientation and downstream LLM vision misreads the + scene (people lying down, text rotated, etc.). + """ + return ImageOps.exif_transpose(img) or img + + @tool def get_image_metadata( base64_image: Annotated[str, "The base64 encoded image string."], ) -> str: - """Extracts metadata such as format, size, and mode from a base64 encoded image.""" + """Extract format, size, color mode, file size, EXIF, and alpha info from a base64 image.""" try: image_data = base64.b64decode(base64_image) img = Image.open(io.BytesIO(image_data)) - return f"Format: {img.format}, Size: {img.size}, Mode: {img.mode}" + has_exif = bool( + getattr(img, "_getexif", lambda: None)() or img.info.get("exif") + ) + has_alpha = img.mode in ("RGBA", "LA") or ( + img.mode == "P" and "transparency" in img.info + ) + return ( + f"Format: {img.format}, Size: {img.size}, Mode: {img.mode}, " + f"FileSize: {len(image_data)} bytes, EXIF: {has_exif}, Alpha: {has_alpha}" + ) except Exception as e: return f"Error extracting metadata: {str(e)}" @@ -28,20 +48,30 @@ def resize_image( Optional[int], "Maximum height for the resized image. Defaults to 1024." ] = 1024, ) -> str: - """Resizes an image while maintaining aspect ratio and returns the new base64 string.""" + """Resize an image (aspect-preserving, EXIF-orientation aware). + + Returns a short factual summary: original size → new size and file-size + delta. The new base64 is intentionally not returned to avoid blowing up + the LLM context window — vision_analyst already has the original image + in its input messages; this tool exists so the analyst can confirm that + a smaller copy is feasible and reason about it. + """ try: image_data = base64.b64decode(base64_image) img = Image.open(io.BytesIO(image_data)) + original_size = img.size + original_fmt = img.format if img.format else "JPEG" + img = _exif_corrected(img) - # Maintain aspect ratio img.thumbnail((max_width, max_height)) buffered = io.BytesIO() - # Save back to same format if possible, otherwise default to JPEG - fmt = img.format if img.format else "JPEG" - img.save(buffered, format=fmt) + img.save(buffered, format=original_fmt) + new_bytes = buffered.getvalue() - new_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") - return f"Image successfully resized to {img.size}. New Base64 length: {len(new_base64)}" + return ( + f"Image successfully resized to {img.size} from {original_size}. " + f"FileSize: {len(image_data)} -> {len(new_bytes)} bytes." + ) except Exception as e: return f"Error resizing image: {str(e)}"