From 1151470db9c3c15147836c0a835b8d1aba36a1be Mon Sep 17 00:00:00 2001 From: Francesc Leveque Date: Sun, 17 May 2026 01:36:04 +0200 Subject: [PATCH] Validate image format magic + steer LLM to company domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for the LLM-fallback class of failures observed in prod. ## Magic-byte image format validation After grounding (#16) and the Wikidata layer (#18), the remaining LLM misses were failing in a particularly bad way: the model returned a URL that downloaded fine, but the bytes weren't an image — usually HTML from a Wikipedia file *page* or a generic 404 page. libvips spent ~6s per size trying to process them, hit our (now 60s) write timeout, and returned an opaque "Unsupported image format" without marking the row failed for retry. New `validateImageFormat` checks magic bytes for PNG, JPEG, WebP, and GIF before handing the data to bimg, with specific errors for the common bad cases (SVG → libvips can't handle it on Alpine; HTML → wrong URL). Rejection is fast (microseconds) and surfaces a clear reason in the logs. ## Gemini prompt rewrite The prior prompt encouraged Wikipedia/Wikimedia URLs — exactly the hosts the model can't reliably target because of the MD5-derived hash prefix in Commons paths. Prod hallucinations included three different invented hash prefixes for Repsol's logo (1/12, f/f9, 3/30) plus a malformed /thumb/ URL. Pivot the prompt to the company's own domain (and CDNs they own) and explicitly enumerate the anti-patterns we've observed: - upload.wikimedia.org URLs (use Wikidata path instead) - en.wikipedia.org/wiki/File:* (HTML pages, not files) - /thumb/ paths - stock-exchange "logo" endpoints - pattern-constructed URLs vs. URLs from real search results Also tightens the prompt's required-format list to PNG/JPEG/WebP (drops SVG), matching what our libvips build actually supports. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/llm/gemini.go | 41 +++++++++++----- internal/service/image_format.go | 56 ++++++++++++++++++++++ internal/service/image_format_test.go | 68 +++++++++++++++++++++++++++ internal/service/image_processor.go | 8 ++++ 4 files changed, 161 insertions(+), 12 deletions(-) create mode 100644 internal/service/image_format.go create mode 100644 internal/service/image_format_test.go diff --git a/internal/llm/gemini.go b/internal/llm/gemini.go index 19429e6..9eb1995 100644 --- a/internal/llm/gemini.go +++ b/internal/llm/gemini.go @@ -148,6 +148,14 @@ func truncate(s string, n int) string { // from the JSON-schema variant used by Anthropic / OpenAI clients: we ask for a // tagged free-text response since `google_search` and `responseSchema` can't // coexist in the same Gemini call. +// +// The prompt deliberately steers the model AWAY from Wikipedia/Wikimedia +// (those URLs have an MD5-derived hash prefix the model can't reliably +// reproduce — see production hallucinations like +// `upload.wikimedia.org/.../1/12/Repsol_logo.svg` 404'ing) and TOWARD the +// company's own domain, where logo URLs are stable patterns the model can +// actually verify via grounded search results. Wikipedia coverage is already +// handled deterministically by the Wikidata provider that runs first. func buildGeminiGroundedPrompt(symbol, companyName string) string { hint := "" if companyName != "" { @@ -156,24 +164,33 @@ func buildGeminiGroundedPrompt(symbol, companyName string) string { return fmt.Sprintf(`Find the official company logo for stock ticker "%s"%s. -Use Google Search to locate a DIRECT image URL for the company's official logo. Only return a URL you actually saw in the search results — do not guess or construct URLs from patterns you remember. +Use Google Search to locate a DIRECT image URL on the COMPANY'S OWN WEBSITE that serves an image file. + +PREFERRED sources (in order): +1. The company's primary domain (e.g. 
https://www.repsol.com/.../logo.png) +2. A CDN subdomain owned by the company (e.g. https://cdn.diageo.com/...) +3. The company's press kit, brand assets, or media-resources pages +4. Investor-relations pages on the company's domain -Prefer (in order): -1. Wikipedia / Wikimedia Commons file pages — copy the actual file URL from the page -2. The company's own website (look for /favicon.png, brand assets, press kit pages) -3. Reputable financial data sites (Yahoo Finance, Google Finance) +AVOID these — they're common failure modes: +- upload.wikimedia.org URLs — the hash-prefix path (e.g. /commons/X/YY/) is rarely guessed correctly. Skip Wikimedia entirely; we already cover that path separately. +- en.wikipedia.org/wiki/File:... — those are HTML pages, not file URLs. +- /thumb/ paths on Wikimedia — they often 400 without a proper Referer. +- Stock-exchange "logo" endpoints (e.g. londonstockexchange.com/images/logos/...) — most don't actually serve logos. +- URLs you constructed from a pattern but haven't actually seen in a real search result. -Requirements: -- Must be a DIRECT link to an image file (URL ends in .png, .svg, .jpg, .jpeg, or .webp) -- Must be publicly accessible (no auth, no paywall) -- Must be the company's primary logo, not a product or sub-brand variant +Requirements for the URL: +- Must be PNG, JPEG, or WebP (NOT SVG — our pipeline doesn't process SVG). +- The host should be the company's own domain or its CDN. +- Must be publicly accessible — no auth, no paywall, no Referer requirement. +- Must be the company's primary corporate logo, not a product or sub-brand variant. 
-Output format: after any reasoning, end your response with these two tags on their own lines: +Output format: end your response with these two tags on their own lines: the direct image URL you found -the page where you found it +the page on the company's site where you found it -If you cannot find a logo URL that meets all the requirements, output: +If you cannot find a URL on the company's own domain that meets ALL the requirements, output: `, symbol, hint) } diff --git a/internal/service/image_format.go b/internal/service/image_format.go new file mode 100644 index 0000000..d282da8 --- /dev/null +++ b/internal/service/image_format.go @@ -0,0 +1,56 @@ +package service + +import ( + "bytes" + "fmt" + "strings" +) + +// validateImageFormat returns nil if data starts with magic bytes for a raster +// image format our libvips build can process. Otherwise it returns a specific +// error so the caller can log why the bytes were rejected (instead of waiting +// for libvips to time out or emit its generic "Unsupported image format"). +// +// We explicitly reject SVG: the Alpine `vips` package this image runs on was +// compiled without rsvg support, so SVG bytes would hang the resize pipeline. +// The Wikidata provider already requests pre-rasterized PNGs via Wikimedia's +// `?width=` endpoint; this guard catches anything else that slips through +// (e.g. an LLM returning a Wikipedia file *page* URL whose body is HTML). +func validateImageFormat(data []byte) error { + if len(data) < 12 { + return fmt.Errorf("data too short (%d bytes) to be an image", len(data)) + } + + // PNG: \x89 P N G \r \n \x1a \n + if bytes.HasPrefix(data, []byte{0x89, 'P', 'N', 'G', '\r', '\n', 0x1a, '\n'}) { + return nil + } + // JPEG: FF D8 FF + if bytes.HasPrefix(data, []byte{0xff, 0xd8, 0xff}) { + return nil + } + // WebP: "RIFF" .... 
"WEBP" + if bytes.HasPrefix(data, []byte("RIFF")) && bytes.Equal(data[8:12], []byte("WEBP")) { + return nil + } + // GIF + if bytes.HasPrefix(data, []byte("GIF87a")) || bytes.HasPrefix(data, []byte("GIF89a")) { + return nil + } + + // Look at the first chunk as text to give better errors on common non-image responses. + headLen := 512 + if len(data) < headLen { + headLen = len(data) + } + head := strings.ToLower(strings.TrimSpace(string(data[:headLen]))) + + if strings.HasPrefix(head, "`), + []byte(``), + []byte(" \n\n"), + } + for i, data := range cases { + err := validateImageFormat(data) + if err == nil { + t.Errorf("case %d: SVG should be rejected", i) + continue + } + if !strings.Contains(err.Error(), "SVG") { + t.Errorf("case %d: error should mention SVG, got %v", i, err) + } + } +} + +func TestValidateImageFormat_RejectsHTML(t *testing.T) { + cases := [][]byte{ + []byte("not an image"), + []byte("404"), + } + for i, data := range cases { + err := validateImageFormat(data) + if err == nil { + t.Errorf("case %d: HTML should be rejected", i) + continue + } + if !strings.Contains(err.Error(), "HTML") { + t.Errorf("case %d: error should mention HTML, got %v", i, err) + } + } +} + +func TestValidateImageFormat_RejectsShortInput(t *testing.T) { + if err := validateImageFormat([]byte{0x89, 'P'}); err == nil { + t.Fatal("expected error on tiny input") + } +} + +func TestValidateImageFormat_RejectsUnknownBytes(t *testing.T) { + data := []byte("Hello world, definitely not an image at all") + if err := validateImageFormat(data); err == nil { + t.Fatal("expected error on garbage input") + } +} diff --git a/internal/service/image_processor.go b/internal/service/image_processor.go index 7cae774..af8700c 100644 --- a/internal/service/image_processor.go +++ b/internal/service/image_processor.go @@ -29,6 +29,14 @@ func NewImageProcessor(fs *storage.FileSystem) *ImageProcessor { // Go note: returning a map lets the caller know which sizes succeeded. 
// We process all sizes even if some fail, collecting errors along the way. func (p *ImageProcessor) ProcessAll(symbol string, imageData []byte) (map[model.LogoSize]bool, error) { + // Magic-byte gate: reject obviously-wrong inputs (HTML returned by a bad + // LLM URL, SVG that would hang libvips) BEFORE handing to bimg. Without + // this, the resize loop spends ~6s per size on a bad input before giving + // up — enough to trip kamal-proxy's request timeout. + if err := validateImageFormat(imageData); err != nil { + return nil, fmt.Errorf("input rejected: %w", err) + } + results := make(map[model.LogoSize]bool) var errs []string