Skip to content

Commit 4365de9

Browse files
penny-team[bot], jaredlockhart, claude
authored
Thread page og:image through browse_url as base64 attachment (jaredlockhart#919)
The browser content script already extracts og:image from pages, but it was dropped in formatResult. Now the addon downloads the image using the browser's fetch API (which has session cookies for CDN auth), base64 encodes it, and sends it as a separate image field on the tool response. The server threads it through SearchResult.image_base64 → MultiTool (first image wins) → ControllerResponse.attachments → send_response. When attachments are present, the Serper image search fallback is skipped. Signal receives the base64 data URI directly.

This was necessary because CDN bot detection (e.g., Akamai on CBC) blocks server-side image downloads via httpx — the browser's fetch has the authenticated session and cookies needed to pass bot checks.

- Addon: downloadImageAsDataUri in browse_url.ts using browser fetch
- Addon: BrowseResult type with text + image fields
- Addon: sendToolResponse includes image field
- Server: BrowserToolResponse.image field (optional)
- Server: send_tool_request returns tuple[str, str | None]
- Server: BrowseUrlTool.execute returns SearchResult with image
- Server: MultiTool passes first image through combined result

Co-authored-by: Jared Lockhart <119884+jaredlockhart@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3a3885a commit 4365de9

File tree

9 files changed

+183
-53
lines changed

9 files changed

+183
-53
lines changed

browser/src/background/background.ts

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -344,13 +344,16 @@ async function setToolUse(enabled: boolean): Promise<void> {
344344
broadcastToSidebar({ type: RuntimeMessageType.ToolUseState, enabled });
345345
}
346346

347-
function sendToolResponse(requestId: string, result?: string, error?: string): void {
347+
function sendToolResponse(
348+
requestId: string, result?: string, error?: string, image?: string,
349+
): void {
348350
if (!ws || ws.readyState !== WebSocket.OPEN) return;
349351
ws.send(JSON.stringify({
350352
type: WsOutgoingType.ToolResponse,
351353
request_id: requestId,
352354
result,
353355
error,
356+
image,
354357
}));
355358
}
356359

@@ -362,7 +365,7 @@ async function handleToolRequest(request: WsIncomingToolRequestPayload): Promise
362365
try {
363366
if (tool === "browse_url") {
364367
const result = await executeBrowseUrl(request_id, args);
365-
sendToolResponse(request_id, result);
368+
sendToolResponse(request_id, result.text, undefined, result.image);
366369
} else {
367370
sendToolResponse(request_id, undefined, `Unknown tool: ${tool}`);
368371
}
@@ -375,7 +378,7 @@ async function handleToolRequest(request: WsIncomingToolRequestPayload): Promise
375378
async function executeBrowseUrl(
376379
_requestId: string,
377380
args: Record<string, unknown>,
378-
): Promise<string> {
381+
): Promise<{ text: string; image: string }> {
379382
const url = args.url as string;
380383
if (!url) throw new Error("Missing required argument: url");
381384
return await browseUrl(url);
@@ -409,8 +412,8 @@ browser.runtime.onConnect.addListener(async (port) => {
409412
globalThis.debugBrowseUrl = (url: string): void => {
410413
browseUrl(url).then(
411414
(result) => {
412-
console.log(`[debug] ${result.length} chars`);
413-
console.log(result);
415+
console.log(`[debug] ${result.text.length} chars, image: ${result.image || "none"}`);
416+
console.log(result.text);
414417
},
415418
(err) => console.error("[debug] ERROR:", err),
416419
);

browser/src/background/tools/browse_url.ts

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,29 +15,30 @@ interface PageData {
1515
title: string;
1616
url: string;
1717
text: string;
18+
image: string;
1819
ready: boolean;
1920
}
2021

2122
const MAX_TAB_ATTEMPTS = 3;
2223

23-
export async function browseUrl(url: string): Promise<string> {
24+
export async function browseUrl(url: string): Promise<BrowseResult> {
2425
for (let attempt = 1; attempt <= MAX_TAB_ATTEMPTS; attempt++) {
2526
console.log(`[browse_url] opening: ${url} (attempt ${attempt}/${MAX_TAB_ATTEMPTS})`);
2627
const tab = await openHiddenTab(url);
2728
try {
2829
await waitForTabLoad(tab.id!);
2930
const pageData = await pollForContent(tab.id!);
30-
return formatResult(pageData);
31+
return await formatResult(pageData);
3132
} catch (err) {
3233
console.warn(`[browse_url] attempt ${attempt} failed:`, err);
3334
if (attempt === MAX_TAB_ATTEMPTS) {
34-
return `Failed to read ${url}: ${err}`;
35+
return { text: `Failed to read ${url}: ${err}`, image: "" };
3536
}
3637
} finally {
3738
await closeTab(tab.id!);
3839
}
3940
}
40-
return `Failed to read ${url}`;
41+
return { text: `Failed to read ${url}`, image: "" };
4142
}
4243

4344
async function pollForContent(tabId: number): Promise<PageData> {
@@ -120,6 +121,33 @@ async function closeTab(tabId: number): Promise<void> {
120121
}
121122
}
122123

123-
function formatResult(data: PageData): string {
124-
return `Title: ${data.title}\nURL: ${data.url}\n\n${data.text}`;
124+
interface BrowseResult {
125+
text: string;
126+
image: string;
127+
}
128+
129+
async function formatResult(data: PageData): Promise<BrowseResult> {
130+
const image = data.image ? await downloadImageAsDataUri(data.image) : "";
131+
console.log(`[browse_url] image: ${image ? `${image.length} chars` : "none"}`);
132+
return {
133+
text: `Title: ${data.title}\nURL: ${data.url}\n\n${data.text}`,
134+
image,
135+
};
136+
}
137+
138+
async function downloadImageAsDataUri(url: string): Promise<string> {
139+
try {
140+
const resp = await fetch(url);
141+
if (!resp.ok) return "";
142+
const blob = await resp.blob();
143+
const buffer = await blob.arrayBuffer();
144+
const bytes = new Uint8Array(buffer);
145+
let binary = "";
146+
for (const b of bytes) binary += String.fromCharCode(b);
147+
const b64 = btoa(binary);
148+
return `data:${blob.type};base64,${b64}`;
149+
} catch {
150+
console.warn("[browse_url] failed to download image:", url);
151+
return "";
152+
}
125153
}

penny/penny/agents/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,7 @@ def _build_final_response(
562562
tool_call_records: list[ToolCallRecord],
563563
) -> ControllerResponse:
564564
"""Build the ControllerResponse from the model's final (non-tool) answer."""
565+
logger.debug("Building final response with %d attachments", len(attachments))
565566
content = response.content.strip()
566567

567568
if not content:
@@ -736,7 +737,6 @@ async def _execute_single_tool(
736737
record.failed = _is_tool_result_failed(result_str)
737738
logger.debug("Tool result: %s", result_str[:200])
738739
return result_str, record, urls, image
739-
740740
result_str = self._truncate_tool_result(str(tool_result.result))
741741
record.failed = _is_tool_result_failed(result_str)
742742
logger.debug("Tool result: %s", result_str[:200])

penny/penny/channels/base.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -243,28 +243,26 @@ async def send_response(
243243
recipient: str,
244244
content: str,
245245
parent_id: int | None,
246-
image_prompt: str,
246+
image_prompt: str = "",
247247
attachments: list[str] | None = None,
248248
quote_message: MessageLog | None = None,
249249
thought_id: int | None = None,
250250
) -> int | None:
251251
"""
252-
Log and send an outgoing message with an image attachment.
252+
Log and send an outgoing message with optional image attachments.
253253
254254
Args:
255255
recipient: Identifier for the recipient
256256
content: Message content
257257
parent_id: Parent message ID for thread linking
258-
image_prompt: Search query for image attachment (max 300 chars)
259-
attachments: Optional list of base64-encoded attachments
258+
image_prompt: Deprecated — previously used for Serper image search
259+
attachments: Optional list of base64-encoded image attachments
260260
quote_message: Optional message to quote-reply to
261261
thought_id: Optional FK to the thought that triggered this message
262262
263263
Returns:
264264
Database message ID if send was successful, None otherwise
265265
"""
266-
image_prompt = image_prompt[: self.MAX_IMAGE_PROMPT_LENGTH]
267-
268266
if not attachments and image_prompt:
269267
attachments = await self._resolve_image(image_prompt, attachments)
270268
elif not attachments:

penny/penny/channels/browser/channel.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def __init__(
111111
self._port = port
112112
self._server: Server | None = None
113113
self._connections: dict[str, ConnectionInfo] = {}
114-
self._pending_requests: dict[str, asyncio.Future[str]] = {}
114+
self._pending_requests: dict[str, asyncio.Future[tuple[str, str | None]]] = {}
115115
self._pending_permissions: dict[str, asyncio.Future[bool]] = {}
116116
self._channel_manager: MessageChannel | None = None
117117

@@ -414,10 +414,15 @@ def _handle_tool_response(self, data: dict) -> None:
414414
logger.warning("No pending request for id: %s", response.request_id)
415415
return
416416

417+
logger.debug(
418+
"Tool response: result=%d chars, image=%s",
419+
len(response.result or ""),
420+
f"{len(response.image)} chars" if response.image else "none",
421+
)
417422
if response.error:
418423
future.set_exception(RuntimeError(response.error))
419424
else:
420-
future.set_result(response.result or "")
425+
future.set_result((response.result or "", response.image))
421426

422427
async def _handle_thoughts_request(self, ws: ServerConnection) -> None:
423428
"""Query recent thoughts and send them to the browser."""
@@ -626,11 +631,16 @@ async def _handle_chat_message(
626631

627632
# --- Tool requests ---
628633

629-
async def send_tool_request(self, tool: str, arguments: dict) -> str:
630-
"""Send a tool request to a connected browser and await the sanitized response.
631-
632-
Checks domain permission server-side before dispatching. If the domain
633-
is unknown, prompts all connected addons and Signal for a decision.
634+
async def send_tool_request(
635+
self,
636+
tool: str,
637+
arguments: dict,
638+
) -> tuple[str, str | None]:
639+
"""Send a tool request to a connected browser and await the response.
640+
641+
Returns (result_text, image), where image is the page's og:image as a base64 data URI, or empty/None when unavailable. Checks domain permission server-side
642+
before dispatching. If the domain is unknown, prompts all connected
643+
addons and Signal for a decision.
634644
"""
635645
if tool == "browse_url" and "url" in arguments:
636646
await self._check_domain_permission(arguments["url"])
@@ -640,7 +650,7 @@ async def send_tool_request(self, tool: str, arguments: dict) -> str:
640650
raise RuntimeError("No browser with tool-use enabled is connected")
641651

642652
request_id = str(uuid.uuid4())
643-
future: asyncio.Future[str] = asyncio.get_event_loop().create_future()
653+
future: asyncio.Future[tuple[str, str | None]] = asyncio.get_event_loop().create_future()
644654
self._pending_requests[request_id] = future
645655

646656
request = BrowserToolRequest(

penny/penny/channels/browser/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class BrowserToolResponse(BaseModel):
5353
request_id: str
5454
result: str | None = None
5555
error: str | None = None
56+
image: str | None = None
5657

5758

5859
class BrowserOutgoing(BaseModel):

0 commit comments

Comments
 (0)