From a910f3f8bc455153ff83c9f70ec1ca962552f0cc Mon Sep 17 00:00:00 2001
From: Ed Zynda
Date: Sat, 16 May 2026 15:32:22 +0300
Subject: [PATCH] fix(providers/openaicompat): forward tool result media to LLM

The openaicompat provider's ToPromptFunc only handled
ToolResultContentTypeText and ToolResultContentTypeError. Tool results
carrying ToolResultOutputContentMedia silently fell through the switch
with no matching case, producing no message at all. The downstream LLM
never received the tool result, typically causing it to hallucinate or
re-call the tool.

Mirror the openai provider's behavior:

- Emit a text tool message containing the media's accompanying text (or
  a placeholder describing the media type) so the tool_call/tool_result
  pairing stays valid.
- Follow up with a synthetic user message holding the image_url or
  input_audio content part so vision- and audio-capable models can
  actually see the media.

Supported media types: image/* (any), audio/wav, audio/mpeg, audio/mp3.
Unsupported types (e.g. video/mp4) still get the text tool message and
emit a CallWarning instead of being silently dropped.

Also add a default case to the tool result switch so that any other
unrecognized output type emits a CallWarning instead of being skipped
silently.

Add test coverage (one test function per scenario) for png images, audio
(wav, mp3), accompanying text precedence, and the unsupported-media-type
warning path.

Fixes charmbracelet/fantasy#208
---
Review notes (text between the "---" line and the diffstat is not part of
the commit message):
- Line breaks and gofmt indentation were restored from a whitespace-collapsed
  copy of this patch.
- The tool-message placeholder now refers to "the following user message"
  only when a media user message is actually emitted; unsupported media types
  get a placeholder saying the content could not be forwarded, and the
  unsupported-media test pins that wording.
- Hunk ranges and the diffstat were recounted by hand for that change. The
  commit id on the "From" line and the post-image blob ids on the "index"
  lines predate it; regenerate with git format-patch after applying.

 .../openaicompat/language_model_hooks.go      |  77 +++++++
 .../openaicompat/tool_result_media_test.go    | 212 ++++++++++++++++++
 2 files changed, 289 insertions(+)
 create mode 100644 providers/openaicompat/tool_result_media_test.go

diff --git a/providers/openaicompat/language_model_hooks.go b/providers/openaicompat/language_model_hooks.go
index 5ffefb692..20740b60e 100644
--- a/providers/openaicompat/language_model_hooks.go
+++ b/providers/openaicompat/language_model_hooks.go
@@ -434,6 +434,48 @@ func ToPromptFunc(prompt fantasy.Prompt, _, _ string) ([]openaisdk.ChatCompletio
 						continue
 					}
 					messages = append(messages, openaisdk.ToolMessage(output.Error.Error(), toolResultPart.ToolCallID))
+				case fantasy.ToolResultContentTypeMedia:
+					output, ok := fantasy.AsToolResultOutputType[fantasy.ToolResultOutputContentMedia](toolResultPart.Output)
+					if !ok {
+						warnings = append(warnings, fantasy.CallWarning{
+							Type:    fantasy.CallWarningTypeOther,
+							Message: "tool result output does not have the right type",
+						})
+						continue
+					}
+					// OpenAI-compatible chat completions tool messages cannot
+					// carry image or audio content directly; the SDK's content
+					// union only accepts text. To keep the tool_call/tool_result
+					// pairing valid while still surfacing the media to vision-
+					// and audio-capable models, emit a text tool message with any
+					// accompanying text (or a placeholder) and follow it with a
+					// synthetic user message holding the actual media content
+					// part. This mirrors the behavior of the openai provider.
+					mediaPart, mediaWarning, emit := toolResultMediaUserPart(output)
+					if mediaWarning != nil {
+						warnings = append(warnings, *mediaWarning)
+					}
+					// The placeholder only points at a follow-up user message
+					// when one is actually emitted (supported media types).
+					placeholder := output.Text
+					if placeholder == "" {
+						if emit {
+							placeholder = fmt.Sprintf("The tool returned %s content; see the following user message.", output.MediaType)
+						} else {
+							placeholder = fmt.Sprintf("The tool returned %s content, which cannot be forwarded to the model.", output.MediaType)
+						}
+					}
+					messages = append(messages, openaisdk.ToolMessage(placeholder, toolResultPart.ToolCallID))
+					if emit {
+						messages = append(messages, openaisdk.UserMessage(
+							[]openaisdk.ChatCompletionContentPartUnionParam{mediaPart},
+						))
+					}
+				default:
+					warnings = append(warnings, fantasy.CallWarning{
+						Type:    fantasy.CallWarningTypeOther,
+						Message: fmt.Sprintf("tool result output type %q not supported", toolResultPart.Output.GetType()),
+					})
 				}
 			}
 		}
@@ -441,6 +483,41 @@ func ToPromptFunc(prompt fantasy.Prompt, _, _ string) ([]openaisdk.ChatCompletio
 	return messages, warnings
 }
 
+// toolResultMediaUserPart maps a tool-result media output to an OpenAI chat
+// completions user content part. It returns the content part, an optional
+// warning, and whether the caller should emit the returned part.
+func toolResultMediaUserPart(output fantasy.ToolResultOutputContentMedia) (openaisdk.ChatCompletionContentPartUnionParam, *fantasy.CallWarning, bool) {
+	switch {
+	case strings.HasPrefix(output.MediaType, "image/"):
+		data := "data:" + output.MediaType + ";base64," + output.Data
+		imageBlock := openaisdk.ChatCompletionContentPartImageParam{
+			ImageURL: openaisdk.ChatCompletionContentPartImageImageURLParam{URL: data},
+		}
+		return openaisdk.ChatCompletionContentPartUnionParam{OfImageURL: &imageBlock}, nil, true
+	case output.MediaType == "audio/wav":
+		audioBlock := openaisdk.ChatCompletionContentPartInputAudioParam{
+			InputAudio: openaisdk.ChatCompletionContentPartInputAudioInputAudioParam{
+				Data:   output.Data,
+				Format: "wav",
+			},
+		}
+		return openaisdk.ChatCompletionContentPartUnionParam{OfInputAudio: &audioBlock}, nil, true
+	case output.MediaType == "audio/mpeg" || output.MediaType == "audio/mp3":
+		audioBlock := openaisdk.ChatCompletionContentPartInputAudioParam{
+			InputAudio: openaisdk.ChatCompletionContentPartInputAudioInputAudioParam{
+				Data:   output.Data,
+				Format: "mp3",
+			},
+		}
+		return openaisdk.ChatCompletionContentPartUnionParam{OfInputAudio: &audioBlock}, nil, true
+	default:
+		return openaisdk.ChatCompletionContentPartUnionParam{}, &fantasy.CallWarning{
+			Type:    fantasy.CallWarningTypeOther,
+			Message: fmt.Sprintf("tool result media type %s not supported, sending text placeholder only", output.MediaType),
+		}, false
+	}
+}
+
 func hasVisibleCompatUserContent(content []openaisdk.ChatCompletionContentPartUnionParam) bool {
 	for _, part := range content {
 		if part.OfText != nil || part.OfImageURL != nil || part.OfInputAudio != nil || part.OfFile != nil {
diff --git a/providers/openaicompat/tool_result_media_test.go b/providers/openaicompat/tool_result_media_test.go
new file mode 100644
index 000000000..642b605b7
--- /dev/null
+++ b/providers/openaicompat/tool_result_media_test.go
@@ -0,0 +1,212 @@
+package openaicompat
+
+import (
+	"encoding/base64"
+	"testing"
+
+	"charm.land/fantasy"
+	"github.com/stretchr/testify/require"
+)
+
+// Tool messages in the OpenAI Chat Completions API cannot carry image or audio
+// content directly — the SDK's content union only accepts text. When a tool
+// returns media, ToPromptFunc must still emit a text tool message so the
+// tool_call/tool_result pairing stays valid, and attach the media to a
+// synthetic follow-up user message so vision- and audio-capable models can see
+// it.
+//
+// These tests guard against regressions of charmbracelet/fantasy#208, where
+// the openaicompat provider silently dropped tool results carrying
+// ToolResultOutputContentMedia.
+
+func TestToPromptFunc_MediaToolResult_ImagePNG(t *testing.T) {
+	t.Parallel()
+
+	imageData := base64.StdEncoding.EncodeToString([]byte{0, 1, 2, 3})
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "img-1", ToolName: "view", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "img-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      imageData,
+						MediaType: "image/png",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	require.Empty(t, warnings)
+	// Assistant tool call + text tool message + synthetic user image message.
+	require.Len(t, messages, 3)
+
+	toolMsg := messages[1].OfTool
+	require.NotNil(t, toolMsg)
+	require.Equal(t, "img-1", toolMsg.ToolCallID)
+	require.Contains(t, toolMsg.Content.OfString.Value, "image/png")
+
+	userMsg := messages[2].OfUser
+	require.NotNil(t, userMsg)
+	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
+	imagePart := userMsg.Content.OfArrayOfContentParts[0].OfImageURL
+	require.NotNil(t, imagePart)
+	require.Equal(t, "data:image/png;base64,"+imageData, imagePart.ImageURL.URL)
+}
+
+func TestToPromptFunc_MediaToolResult_PrefersAccompanyingText(t *testing.T) {
+	t.Parallel()
+
+	imageData := base64.StdEncoding.EncodeToString([]byte{9, 9, 9})
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "img-2", ToolName: "view", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "img-2",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      imageData,
+						MediaType: "image/jpeg",
+						Text:      "Screenshot of the blockquote element.",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	require.Empty(t, warnings)
+	require.Len(t, messages, 3)
+	require.Equal(t, "Screenshot of the blockquote element.", messages[1].OfTool.Content.OfString.Value)
+}
+
+func TestToPromptFunc_MediaToolResult_AudioWAV(t *testing.T) {
+	t.Parallel()
+
+	audio := base64.StdEncoding.EncodeToString([]byte("fake-wav-bytes"))
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "audio-1", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "audio-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      audio,
+						MediaType: "audio/wav",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	require.Empty(t, warnings)
+	require.Len(t, messages, 3)
+	require.NotNil(t, messages[1].OfTool)
+	userMsg := messages[2].OfUser
+	require.NotNil(t, userMsg)
+	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
+	audioPart := userMsg.Content.OfArrayOfContentParts[0].OfInputAudio
+	require.NotNil(t, audioPart)
+	require.Equal(t, audio, audioPart.InputAudio.Data)
+	require.Equal(t, "wav", audioPart.InputAudio.Format)
+}
+
+func TestToPromptFunc_MediaToolResult_AudioMP3(t *testing.T) {
+	t.Parallel()
+
+	audio := base64.StdEncoding.EncodeToString([]byte("fake-mp3-bytes"))
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "audio-2", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "audio-2",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      audio,
+						MediaType: "audio/mpeg",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	require.Empty(t, warnings)
+	require.Len(t, messages, 3)
+	require.NotNil(t, messages[1].OfTool)
+	userMsg := messages[2].OfUser
+	require.NotNil(t, userMsg)
+	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
+	audioPart := userMsg.Content.OfArrayOfContentParts[0].OfInputAudio
+	require.NotNil(t, audioPart)
+	require.Equal(t, audio, audioPart.InputAudio.Data)
+	require.Equal(t, "mp3", audioPart.InputAudio.Format)
+}
+
+func TestToPromptFunc_MediaToolResult_UnsupportedMediaType(t *testing.T) {
+	t.Parallel()
+
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "vid-1", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "vid-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      "AAAA",
+						MediaType: "video/mp4",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	// Assistant tool call + text tool message, but no synthetic user image.
+	require.Len(t, messages, 2)
+	require.NotNil(t, messages[1].OfTool)
+	require.Equal(t, "vid-1", messages[1].OfTool.ToolCallID)
+	// No user message follows, so the placeholder must not point at one.
+	require.Contains(t, messages[1].OfTool.Content.OfString.Value, "video/mp4")
+	require.NotContains(t, messages[1].OfTool.Content.OfString.Value, "following user message")
+	require.Len(t, warnings, 1)
+	require.Contains(t, warnings[0].Message, "video/mp4")
+}