Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions providers/openaicompat/language_model_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -434,13 +434,84 @@ func ToPromptFunc(prompt fantasy.Prompt, _, _ string) ([]openaisdk.ChatCompletio
continue
}
messages = append(messages, openaisdk.ToolMessage(output.Error.Error(), toolResultPart.ToolCallID))
case fantasy.ToolResultContentTypeMedia:
output, ok := fantasy.AsToolResultOutputType[fantasy.ToolResultOutputContentMedia](toolResultPart.Output)
if !ok {
warnings = append(warnings, fantasy.CallWarning{
Type: fantasy.CallWarningTypeOther,
Message: "tool result output does not have the right type",
})
continue
}
// OpenAI-compatible chat completions tool messages cannot
// carry image or audio content directly; the SDK's content
// union only accepts text. To keep the tool_call/tool_result
// pairing valid while still surfacing the media to
// vision-capable models, emit a text tool message with any
// accompanying text (or a placeholder) and follow it with a
// synthetic user message holding the actual media content
// part. This mirrors the behavior of the openai provider.
placeholder := output.Text
if placeholder == "" {
placeholder = fmt.Sprintf("The tool returned %s content; see the following user message.", output.MediaType)
}
messages = append(messages, openaisdk.ToolMessage(placeholder, toolResultPart.ToolCallID))
mediaPart, mediaWarning, emit := toolResultMediaUserPart(output)
if mediaWarning != nil {
warnings = append(warnings, *mediaWarning)
}
if emit {
messages = append(messages, openaisdk.UserMessage(
[]openaisdk.ChatCompletionContentPartUnionParam{mediaPart},
))
}
default:
warnings = append(warnings, fantasy.CallWarning{
Type: fantasy.CallWarningTypeOther,
Message: fmt.Sprintf("tool result output type %q not supported", toolResultPart.Output.GetType()),
})
}
}
}
}
return messages, warnings
}

// toolResultMediaUserPart converts the media output of a tool result into a
// content part for a chat completions user message.
//
// Media types starting with "image/" become an image URL part whose URL is a
// base64 data URL built from the media type and data. "audio/wav" becomes an
// input audio part with format "wav"; "audio/mpeg" and "audio/mp3" become an
// input audio part with format "mp3". For any other media type the returned
// part is the zero value, the warning is non-nil, and the boolean is false to
// tell the caller not to emit the part.
func toolResultMediaUserPart(output fantasy.ToolResultOutputContentMedia) (openaisdk.ChatCompletionContentPartUnionParam, *fantasy.CallWarning, bool) {
	mediaType := output.MediaType

	// Images are handled first: every image subtype is passed through as-is.
	if strings.HasPrefix(mediaType, "image/") {
		image := openaisdk.ChatCompletionContentPartImageParam{
			ImageURL: openaisdk.ChatCompletionContentPartImageImageURLParam{
				URL: "data:" + mediaType + ";base64," + output.Data,
			},
		}
		return openaisdk.ChatCompletionContentPartUnionParam{OfImageURL: &image}, nil, true
	}

	// Resolve the audio format; anything unrecognised is reported and dropped.
	var audioFormat string
	switch mediaType {
	case "audio/wav":
		audioFormat = "wav"
	case "audio/mpeg", "audio/mp3":
		audioFormat = "mp3"
	default:
		unsupported := fantasy.CallWarning{
			Type:    fantasy.CallWarningTypeOther,
			Message: fmt.Sprintf("tool result media type %s not supported, sending text placeholder only", mediaType),
		}
		return openaisdk.ChatCompletionContentPartUnionParam{}, &unsupported, false
	}

	audio := openaisdk.ChatCompletionContentPartInputAudioParam{
		InputAudio: openaisdk.ChatCompletionContentPartInputAudioInputAudioParam{
			Data:   output.Data,
			Format: audioFormat,
		},
	}
	return openaisdk.ChatCompletionContentPartUnionParam{OfInputAudio: &audio}, nil, true
}

func hasVisibleCompatUserContent(content []openaisdk.ChatCompletionContentPartUnionParam) bool {
for _, part := range content {
if part.OfText != nil || part.OfImageURL != nil || part.OfInputAudio != nil || part.OfFile != nil {
Expand Down
209 changes: 209 additions & 0 deletions providers/openaicompat/tool_result_media_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
package openaicompat

import (
"encoding/base64"
"testing"

"charm.land/fantasy"
"github.com/stretchr/testify/require"
)

// Tool messages in the OpenAI Chat Completions API cannot carry image or audio
// content directly — the SDK's content union only accepts text. When a tool
// returns media, ToPromptFunc must still emit a text tool message so the
// tool_call/tool_result pairing stays valid, and attach the media to a
// synthetic follow-up user message so vision- and audio-capable models can see
// it.
//
// These tests guard against regressions of charmbracelet/fantasy#208, where
// the openaicompat provider silently dropped tool results carrying
// ToolResultOutputContentMedia.

// TestToPromptFunc_MediaToolResult_ImagePNG checks that a PNG tool result with
// no accompanying text yields a placeholder tool message followed by a user
// message carrying the image as a base64 data URL.
func TestToPromptFunc_MediaToolResult_ImagePNG(t *testing.T) {
	t.Parallel()

	encoded := base64.StdEncoding.EncodeToString([]byte{0, 1, 2, 3})

	toolCall := fantasy.ToolCallPart{ToolCallID: "img-1", ToolName: "view", Input: "{}"}
	toolResult := fantasy.ToolResultPart{
		ToolCallID: "img-1",
		Output: fantasy.ToolResultOutputContentMedia{
			Data:      encoded,
			MediaType: "image/png",
		},
	}
	prompt := fantasy.Prompt{
		{Role: fantasy.MessageRoleAssistant, Content: []fantasy.MessagePart{toolCall}},
		{Role: fantasy.MessageRoleTool, Content: []fantasy.MessagePart{toolResult}},
	}

	got, warns := ToPromptFunc(prompt, "", "")

	require.Empty(t, warns)
	// Expected order: assistant tool call, text tool message, synthetic
	// user message holding the image.
	require.Len(t, got, 3)

	// The tool message keeps the call/result pairing and names the media
	// type in its placeholder text.
	tool := got[1].OfTool
	require.NotNil(t, tool)
	require.Equal(t, "img-1", tool.ToolCallID)
	require.Contains(t, tool.Content.OfString.Value, "image/png")

	// The follow-up user message holds exactly one part: the image.
	user := got[2].OfUser
	require.NotNil(t, user)
	parts := user.Content.OfArrayOfContentParts
	require.Len(t, parts, 1)
	image := parts[0].OfImageURL
	require.NotNil(t, image)
	require.Equal(t, "data:image/png;base64,"+encoded, image.ImageURL.URL)
}

// TestToPromptFunc_MediaToolResult_PrefersAccompanyingText checks that when a
// media tool result carries its own text, that text is used verbatim as the
// tool message content instead of the generated placeholder, and that the
// image is still forwarded in the follow-up user message.
func TestToPromptFunc_MediaToolResult_PrefersAccompanyingText(t *testing.T) {
	t.Parallel()

	imageData := base64.StdEncoding.EncodeToString([]byte{9, 9, 9})
	prompt := fantasy.Prompt{
		{
			Role: fantasy.MessageRoleAssistant,
			Content: []fantasy.MessagePart{
				fantasy.ToolCallPart{ToolCallID: "img-2", ToolName: "view", Input: "{}"},
			},
		},
		{
			Role: fantasy.MessageRoleTool,
			Content: []fantasy.MessagePart{
				fantasy.ToolResultPart{
					ToolCallID: "img-2",
					Output: fantasy.ToolResultOutputContentMedia{
						Data:      imageData,
						MediaType: "image/jpeg",
						Text:      "Screenshot of the blockquote element.",
					},
				},
			},
		},
	}

	messages, warnings := ToPromptFunc(prompt, "", "")

	require.Empty(t, warnings)
	require.Len(t, messages, 3)

	// Guard the dereference so a regression fails the test instead of
	// panicking on a nil tool message.
	toolMsg := messages[1].OfTool
	require.NotNil(t, toolMsg)
	require.Equal(t, "img-2", toolMsg.ToolCallID)
	require.Equal(t, "Screenshot of the blockquote element.", toolMsg.Content.OfString.Value)

	// Supplying text must not suppress the media itself.
	userMsg := messages[2].OfUser
	require.NotNil(t, userMsg)
	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
	imagePart := userMsg.Content.OfArrayOfContentParts[0].OfImageURL
	require.NotNil(t, imagePart)
	require.Equal(t, "data:image/jpeg;base64,"+imageData, imagePart.ImageURL.URL)
}

// TestToPromptFunc_MediaToolResult_AudioWAV checks that an "audio/wav" tool
// result yields a tool message followed by a user message whose single part
// is input audio with format "wav" and the original base64 data.
func TestToPromptFunc_MediaToolResult_AudioWAV(t *testing.T) {
	t.Parallel()

	encoded := base64.StdEncoding.EncodeToString([]byte("fake-wav-bytes"))

	toolCall := fantasy.ToolCallPart{ToolCallID: "audio-1", ToolName: "record", Input: "{}"}
	toolResult := fantasy.ToolResultPart{
		ToolCallID: "audio-1",
		Output: fantasy.ToolResultOutputContentMedia{
			Data:      encoded,
			MediaType: "audio/wav",
		},
	}
	prompt := fantasy.Prompt{
		{Role: fantasy.MessageRoleAssistant, Content: []fantasy.MessagePart{toolCall}},
		{Role: fantasy.MessageRoleTool, Content: []fantasy.MessagePart{toolResult}},
	}

	got, warns := ToPromptFunc(prompt, "", "")

	require.Empty(t, warns)
	// Assistant tool call, text tool message, synthetic user audio message.
	require.Len(t, got, 3)
	require.NotNil(t, got[1].OfTool)

	user := got[2].OfUser
	require.NotNil(t, user)
	parts := user.Content.OfArrayOfContentParts
	require.Len(t, parts, 1)
	clip := parts[0].OfInputAudio
	require.NotNil(t, clip)
	require.Equal(t, encoded, clip.InputAudio.Data)
	require.Equal(t, "wav", clip.InputAudio.Format)
}

// TestToPromptFunc_MediaToolResult_AudioMP3 checks that both media types the
// provider treats as MP3 ("audio/mpeg" and "audio/mp3") yield a tool message
// followed by a user message whose single part is input audio with format
// "mp3" and the original base64 data.
func TestToPromptFunc_MediaToolResult_AudioMP3(t *testing.T) {
	t.Parallel()

	audio := base64.StdEncoding.EncodeToString([]byte("fake-mp3-bytes"))

	// Subtests run sequentially inside this parallel test, so the closure
	// reads mediaType before the loop advances.
	for _, mediaType := range []string{"audio/mpeg", "audio/mp3"} {
		t.Run(mediaType, func(t *testing.T) {
			prompt := fantasy.Prompt{
				{
					Role: fantasy.MessageRoleAssistant,
					Content: []fantasy.MessagePart{
						fantasy.ToolCallPart{ToolCallID: "audio-2", ToolName: "record", Input: "{}"},
					},
				},
				{
					Role: fantasy.MessageRoleTool,
					Content: []fantasy.MessagePart{
						fantasy.ToolResultPart{
							ToolCallID: "audio-2",
							Output: fantasy.ToolResultOutputContentMedia{
								Data:      audio,
								MediaType: mediaType,
							},
						},
					},
				},
			}

			messages, warnings := ToPromptFunc(prompt, "", "")

			require.Empty(t, warnings)
			// Assistant tool call + text tool message + synthetic user
			// audio message.
			require.Len(t, messages, 3)
			require.NotNil(t, messages[1].OfTool)
			userMsg := messages[2].OfUser
			require.NotNil(t, userMsg)
			require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
			audioPart := userMsg.Content.OfArrayOfContentParts[0].OfInputAudio
			require.NotNil(t, audioPart)
			require.Equal(t, audio, audioPart.InputAudio.Data)
			require.Equal(t, "mp3", audioPart.InputAudio.Format)
		})
	}
}

// TestToPromptFunc_MediaToolResult_UnsupportedMediaType checks that a media
// type the provider cannot forward ("video/mp4") still produces the text tool
// message, produces no follow-up user message, and reports exactly one
// warning naming the media type.
func TestToPromptFunc_MediaToolResult_UnsupportedMediaType(t *testing.T) {
	t.Parallel()

	prompt := fantasy.Prompt{
		{
			Role: fantasy.MessageRoleAssistant,
			Content: []fantasy.MessagePart{
				fantasy.ToolCallPart{ToolCallID: "vid-1", ToolName: "record", Input: "{}"},
			},
		},
		{
			Role: fantasy.MessageRoleTool,
			Content: []fantasy.MessagePart{
				fantasy.ToolResultPart{
					ToolCallID: "vid-1",
					Output: fantasy.ToolResultOutputContentMedia{
						Data:      "AAAA",
						MediaType: "video/mp4",
					},
				},
			},
		},
	}

	messages, warnings := ToPromptFunc(prompt, "", "")

	// Assistant tool call + text tool message, but no synthetic user
	// message because the media cannot be forwarded.
	require.Len(t, messages, 2)
	toolMsg := messages[1].OfTool
	require.NotNil(t, toolMsg)
	require.Equal(t, "vid-1", toolMsg.ToolCallID)
	// With no accompanying text, the placeholder names the media type.
	require.Contains(t, toolMsg.Content.OfString.Value, "video/mp4")

	require.Len(t, warnings, 1)
	require.Equal(t, fantasy.CallWarningTypeOther, warnings[0].Type)
	require.Contains(t, warnings[0].Message, "video/mp4")
}
Loading