diff --git a/providers/openaicompat/language_model_hooks.go b/providers/openaicompat/language_model_hooks.go
index 5ffefb692..20740b60e 100644
--- a/providers/openaicompat/language_model_hooks.go
+++ b/providers/openaicompat/language_model_hooks.go
@@ -434,6 +434,54 @@ func ToPromptFunc(prompt fantasy.Prompt, _, _ string) ([]openaisdk.ChatCompletio
 						continue
 					}
 					messages = append(messages, openaisdk.ToolMessage(output.Error.Error(), toolResultPart.ToolCallID))
+				case fantasy.ToolResultContentTypeMedia:
+					output, ok := fantasy.AsToolResultOutputType[fantasy.ToolResultOutputContentMedia](toolResultPart.Output)
+					if !ok {
+						warnings = append(warnings, fantasy.CallWarning{
+							Type:    fantasy.CallWarningTypeOther,
+							Message: "tool result output does not have the right type",
+						})
+						continue
+					}
+					// OpenAI-compatible chat completions tool messages cannot
+					// carry image or audio content directly; the SDK's content
+					// union only accepts text. To keep the tool_call/tool_result
+					// pairing valid while still surfacing the media to
+					// vision-capable models, emit a text tool message with any
+					// accompanying text (or a placeholder) and follow it with a
+					// synthetic user message holding the actual media content
+					// part. This mirrors the behavior of the openai provider.
+					//
+					// Resolve the media part first so the placeholder only
+					// points at a follow-up user message when one is emitted.
+					mediaPart, mediaWarning, emit := toolResultMediaUserPart(output)
+					if mediaWarning != nil {
+						warnings = append(warnings, *mediaWarning)
+					}
+					placeholder := output.Text
+					switch {
+					case placeholder != "":
+						// Tool-supplied text always wins over a placeholder.
+					case emit:
+						placeholder = fmt.Sprintf("The tool returned %s content; see the following user message.", output.MediaType)
+					default:
+						placeholder = fmt.Sprintf("The tool returned %s content, which is not supported and was omitted.", output.MediaType)
+					}
+					messages = append(messages, openaisdk.ToolMessage(placeholder, toolResultPart.ToolCallID))
+					// NOTE(review): the user message is appended right after
+					// this tool message; if further tool results follow, it
+					// ends up between tool messages — confirm backends accept
+					// that ordering.
+					if emit {
+						messages = append(messages, openaisdk.UserMessage(
+							[]openaisdk.ChatCompletionContentPartUnionParam{mediaPart},
+						))
+					}
+				default:
+					warnings = append(warnings, fantasy.CallWarning{
+						Type:    fantasy.CallWarningTypeOther,
+						Message: fmt.Sprintf("tool result output type %q not supported", toolResultPart.Output.GetType()),
+					})
 				}
 			}
 		}
@@ -441,6 +489,49 @@ func ToPromptFunc(prompt fantasy.Prompt, _, _ string) ([]openaisdk.ChatCompletio
 	return messages, warnings
 }
 
+// toolResultMediaUserPart maps a tool-result media output to an OpenAI chat
+// completions user content part.
+//
+// Media types starting with "image/" become an image_url part whose URL is a
+// data URL built from output.MediaType and output.Data. "audio/wav" becomes
+// an input_audio part with format "wav"; "audio/mpeg" and "audio/mp3" become
+// one with format "mp3". Any other media type yields a zero-value part, a
+// warning, and false.
+//
+// It returns the content part, an optional warning, and whether the caller
+// should emit the returned part.
+func toolResultMediaUserPart(output fantasy.ToolResultOutputContentMedia) (openaisdk.ChatCompletionContentPartUnionParam, *fantasy.CallWarning, bool) {
+	switch {
+	case strings.HasPrefix(output.MediaType, "image/"):
+		data := "data:" + output.MediaType + ";base64," + output.Data
+		imageBlock := openaisdk.ChatCompletionContentPartImageParam{
+			ImageURL: openaisdk.ChatCompletionContentPartImageImageURLParam{URL: data},
+		}
+		return openaisdk.ChatCompletionContentPartUnionParam{OfImageURL: &imageBlock}, nil, true
+	case output.MediaType == "audio/wav":
+		audioBlock := openaisdk.ChatCompletionContentPartInputAudioParam{
+			InputAudio: openaisdk.ChatCompletionContentPartInputAudioInputAudioParam{
+				Data:   output.Data,
+				Format: "wav",
+			},
+		}
+		return openaisdk.ChatCompletionContentPartUnionParam{OfInputAudio: &audioBlock}, nil, true
+	case output.MediaType == "audio/mpeg" || output.MediaType == "audio/mp3":
+		audioBlock := openaisdk.ChatCompletionContentPartInputAudioParam{
+			InputAudio: openaisdk.ChatCompletionContentPartInputAudioInputAudioParam{
+				Data:   output.Data,
+				Format: "mp3",
+			},
+		}
+		return openaisdk.ChatCompletionContentPartUnionParam{OfInputAudio: &audioBlock}, nil, true
+	default:
+		return openaisdk.ChatCompletionContentPartUnionParam{}, &fantasy.CallWarning{
+			Type:    fantasy.CallWarningTypeOther,
+			Message: fmt.Sprintf("tool result media type %s not supported, sending text placeholder only", output.MediaType),
+		}, false
+	}
+}
+
 func hasVisibleCompatUserContent(content []openaisdk.ChatCompletionContentPartUnionParam) bool {
 	for _, part := range content {
 		if part.OfText != nil || part.OfImageURL != nil || part.OfInputAudio != nil || part.OfFile != nil {
diff --git a/providers/openaicompat/tool_result_media_test.go b/providers/openaicompat/tool_result_media_test.go
new file mode 100644
index 000000000..642b605b7
--- /dev/null
+++ b/providers/openaicompat/tool_result_media_test.go
@@ -0,0 +1,212 @@
+package openaicompat
+
+import (
+	"encoding/base64"
+	"testing"
+
+	"charm.land/fantasy"
+	"github.com/stretchr/testify/require"
+)
+
+// Tool messages in the OpenAI Chat Completions API cannot carry image or audio
+// content directly — the SDK's content union only accepts text. When a tool
+// returns media, ToPromptFunc must still emit a text tool message so the
+// tool_call/tool_result pairing stays valid, and attach the media to a
+// synthetic follow-up user message so vision- and audio-capable models can see
+// it.
+//
+// These tests guard against regressions of charmbracelet/fantasy#208, where
+// the openaicompat provider silently dropped tool results carrying
+// ToolResultOutputContentMedia.
+
+func TestToPromptFunc_MediaToolResult_ImagePNG(t *testing.T) {
+	t.Parallel()
+
+	imageData := base64.StdEncoding.EncodeToString([]byte{0, 1, 2, 3})
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "img-1", ToolName: "view", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "img-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      imageData,
+						MediaType: "image/png",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	require.Empty(t, warnings)
+	// Assistant tool call + text tool message + synthetic user image message.
+	require.Len(t, messages, 3)
+
+	toolMsg := messages[1].OfTool
+	require.NotNil(t, toolMsg)
+	require.Equal(t, "img-1", toolMsg.ToolCallID)
+	require.Contains(t, toolMsg.Content.OfString.Value, "image/png")
+
+	userMsg := messages[2].OfUser
+	require.NotNil(t, userMsg)
+	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
+	imagePart := userMsg.Content.OfArrayOfContentParts[0].OfImageURL
+	require.NotNil(t, imagePart)
+	require.Equal(t, "data:image/png;base64,"+imageData, imagePart.ImageURL.URL)
+}
+
+func TestToPromptFunc_MediaToolResult_PrefersAccompanyingText(t *testing.T) {
+	t.Parallel()
+
+	imageData := base64.StdEncoding.EncodeToString([]byte{9, 9, 9})
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "img-2", ToolName: "view", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "img-2",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      imageData,
+						MediaType: "image/jpeg",
+						Text:      "Screenshot of the blockquote element.",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	require.Empty(t, warnings)
+	require.Len(t, messages, 3)
+	require.Equal(t, "Screenshot of the blockquote element.", messages[1].OfTool.Content.OfString.Value)
+}
+
+func TestToPromptFunc_MediaToolResult_AudioWAV(t *testing.T) {
+	t.Parallel()
+
+	audio := base64.StdEncoding.EncodeToString([]byte("fake-wav-bytes"))
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "audio-1", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "audio-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      audio,
+						MediaType: "audio/wav",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	require.Empty(t, warnings)
+	require.Len(t, messages, 3)
+	require.NotNil(t, messages[1].OfTool)
+	userMsg := messages[2].OfUser
+	require.NotNil(t, userMsg)
+	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
+	audioPart := userMsg.Content.OfArrayOfContentParts[0].OfInputAudio
+	require.NotNil(t, audioPart)
+	require.Equal(t, audio, audioPart.InputAudio.Data)
+	require.Equal(t, "wav", audioPart.InputAudio.Format)
+}
+
+func TestToPromptFunc_MediaToolResult_AudioMP3(t *testing.T) {
+	t.Parallel()
+
+	audio := base64.StdEncoding.EncodeToString([]byte("fake-mp3-bytes"))
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "audio-2", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "audio-2",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      audio,
+						MediaType: "audio/mpeg",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	require.Empty(t, warnings)
+	require.Len(t, messages, 3)
+	require.NotNil(t, messages[1].OfTool)
+	userMsg := messages[2].OfUser
+	require.NotNil(t, userMsg)
+	require.Len(t, userMsg.Content.OfArrayOfContentParts, 1)
+	audioPart := userMsg.Content.OfArrayOfContentParts[0].OfInputAudio
+	require.NotNil(t, audioPart)
+	require.Equal(t, audio, audioPart.InputAudio.Data)
+	require.Equal(t, "mp3", audioPart.InputAudio.Format)
+}
+
+func TestToPromptFunc_MediaToolResult_UnsupportedMediaType(t *testing.T) {
+	t.Parallel()
+
+	prompt := fantasy.Prompt{
+		{
+			Role: fantasy.MessageRoleAssistant,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolCallPart{ToolCallID: "vid-1", ToolName: "record", Input: "{}"},
+			},
+		},
+		{
+			Role: fantasy.MessageRoleTool,
+			Content: []fantasy.MessagePart{
+				fantasy.ToolResultPart{
+					ToolCallID: "vid-1",
+					Output: fantasy.ToolResultOutputContentMedia{
+						Data:      "AAAA",
+						MediaType: "video/mp4",
+					},
+				},
+			},
+		},
+	}
+
+	messages, warnings := ToPromptFunc(prompt, "", "")
+
+	// Assistant tool call + text tool message, but no synthetic user message.
+	require.Len(t, messages, 2)
+	require.NotNil(t, messages[1].OfTool)
+	require.Equal(t, "vid-1", messages[1].OfTool.ToolCallID)
+	// The placeholder must not point at a user message that is never sent.
+	require.Contains(t, messages[1].OfTool.Content.OfString.Value, "video/mp4")
+	require.NotContains(t, messages[1].OfTool.Content.OfString.Value, "following user message")
+	require.Len(t, warnings, 1)
+	require.Contains(t, warnings[0].Message, "video/mp4")
+}