SUPERAGENT/test_image_support.py at main · sheet0/SUPERAGENT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/usr/bin/env python3
"""
Test script for image support in Python Agent

This script tests the new multimodal capabilities by:
1. Creating a test image
2. Testing image encoding/decoding
3. Testing message creation with images
4. Testing API provider with multimodal content
"""

import asyncio
import os
import sys
from pathlib import Path

# Add the python_agent directory (a sibling of this script) to the import path.
# NOTE(review): the imports below use the "python_agent." package prefix, which
# needs the *parent* of python_agent on sys.path rather than python_agent
# itself. Presumably that is supplied by the script's own directory when this
# file is run directly; confirm whether this entry is actually required (e.g.
# for absolute imports inside the package) before changing it.
sys.path.insert(0, str(Path(__file__).parent / "python_agent"))

from python_agent.utils.image_utils import (
    encode_image_to_base64,
    create_anthropic_image_block,
    is_image_file,
    validate_image_size
)
from python_agent.core.message import Message, MessageType, MessageState
from python_agent.core.config import AgentConfig, ApiConfiguration, ApiProviderType


def test_image_utils():
    """Exercise the image helper functions against the sample screenshot.

    Prints the result of each step. Returns False when the sample image is
    missing, when base64 encoding yields a falsy value, or when no Anthropic
    image block is produced; returns True otherwise. The results of
    ``is_image_file`` and ``validate_image_size`` are only reported, not
    checked.
    """
    print("=== Testing Image Utilities ===")

    # Sample screenshot expected relative to the current working directory.
    test_image_path = "temp_images/mcp_image_20250705_185030_891449.jpeg"
    if not Path(test_image_path).exists():
        print(f"❌ Test image not found: {test_image_path}")
        return False

    # Informational checks: reported but never turned into a failure.
    print(f"Is image file: {is_image_file(test_image_path)}")
    print(f"Valid image size: {validate_image_size(test_image_path)}")

    # Base64 encoding must produce a non-empty result.
    encoded = encode_image_to_base64(test_image_path)
    if not encoded:
        print("❌ Base64 encoding failed")
        return False
    print(f"✅ Base64 encoding successful (length: {len(encoded)})")

    # The Anthropic image block must be built and expose type/source fields.
    block = create_anthropic_image_block(test_image_path)
    if not block:
        print("❌ Anthropic image block creation failed")
        return False
    print("✅ Anthropic image block created successfully")
    print(f"   Block type: {block['type']}")
    source = block['source']
    print(f"   Media type: {source['media_type']}")
    print(f"   Data length: {len(source['data'])}")

    return True


def test_message_with_images():
    """Build a user ``Message`` carrying the sample image and report on it.

    Prints whether the message reports images, how many it holds, and the
    type of every block returned by ``create_multimodal_content`` (plus the
    media type for image blocks). Returns False only when the sample image
    is missing; otherwise returns True without validating the printed values.
    """
    print("\n=== Testing Message with Images ===")

    # Sample screenshot expected relative to the current working directory.
    test_image_path = "temp_images/mcp_image_20250705_185030_891449.jpeg"
    if not Path(test_image_path).exists():
        print(f"❌ Test image not found: {test_image_path}")
        return False

    # User message with a single attached image path.
    user_msg = Message(
        type=MessageType.USER,
        content="Please analyze this screenshot",
        images=[test_image_path],
    )
    print(f"Message has images: {user_msg.has_images()}")
    print(f"Number of images: {len(user_msg.images)}")

    # Dump the structure of the multimodal payload block by block.
    blocks = user_msg.create_multimodal_content()
    print(f"Multimodal content blocks: {len(blocks)}")
    for index, entry in enumerate(blocks):
        kind = entry['type']
        print(f"  Block {index}: {kind}")
        if kind == 'image':
            print(f"    Media type: {entry['source']['media_type']}")

    return True


def test_message_state_with_images():
    """Start a ``MessageState`` turn with the sample image and inspect history.

    Prints the number of images on the new turn, the number of history
    entries, and, when history is non-empty, the role and content type of
    the first entry (with per-block types if the content is a list).
    Returns False only when the sample image is missing; otherwise True.
    """
    print("\n=== Testing MessageState with Images ===")

    # Sample screenshot expected relative to the current working directory.
    test_image_path = "temp_images/mcp_image_20250705_185030_891449.jpeg"
    if not Path(test_image_path).exists():
        print(f"❌ Test image not found: {test_image_path}")
        return False

    state = MessageState()

    # Open a new turn whose user message carries the image.
    new_turn = state.start_new_turn(
        "Analyze this screenshot and tell me what you see",
        images=[test_image_path],
    )
    print(f"Turn created with {len(new_turn.user_message.images)} images")

    # Conversation history in the shape that would be sent to the API.
    entries = state.get_conversation_history()
    print(f"History entries: {len(entries)}")
    if not entries:
        return True

    first_entry = entries[0]
    print(f"User message role: {first_entry['role']}")
    content = first_entry['content']
    print(f"Content type: {type(content)}")

    # List content means the message was expanded into separate blocks.
    if isinstance(content, list):
        print(f"Content blocks: {len(content)}")
        for index, entry in enumerate(content):
            print(f"  Block {index}: {entry['type']}")

    return True


async def test_agent_with_image():
    """Test that an Agent formats an image turn as multimodal content.

    The test is skipped (and counted as passing) when ``ANTHROPIC_API_KEY``
    is not set. Otherwise an ``Agent`` is created and started, a turn with
    the sample image is added to its message state, and the first history
    entry is checked to carry list (block-structured) content. This function
    itself only builds the turn and inspects the history; it does not send
    the turn to the model.

    Returns:
        True when the test is skipped or the history is multimodal.
        False when the sample image is missing, the history content is not
        a list, or any exception is raised while driving the agent.
    """
    print("\n=== Testing Agent with Image ===")

    # Check if we have an API key
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("⚠️  No ANTHROPIC_API_KEY found, skipping API test")
        return True

    # Imported lazily so the skip path above never has to import the agent.
    from python_agent.agent import Agent

    # Create agent configuration
    config = AgentConfig(
        api_config=ApiConfiguration(
            provider=ApiProviderType.ANTHROPIC,
            api_key=api_key,
            model_id="claude-3-haiku-20240307"  # Use a smaller model for testing
        )
    )

    agent = Agent(config)

    try:
        await agent.start()
        print("✅ Agent started successfully")

        # Test with image - we'll simulate what would happen with a screenshot
        test_image_path = "temp_images/mcp_image_20250705_185030_891449.jpeg"

        # A missing fixture is a failure, matching the other tests in this
        # file, rather than a silent pass that checked nothing.
        if not Path(test_image_path).exists():
            print(f"❌ Test image not found: {test_image_path}")
            return False

        # Add image to the current turn manually for testing
        turn = agent.message_state.start_new_turn(
            "What do you see in this screenshot?",
            images=[test_image_path]
        )

        print(f"Created turn with {len(turn.user_message.images)} images")

        # Test conversation history generation
        history = agent.message_state.get_conversation_history()
        if not (history and isinstance(history[0]['content'], list)):
            # Report the failure to the caller instead of only printing it.
            print("❌ Multimodal content not formatted correctly")
            return False
        print("✅ Multimodal content correctly formatted for API")

    except Exception as e:
        # Top-level boundary of this test: report any error as a failure.
        print(f"❌ Agent test failed: {e}")
        return False
    finally:
        # Runs on every path above, including the early failure returns.
        await agent.stop()
        print("Agent stopped")

    return True


async def main():
    """Run every image-support test in order and print an overall verdict.

    All four tests always run; an earlier failure does not stop later ones.
    Returns True only if every test returned a truthy result.
    """
    print("🧪 Testing Image Support in Python Agent\n")

    # List elements are evaluated left to right, so the tests execute in this
    # order: utilities, message, message state, then the agent test (which
    # skips itself when no API key is available).
    outcomes = [
        test_image_utils(),
        test_message_with_images(),
        test_message_state_with_images(),
        await test_agent_with_image(),
    ]
    success = all(outcomes)

    print(f"\n{'='*50}")
    if not success:
        print("❌ Some tests failed. Check the output above.")
    else:
        print("🎉 All tests passed! Image support is working.")

    return success


if __name__ == "__main__":
    # Propagate the overall result as the process exit status (0 = all tests
    # passed, 1 = at least one failed) so shells and CI can detect failures.
    sys.exit(0 if asyncio.run(main()) else 1)