Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions examples/12_resale_advisor_example/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Project metadata for the resale advisor example.
[project]
name = "resale-advisor-example"
version = "0.0.0"
requires-python = ">=3.10"

# mlx-vlm requires Apple Silicon; the [mlx-vlm,transformers] extras pull in
# the backends the huggingface plugin can run the VLM on.
dependencies = [
    "python-dotenv>=1.0",
    "vision-agents-plugins-huggingface[mlx-vlm,transformers]",
    "vision-agents-plugins-getstream",
    "vision-agents-plugins-deepgram",
    "vision-agents",
    "mlx-vlm",
    "torchvision",
]

# Resolve the workspace packages from local paths (editable installs) so the
# example runs against the in-repo sources rather than PyPI releases.
[tool.uv.sources]
"vision-agents-plugins-huggingface" = {path = "../../plugins/huggingface", editable=true}
"vision-agents-plugins-getstream" = {path = "../../plugins/getstream", editable=true}
"vision-agents-plugins-deepgram" = {path = "../../plugins/deepgram", editable=true}
"vision-agents" = {path = "../../agents-core", editable=true}
75 changes: 75 additions & 0 deletions examples/12_resale_advisor_example/resale_advisor_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""
Vision AI with Gemma 4 - Local VLM Agent (MLX)

A real-time vision + voice assistant powered by Gemma 4 E4B running on Apple
Silicon via MLX. Demonstrates how to build a multimodal AI agent that can see
the user's video feed and respond with voice:

- Gemma 4 E4B (8-bit quantized) via mlx-vlm for vision-language inference
- Deepgram for speech-to-text and text-to-speech
- GetStream for real-time communication

The user speaks naturally and the agent responds with voice, describing what
it sees and answering questions about the video feed.

Requirements:
- STREAM_API_KEY and STREAM_API_SECRET environment variables
- DEEPGRAM_API_KEY environment variable
- Apple Silicon Mac with 16GB+ unified memory

First run will download the MLX model (~8GB).
"""

import asyncio
import logging

from dotenv import load_dotenv
from vision_agents.core import Agent, Runner, User
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, huggingface

logger = logging.getLogger(__name__)

load_dotenv()

# Instructions given to the VLM. Responses are spoken aloud via TTS, so the
# prompt forbids lists, formatting, and emojis and caps the length — plain
# short sentences read far better through a speech synthesizer.
SYSTEM_PROMPT = (
    "You are a vision assistant running on a local Gemma 4 model. "
    "You can see the user's camera feed. Describe what you see concisely. "
    "Speak naturally, as if having a conversation. No lists or formatting. "
    "Never use emojis or special characters. Keep responses under 50 words."
)


async def create_agent(**kwargs) -> Agent:
    """Assemble the vision assistant agent.

    Wires together the GetStream edge transport, the MLX-hosted Gemma 4
    E4B vision-language model, and Deepgram speech-to-text / text-to-speech.

    Returns:
        A configured ``Agent`` ready to join a call.
    """
    # Local VLM via mlx-vlm; the model is fetched from the HF hub on first use.
    local_vlm = huggingface.MlxVLM(
        model="mlx-community/gemma-4-e4b-it-8bit",
        max_new_tokens=150,
    )

    return Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Vision Assistant", id="agent"),
        instructions=SYSTEM_PROMPT,
        llm=local_vlm,
        tts=deepgram.TTS(),
        stt=deepgram.STT(),
    )


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    """Join the given call, greet the user, and run until the agent finishes.

    Args:
        agent: The agent produced by ``create_agent``.
        call_type: GetStream call type identifier.
        call_id: GetStream call id to join.
    """
    greeting_instruction = (
        "Greet the user briefly. Tell them you can see their camera and can describe what you see."
    )

    call = await agent.create_call(call_type, call_id)

    logger.info("Starting Vision Assistant...")

    async with agent.join(call):
        # Give the media pipeline a moment to come up before speaking.
        await asyncio.sleep(2)
        await agent.llm.simple_response(text=greeting_instruction)
        await agent.finish()


if __name__ == "__main__":
    # Hand the factory and join hooks to the launcher, then start the CLI.
    launcher = AgentLauncher(create_agent=create_agent, join_call=join_call)
    Runner(launcher).cli()
14 changes: 12 additions & 2 deletions plugins/huggingface/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ requires-python = ">=3.10"
license = "MIT"
dependencies = [
"vision-agents",
"huggingface_hub<1.0",
"huggingface_hub>=0.20.0,<2",
]

[project.optional-dependencies]
transformers = [
"transformers>=4.45.0,<5",
"transformers>=5.3.0,<6",
"torch>=2.0.0,<3",
"accelerate>=0.25.0,<2",
"supervision>=0.21.0,<1",
Expand All @@ -27,6 +27,16 @@ transformers-quantized = [
"vision-agents-plugins-huggingface[transformers]",
"bitsandbytes>=0.41.0",
]
mlx = [
"mlx>=0.22.0",
"mlx-lm>=0.22.0",
]
mlx-vlm = [
"mlx>=0.22.0",
"mlx-vlm>=0.4.0",
"av",
"aiortc",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Expand Down
6 changes: 3 additions & 3 deletions plugins/huggingface/tests/test_transformers_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ async def test_processor_fallback(self, vlm):
}

messages = [{"role": "user", "content": "describe this"}]
result = vlm._build_processor_inputs(messages, [])
result = vlm._build_processor_inputs(processor, messages, [], None)
assert "input_ids" in result

call_kwargs = processor.call_args.kwargs
Expand All @@ -174,7 +174,7 @@ async def test_build_processor_inputs_passes_tools(self, vlm):
}
]
messages = [{"role": "user", "content": "hi"}]
vlm._build_processor_inputs(messages, [], tools)
vlm._build_processor_inputs(vlm._resources.processor, messages, [], tools)

call_kwargs = vlm._resources.processor.apply_chat_template.call_args.kwargs
assert call_kwargs["tools"] is tools
Expand Down Expand Up @@ -205,7 +205,7 @@ def _side_effect(*args, **kwargs):
}
]
result = vlm._build_processor_inputs(
[{"role": "user", "content": "hi"}], [], tools
vlm._resources.processor, [{"role": "user", "content": "hi"}], [], tools
)
assert "input_ids" in result
assert call_count == 2
Expand Down
32 changes: 32 additions & 0 deletions plugins/huggingface/vision_agents/plugins/huggingface/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,35 @@
)
else:
raise

# MlxLLM is optional: it depends on the MLX stack, which is only available on
# Apple Silicon. Expose it when importable, warn (instead of failing package
# import) when the known optional dependencies are absent.
try:
    from .mlx_llm import MlxLLM

    __all__ += ["MlxLLM"]
except ImportError as e:
    import warnings

    # Only swallow the error when the missing top-level module is one of the
    # known optional deps; any other ImportError is a real bug, so re-raise.
    # NOTE(review): a failed submodule import (e.g. e.name == "mlx.core")
    # would not match this set and would re-raise — confirm that is intended.
    if e.name in {"mlx", "mlx_lm"}:
        warnings.warn(
            f"Optional dependency '{e.name}' is not installed. "
            "Install the [mlx] extra to enable MLX plugins.",
            stacklevel=2,
        )
    else:
        raise

# MlxVLM is optional: besides the MLX stack it needs av/aiortc for media
# handling. Mirror the MlxLLM guard: export when importable, warn when one of
# the known optional dependencies is missing, re-raise anything else.
try:
    from .mlx_vlm import MlxVLM

    __all__ += ["MlxVLM"]
except ImportError as e:
    import warnings

    # e.name is the top-level module that failed to import; only the declared
    # [mlx-vlm] extra's packages are treated as "optional and absent".
    if e.name in {"mlx", "mlx_vlm", "av", "aiortc"}:
        warnings.warn(
            f"Optional dependency '{e.name}' is not installed. "
            "Install the [mlx-vlm] extra to enable MLX VLM plugins.",
            stacklevel=2,
        )
    else:
        raise
Loading
Loading