# feat: computer use plugin #411
---

**`.gitignore`** (new file):

```
.env
```
---

**`examples/10_computer_use_example/README.md`** (new file):

# Computer Use Example

An AI desktop assistant that can see your screen and control your computer. Share your screen in a video call and ask the agent to perform actions like opening folders, clicking buttons, typing text, or using keyboard shortcuts.

## How it works
1. You join a video call and share your screen
2. The agent receives your screen-share frames via Gemini Realtime
3. You ask the agent to do something (e.g. "open my Downloads folder")
4. The agent sees your screen, identifies what to interact with, and calls action tools
5. PyAutoGUI executes the actions on the host machine

## Prerequisites
- Python 3.10+
- A display environment (the agent controls the machine it runs on)
- API keys for:
  - [Google AI (Gemini)](https://ai.google.dev/) — for the Realtime LLM
  - [Stream](https://getstream.io/) — for video infrastructure

## Setup
1. Navigate to this example:
   ```bash
   cd examples/10_computer_use_example
   ```

2. Install dependencies:
   ```bash
   uv sync
   ```

3. Set up your `.env`:
   ```bash
   GOOGLE_API_KEY=your_google_key
   STREAM_API_KEY=your_stream_key
   STREAM_API_SECRET=your_stream_secret
   ```

## Run
```bash
uv run computer_use_example.py run
```

The agent will create a call and open a demo UI. Share your screen in the call, then ask the agent to perform actions.

## Available actions
| Tool | What it does |
|------|-------------|
| `click(cell, position, button)` | Click at a grid cell |
| `double_click(cell, position)` | Double-click at a grid cell |
| `type_text(text)` | Type into the focused element |
| `key_press(keys)` | Press a key combo, e.g. `"cmd+c"` |
| `scroll(cell, position, clicks, direction)` | Scroll at a grid cell |
| `mouse_move(cell, position)` | Move the cursor to a grid cell |
| `open_path(path)` | Open a file or folder with the OS default handler |
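The grid-cell addressing used by the coordinate tools above can be sketched as a simple mapping from a cell label plus a position hint to pixel coordinates. This is a minimal illustration with hypothetical names and offsets, not the plugin's actual implementation:

```python
# Hypothetical sketch: map a grid cell label like "C2" plus a position
# hint to pixel coordinates on a screen of the given size. The plugin's
# real mapping may differ; the in-cell offsets here are illustrative.

def cell_to_pixels(cell, cols, rows, width, height, position="center"):
    """Return (x, y) pixel coordinates for a labeled grid cell."""
    col = ord(cell[0].upper()) - ord("A")   # column letter -> 0-based index
    row = int(cell[1:]) - 1                 # row number -> 0-based index
    cell_w, cell_h = width / cols, height / rows
    # Fractional offsets inside the cell for each position keyword
    offsets = {
        "center": (0.5, 0.5), "top-left": (0.25, 0.25), "top": (0.5, 0.25),
        "top-right": (0.75, 0.25), "left": (0.25, 0.5), "right": (0.75, 0.5),
        "bottom-left": (0.25, 0.75), "bottom": (0.5, 0.75),
        "bottom-right": (0.75, 0.75),
    }
    fx, fy = offsets[position]
    return int((col + fx) * cell_w), int((row + fy) * cell_h)

# A 15x15 grid on a 1500x900 screen: cell C2, targeting its top-right area
print(cell_to_pixels("C2", 15, 15, 1500, 900, "top-right"))  # → (275, 75)
```

A real tool would then pass the resulting coordinates to PyAutoGUI (e.g. `pyautogui.click(x, y)`).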
## Important notes

- The agent controls the machine it runs on, not the caller's machine. For remote control, run the agent on the target machine.
- PyAutoGUI requires accessibility permissions on macOS (System Settings > Privacy & Security > Accessibility).
- Consider running in a sandboxed environment (VM or container) for safety.
---

**`examples/10_computer_use_example/computer_use_example.py`** (new file):

```python
"""
Computer use example — the agent sees your screen share and can control your desktop.

Uses:
- Gemini Realtime for live screen-share vision + tool calling
- Stream's edge network for video transport
- Computer-use plugin for desktop actions (click, type, scroll, etc.)
- Grid overlay processor so the LLM can reference labeled cells

Share your screen in the call, then ask the agent to perform actions
like "open my Downloads folder" or "click on the Safari icon".
"""

import logging

from dotenv import load_dotenv
from vision_agents.core import Agent, AgentLauncher, Runner, User
from vision_agents.plugins import computer_use, gemini, getstream

logger = logging.getLogger(__name__)

load_dotenv()

grid = computer_use.Grid(cols=15, rows=15)


def setup_llm() -> gemini.Realtime:
    llm = gemini.Realtime(fps=2)
    computer_use.register(llm, grid=grid)
    return llm


async def create_agent(**kwargs) -> Agent:
    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Desktop Assistant", id="desktop-agent"),
        instructions="Read @examples/10_computer_use_example/instructions.md",
        llm=setup_llm(),
        processors=[computer_use.GridOverlayProcessor(grid=grid, fps=2)],
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    call = await agent.create_call(call_type, call_id)

    async with agent.join(call):
        await agent.llm.simple_response(
            text="Say hi and let the user know they can share their screen and ask you to perform actions on their computer."
        )
        await agent.finish()


if __name__ == "__main__":
    Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
```
---

**`examples/10_computer_use_example/instructions.md`** (new file):

You are a **Desktop Assistant** that controls the user's computer by calling your tools.

## Critical Rule

When the user asks you to do something on screen, you MUST call the appropriate tool function (click, double_click, mouse_move, type_text, key_press, scroll, open_path). Never just describe what you would do — actually call the tool. If the user says "click on X", call the `click` tool. If they say "move cursor to X", call `mouse_move`.

## Grid system

The screen has a **grid overlay** with columns **A-O** (left to right) and rows **1-15** (top to bottom). Each cell is labeled in its top-left corner (e.g. A1, C5, O15). When you want to interact with a UI element, identify which grid cell it falls in and pass that as the `cell` parameter (e.g. `cell="C2"`).

For finer accuracy, use the `position` parameter to target a specific part of the cell: top-left, top, top-right, left, center (default), right, bottom-left, bottom, or bottom-right. For example, if a button is in the top-right area of cell C2, use `cell="C2", position="top-right"`.

## Rules

1. **Always use tools.** When asked to perform an action, call the tool immediately. Say briefly what you'll do, then call the tool.
2. **Use cell references.** Look at the grid labels on screen and pass the `cell` parameter (e.g. "C2") for coordinate-based tools.
3. **Prefer open_path for files and folders.** If the user asks to open something by name or path, use `open_path` instead of trying to find and double-click an icon.
> **coderabbitai[bot]:** This rule currently tells the model to use `open_path` even when the user only gives a name. Suggested wording:
>
> ```diff
> -3. **Prefer open_path for files and folders.** If the user asks to open something by name or path, use `open_path` instead of trying to find and double-click an icon.
> +3. **Prefer open_path for files and folders.** If the user provides an absolute file or folder path, use `open_path`. If they only provide a name, ask for the path or locate it through the UI instead of guessing.
> ```
4. **Use keyboard shortcuts.** When possible, prefer `key_press` over clicking through menus (e.g. `cmd+c` to copy, `cmd+tab` to switch apps, `cmd+space` to open Spotlight).
> **coderabbitai[bot]:** Avoid macOS-only shortcut examples in a generic prompt. Suggested wording:
>
> ```diff
> -4. **Use keyboard shortcuts.** When possible, prefer `key_press` over clicking through menus (e.g. `cmd+c` to copy, `cmd+tab` to switch apps, `cmd+space` to open Spotlight).
> +4. **Use keyboard shortcuts.** When possible, prefer `key_press` over clicking through menus, using shortcuts appropriate for the current OS (e.g. `cmd+c` on macOS or `ctrl+c` on Windows/Linux).
> ```
5. **One action at a time.** Perform a single action, then observe the result before deciding on the next step.
6. **Ask when unsure.** If you can't clearly identify a UI element or aren't confident about which cell it's in, ask the user for guidance.
7. **Keep responses short.** The user is watching you in real time — don't narrate at length.
---

**`examples/10_computer_use_example/pyproject.toml`** (new file):

```toml
[project]
name = "computer-use-example"
version = "0.1.0"
description = "AI desktop assistant that can see and control your screen"
requires-python = ">=3.10"

dependencies = [
    "python-dotenv>=1.0",
    "vision-agents",
    "vision-agents-plugins-gemini",
    "vision-agents-plugins-getstream",
    "vision-agents-plugins-computer-use",
]

[tool.uv.sources]
"vision-agents" = { path = "../../agents-core", editable = true }
"vision-agents-plugins-gemini" = { path = "../../plugins/gemini", editable = true }
"vision-agents-plugins-getstream" = { path = "../../plugins/getstream", editable = true }
"vision-agents-plugins-computer-use" = { path = "../../plugins/computer_use", editable = true }
```
---

**`plugins/computer_use/README.md`** (new file):

# Computer Use Plugin

Model-agnostic desktop control tools for Vision Agents. Lets any LLM with vision (via screen share) interact with the user's desktop — clicking, typing, scrolling, and opening files.

## Install

```bash
pip install vision-agents-plugins-computer-use
```

## Usage

Register the tools on any LLM, then use with an agent that receives screen-share frames:
```python
from vision_agents.core import Agent
from vision_agents.plugins import gemini, computer_use

llm = gemini.Realtime(fps=2)
computer_use.register(llm)

agent = Agent(
    llm=llm,
    processors=[computer_use.GridOverlayProcessor(fps=2)],
)
```
The `GridOverlayProcessor` draws a labeled grid on screen frames so the model can reference cells by name. Grid size is customizable — share a `Grid` instance to keep tools and overlay in sync:

```python
grid = computer_use.Grid(cols=20, rows=20)
computer_use.register(llm, grid=grid)
computer_use.GridOverlayProcessor(grid=grid, fps=2)
```
With screen sharing active, the model sees the grid and can call:

| Tool | Description |
|------|-------------|
| `click(cell, position, button)` | Click at a grid cell |
| `double_click(cell, position)` | Double-click at a grid cell |
| `type_text(text)` | Type text into the focused element |
| `key_press(keys)` | Press a key combo, e.g. `"cmd+c"` |
| `scroll(cell, position, clicks, direction)` | Scroll at a grid cell |
| `mouse_move(cell, position)` | Move cursor to a grid cell |
| `open_path(path)` | Open a file/folder with the OS default handler |
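The `key_press` combo strings above can be thought of as alias-expanded key lists. A minimal sketch (the helper name and alias table are assumptions, not the plugin's actual code), assuming PyAutoGUI's `hotkey()` key naming:

```python
# Hypothetical sketch: split a combo string like "cmd+c" into the key
# names PyAutoGUI's hotkey() expects (PyAutoGUI spells it "command").
ALIASES = {"cmd": "command", "opt": "option", "return": "enter"}

def parse_combo(keys: str) -> list[str]:
    """Normalize a '+'-separated combo into lowercase key names."""
    return [ALIASES.get(k.strip().lower(), k.strip().lower())
            for k in keys.split("+")]

print(parse_combo("Cmd+Shift+4"))  # → ['command', 'shift', '4']
# A real implementation would then call: pyautogui.hotkey(*parse_combo(keys))
```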
## How it works

The SDK's screen-share pipeline (`TrackType.SCREEN_SHARE`) feeds frames to the VLM/Realtime model continuously. The `GridOverlayProcessor` annotates these frames with a labeled grid (e.g. A-O / 1-15). The model reads the grid labels, picks the right cell, and calls action tools backed by [PyAutoGUI](https://pyautogui.readthedocs.io/).
## Platform support

Actions use PyAutoGUI (macOS, Linux, Windows). `open_path` uses `open` (macOS), `xdg-open` (Linux), or `explorer` (Windows).
---

**`plugins/computer_use/pyproject.toml`** (new file):

```toml
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-computer-use"
dynamic = ["version"]
description = "Model-agnostic computer use (desktop control) plugin for Vision Agents"
readme = "README.md"
keywords = ["computer use", "desktop automation", "AI", "agents"]
requires-python = ">=3.10"
license = "MIT"
dependencies = [
    "vision-agents",
    "pyautogui",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = ["."]

[tool.hatch.build.targets.sdist]
include = ["/vision_agents"]

[tool.uv.sources]
vision-agents = { workspace = true }

[dependency-groups]
dev = [
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
]
```