agentbeats · evansandoval · Feb 16, 2026 · Feb 23, 2026 · Feb 23, 2026 · evansandoval
diff --git a/README.md b/README.md
@@ -6,9 +6,17 @@ Example code for agentifying Tau-Bench using A2A and MCP standards.
 
 ```
 src/
-├── green_agent/    # Assessment manager agent
-├── white_agent/    # Target agent being tested
-└── launcher.py     # Evaluation coordinator
+├── green_agent/       # Assessment manager agent (runs tau-bench, talks to white via A2A)
+│   ├── agent.py
+│   ├── run.sh         # Entry point for agentbeats controller
+│   └── tau_green_agent.toml
+├── white_agent/       # Target agent being tested (calls GPT-4o via LiteLLM)
+│   ├── agent.py
+│   └── run.sh         # Entry point for agentbeats controller
+├── my_util/           # A2A client helpers
+└── launcher.py        # Local-mode evaluation coordinator
+main.py                # CLI entry point
+proxy_test.py          # Automated proxy mode test script
 ```
 
 ## Installation
@@ -17,11 +25,72 @@ src/
 uv sync
 ```
 
-## Usage
+Requires an editable install of [earthshaker](../earthshaker) (provides the `agentbeats` CLI).
 
-First, configure `.env` with `OPENAI_API_KEY=...`, then
+## Environment Variables
+
+Create a `.env` file in the repo root:
+
+```
+OPENAI_API_KEY=sk-...
+```
+
+Both agents call `dotenv.load_dotenv()` on startup, which reads this file. The key is used by:
+- **White agent** — LiteLLM uses it to call GPT-4o
+- **Green agent** — tau-bench uses it internally for the simulated user
+
+## Local Mode
+
+Runs both agents on localhost from a single command (each agent in its own process). No controller or backend needed.
 
 ```bash
-# Launch complete evaluation
 uv run python main.py launch
 ```
+
+Starts green on port 9001 and white on 9002, runs one tau-bench task, and prints the result.
+
+## Proxy Mode
+
+Runs agents behind the agentbeats backend, which proxies A2A traffic between them and manages assessments.
+
+Each agent is started by a controller (`agentbeats run_ctrl`) that sets `AGENT_PORT` and `AGENT_URL` env vars before running the agent's `run.sh`. The agents read these to bind to the correct port and advertise the proxy URL in their agent card.
+
+### Prerequisites
+
+- **`.env`** with `OPENAI_API_KEY` (see above)
+- **earthshaker** installed (editable, via `uv sync`)
+- **Backend** running at a reachable URL (default: `https://backend.evansandoval.org`)
+- **`AB_API_KEY`** env var — the value of the `ab_api_key` session cookie that the backend sets after you log in via GitHub OAuth. Find it in browser dev tools under Application > Cookies > `ab_api_key`.
+
+### Running
+
+```bash
+export AB_API_KEY="your-cookie-value"
+uv run python proxy_test.py
+```
+
+This automates the full flow:
+
+1. Creates two proxied agents on the backend (green + white)
+2. Launches `agentbeats run_ctrl` for each agent in its own directory under `src/`
+3. Waits for the backend to confirm both agents are reachable
+4. Creates a tau-bench assessment
+5. Prints a URL to view results
+
+Press Ctrl+C to stop — the script cleans up controllers and deletes agents.
+
+### Options
+
+```
+--backend-url URL    Backend API URL (or set AB_BACKEND_URL)
+--repeat-n N         Number of assessment repetitions (default: 1)
+--no-browser         Don't open results in browser
+```
+
+## How It Works
+
+The **green agent** receives an assessment task containing the white agent's URL and a tau-bench config. It sets up the tau-bench environment, then drives a multi-step conversation with the white agent over A2A — sending observations, receiving tool-call responses, executing them in the environment, and repeating until done.
+
+The **white agent** is a stateful A2A server that forwards messages to GPT-4o and returns the response, maintaining per-conversation message history.
+
+In **local mode**, `launcher.py` spawns both agents as separate `multiprocessing.Process` instances on localhost. In **proxy mode**, the backend routes A2A traffic through proxy URLs, so agents only need to reach the backend — not each other directly.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,18 +1,20 @@
 [project]
-name = "agentify-example-tau-bench"
+name = "agentify-example-tau-bench-proxy"
 version = "0.1.0"
 description = "Example code of agentifying tau-bench for the blog `Agentify the Agent Assessment`."
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "a2a-sdk[http-server]>=0.3.8",
     "dotenv>=0.9.9",
-    "earthshaker>=0.1.12",
+    "earthshaker>=0.2.0",
     "pydantic-settings>=2.11.0",
+    "requests>=2.31.0",
     "tau-bench",
     "typer>=0.19.2",
     "uvicorn>=0.37.0",
 ]
 
 [tool.uv.sources]
+earthshaker = { path = "../earthshaker", editable = true }
 tau-bench = { git = "https://github.com/sierra-research/tau-bench.git" }
diff --git a/src/green_agent/run.sh b/src/green_agent/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd "$(dirname "$0")/../.."
+PYTHONPATH=. python src/green_agent/agent.py
diff --git a/src/white_agent/agent.py b/src/white_agent/agent.py
@@ -53,20 +53,12 @@ async def execute(self, context: RequestContext, event_queue: EventQueue) -> Non
                 "content": user_input,
             }
         )
-        if os.environ.get("LITELLM_PROXY_API_KEY") is not None:
-            response = completion(
-                messages=messages,
-                model="openrouter/openai/gpt-4o",
-                custom_llm_provider="litellm_proxy",
-                temperature=0.0,
-            )
-        else:
-            response = completion(
-                messages=messages,
-                model="openai/gpt-4o",
-                custom_llm_provider="openai",
-                temperature=0.0,
-            )
+        response = completion(
+            messages=messages,
+            model="openai/gpt-4o",
+            custom_llm_provider="openai",
+            temperature=0.0,
+        )
         next_message = response.choices[0].message.model_dump()  # type: ignore
         messages.append(
             {
@@ -104,3 +96,8 @@ def start_white_agent(agent_name="general_white_agent", host="localhost", port=9
     )
 
     uvicorn.run(app.build(), host=host, port=port)
+
+
+if __name__ == "__main__":
+    port = int(os.getenv("AGENT_PORT", "9002"))
+    start_white_agent(port=port)
diff --git a/src/white_agent/run.sh b/src/white_agent/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd "$(dirname "$0")/../.."
+PYTHONPATH=. python src/white_agent/agent.py