diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000..0fb5745 --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1 @@ +"""Example entrypoints and recipes for Uni-Agent.""" diff --git a/examples/agent_train/__init__.py b/examples/agent_train/__init__.py new file mode 100644 index 0000000..f539660 --- /dev/null +++ b/examples/agent_train/__init__.py @@ -0,0 +1 @@ +"""Training examples for Uni-Agent.""" diff --git a/examples/agent_train/deepeyes_gateway/README.md b/examples/agent_train/deepeyes_gateway/README.md new file mode 100644 index 0000000..13bc8d8 --- /dev/null +++ b/examples/agent_train/deepeyes_gateway/README.md @@ -0,0 +1,73 @@ +# DeepEyes Gateway Training Example + +This example wires the DeepEyes multimodal tool-use recipe into the Uni-Agent +gateway framework path on `verl.trainer.main_ppo_sync`. + +## Layout + +- `examples.agent_train.deepeyes_gateway.agent_runner`: gateway-backed DeepEyes + tool loop. +- `examples.agent_train.deepeyes_gateway.dataset`: dataset adapter that emits + `raw_prompt`, `tools_kwargs`, and reward fields without local prompt + tokenization. +- `examples.agent_train.deepeyes_gateway.reward`: self-contained `compute_score` + wrapper for the DeepEyes LLM-as-a-judge reward. +- `configs/deepeyes_gateway_grpo.yaml`: recipe config using + `uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter`. +- `configs/image_zoom_in_tool_config.yaml`: image zoom-in tool config. +- `run_deepeyes_gateway_grpo.sh`: example full-data launch script. + +## Prerequisites + +- Run from the Uni-Agent repository with the `verl` trainer dependencies + available. +- Launch an OpenAI-compatible judge service and set `LLM_AS_A_JUDGE_BASE`. +- Prepare a DeepEyes parquet dataset with image payloads. +- Reserve training GPUs separately from the judge GPU. 
+ +Example judge service: + +```bash +CUDA_VISIBLE_DEVICES=7 \ +python3 -m vllm.entrypoints.openai.api_server \ + --model /path/to/judge-model \ + --host 127.0.0.1 \ + --port 18901 \ + --served-model-name qwen3-4b-judge \ + --dtype float16 \ + --trust-remote-code \ + --max-model-len 4096 \ + --gpu-memory-utilization 0.75 \ + --enforce-eager +``` + +## Launch + +```bash +bash examples/agent_train/deepeyes_gateway/run_deepeyes_gateway_grpo.sh +``` + +Common overrides: + +```bash +MODEL_PATH=/path/to/policy-model \ +TRAIN_FILE=/path/to/train.parquet \ +VAL_FILE=/path/to/val.parquet \ +LLM_AS_A_JUDGE_BASE=http://127.0.0.1:18901/v1 \ +PROJECT_NAME=my_project \ +EXPERIMENT_NAME=my_run \ +TOTAL_TRAINING_STEPS=20 \ +bash examples/agent_train/deepeyes_gateway/run_deepeyes_gateway_grpo.sh +``` + +The script resolves the config directory relative to its own location, then +launches from the repository root so `examples.*` recipe imports are stable. + +## Notes + +- No parquet data files are included in this example. +- The image tool implementation is still loaded from `verl.tools` by the tool + config; the gateway framework adapter uses `uni_agent.*`, while the recipe + imports live with this example under `examples.*`. +- Reward scoring returns `0.0` if the judge service or reward dependencies are + unavailable. 
diff --git a/examples/agent_train/deepeyes_gateway/__init__.py b/examples/agent_train/deepeyes_gateway/__init__.py
new file mode 100644
index 0000000..297372c
--- /dev/null
+++ b/examples/agent_train/deepeyes_gateway/__init__.py
@@ -0,0 +1 @@
+"""DeepEyes gateway recipe."""
diff --git a/examples/agent_train/deepeyes_gateway/agent_runner.py b/examples/agent_train/deepeyes_gateway/agent_runner.py
new file mode 100644
index 0000000..454fc0f
--- /dev/null
+++ b/examples/agent_train/deepeyes_gateway/agent_runner.py
@@ -0,0 +1,273 @@
+from __future__ import annotations
+
+import base64
+import json
+from io import BytesIO
+from typing import TYPE_CHECKING, Any
+
+import httpx
+from PIL import Image
+
+if TYPE_CHECKING:
+    from uni_agent.trainer.framework.types import SessionHandle
+    from verl.tools.schemas import ToolResponse
+else:
+    SessionHandle = Any
+    ToolResponse = Any
+
+
+IMAGE_ZOOM_IN_TOOL_NAME = "image_zoom_in_tool"
+GATEWAY_REQUEST_TIMEOUT_SECONDS = 300.0
+
+# Magic-byte prefixes used to label raw image bytes with the right MIME type.
+_IMAGE_MAGIC_MIME_TYPES = (
+    (b"\x89PNG\r\n\x1a\n", "image/png"),
+    (b"\xff\xd8\xff", "image/jpeg"),
+    (b"GIF87a", "image/gif"),
+    (b"GIF89a", "image/gif"),
+)
+
+
+def _image_mime_type(data: bytes) -> str:
+    """Return the image MIME type implied by the leading bytes of ``data``.
+
+    Unrecognized payloads are labelled ``image/png``.
+    """
+    for magic, mime_type in _IMAGE_MAGIC_MIME_TYPES:
+        if data.startswith(magic):
+            return mime_type
+    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
+        return "image/webp"
+    return "image/png"
+
+
+def _json_ready(value: Any) -> Any:
+    """Recursively convert ``value`` into a JSON-serializable structure.
+
+    ``PIL`` images are re-encoded as RGB PNG data URIs and raw ``bytes`` are
+    wrapped in a data URI labelled from their magic bytes. A dict that has a
+    ``"bytes"`` key collapses to the converted ``"bytes"`` value (its other
+    keys are dropped); other dicts, lists, and tuples are converted
+    element-wise, with tuples becoming lists. Anything else is returned as is.
+    """
+    if isinstance(value, Image.Image):
+        buffer = BytesIO()
+        value.convert("RGB").save(buffer, format="PNG")
+        encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
+        return f"data:image/png;base64,{encoded}"
+    if isinstance(value, bytes):
+        encoded = base64.b64encode(value).decode("ascii")
+        return f"data:{_image_mime_type(value)};base64,{encoded}"
+    if isinstance(value, dict):
+        if "bytes" in value:
+            return _json_ready(value["bytes"])
+        return {key: _json_ready(item) for key, item in value.items()}
+    if isinstance(value, (list, tuple)):
+        return [_json_ready(item) for item in value]
+    return value
+
+
+def _tool_kwargs_for_name(tools_kwargs: dict | None) -> dict[str, Any]:
+    """Return the zoom-in tool's entry in ``tools_kwargs``.
+
+    Returns ``{}`` when ``tools_kwargs`` or the entry is not a dict.
+    """
+    if not isinstance(tools_kwargs, dict):
+        return {}
+
+    maybe_tool_kwargs = tools_kwargs.get(IMAGE_ZOOM_IN_TOOL_NAME)
+    return maybe_tool_kwargs if isinstance(maybe_tool_kwargs, dict) else {}
+
+
+def _parse_tool_arguments(arguments: object) -> dict[str, Any]:
+    """Coerce a tool call's ``arguments`` field into a dict.
+
+    Dicts pass through and JSON strings are decoded. Anything that is not, or
+    does not decode to, a dict yields ``{}`` rather than raising.
+    """
+    if isinstance(arguments, dict):
+        return arguments
+    if not isinstance(arguments, str) or not arguments:
+        return {}
+    try:
+        parsed = json.loads(arguments)
+    except json.JSONDecodeError:
+        return {}
+    return parsed if isinstance(parsed, dict) else {}
+
+
+def _assistant_message_from_response(payload: dict[str, Any]) -> dict[str, Any]:
+    """Return the first choice's ``message`` from a chat completion payload.
+
+    Raises:
+        ValueError: If ``choices`` is missing or empty, or the first choice
+            has no dict ``message``.
+    """
+    choices = payload.get("choices")
+    if not choices:
+        raise ValueError("chat completion response did not include choices")
+
+    message = choices[0].get("message")
+    if not isinstance(message, dict):
+        raise ValueError("chat completion response choice did not include a message")
+    return message
+
+
+def _tool_response_to_openai_tool_message(*, tool_call_id: str, tool_response: ToolResponse) -> dict[str, Any]:
+    """Build a ``role="tool"`` chat message from ``tool_response``.
+
+    Text becomes a ``text`` part and every image an ``image`` part encoded by
+    ``_json_ready``; a single empty ``text`` part is used when there is neither.
+
+    Raises:
+        NotImplementedError: If ``tool_response`` carries video content.
+    """
+    content: list[dict[str, Any]] = []
+
+    if tool_response.video:
+        raise NotImplementedError("ToolResponse video content is not supported by the DeepEyes gateway recipe")
+
+    if tool_response.text is not None:
+        content.append({"type": "text", "text": str(tool_response.text)})
+    for image in tool_response.image or []:
+        content.append({"type": "image", "image": _json_ready(image)})
+    if not content:
+        content.append({"type": "text", "text": ""})
+
+    return {
+        "role": "tool",
+        "tool_call_id": tool_call_id,
+        "content": content,
+    }
+
+
+def _unknown_tool_message(*, tool_call_id: str, tool_name: object) -> dict[str, Any]:
+    """Build a text-only tool message that rejects a call to an unknown tool."""
+    return {
+        "role": "tool",
+        "tool_call_id": tool_call_id,
+        "content": [
+            {
+                "type": "text",
+                "text": f"Error: unknown tool {tool_name!r}; only {IMAGE_ZOOM_IN_TOOL_NAME!r} is available.",
+            }
+        ],
+    }
+
+
+def _select_tool(tool_config: list[Any] | None):
+    """Return the entry of ``tool_config`` named ``IMAGE_ZOOM_IN_TOOL_NAME``.
+
+    Raises:
+        ValueError: If ``tool_config`` is empty or does not contain that tool.
+    """
+    if not tool_config:
+        raise ValueError("tool_config is required for deepeyes_agent_runner")
+
+    for tool in tool_config:
+        if getattr(tool, "name", None) == IMAGE_ZOOM_IN_TOOL_NAME:
+            return tool
+    raise ValueError(f"tool_config must include {IMAGE_ZOOM_IN_TOOL_NAME}")
+
+
+async def deepeyes_agent_runner(
+    *,
+    raw_prompt: list[dict],
+    session: SessionHandle,
+    sample_index: int,
+    tools_kwargs: dict | None = None,
+    tool_config: list[Any] | None = None,
+    max_turns: int = 5,
+    **kwargs,
+) -> None:
+    """Run a DeepEyes multi-turn image zoom-in tool loop against the gateway.
+
+    Each turn posts the running conversation to
+    ``{session.base_url}/chat/completions``. The loop stops once the assistant
+    replies without tool calls or ``max_turns`` completions have been made;
+    otherwise every tool call is executed and its result appended as a tool
+    message. Nothing is returned: the conversation is only sent to the gateway.
+
+    Args:
+        raw_prompt: Initial chat messages; images are encoded by ``_json_ready``.
+        session: Gateway session providing ``base_url`` and ``session_id``.
+        sample_index: Accepted for interface compatibility; unused.
+        tools_kwargs: Optional tool kwargs keyed by tool name.
+        tool_config: Tool objects; must include the image zoom-in tool.
+        max_turns: Maximum number of chat completion requests.
+        **kwargs: Ignored.
+
+    Raises:
+        ValueError: If ``session.base_url`` is ``None`` or the zoom-in tool is
+            missing from ``tool_config``.
+        httpx.HTTPStatusError: If the gateway answers with an error status.
+    """
+    del sample_index, kwargs
+    if session.base_url is None:
+        raise ValueError("session.base_url is required for deepeyes_agent_runner")
+
+    image_tool = _select_tool(tool_config)
+    image_tool_kwargs = _tool_kwargs_for_name(tools_kwargs)
+    create_kwargs = dict(image_tool_kwargs.get("create_kwargs") or {})
+    # Accept a top-level "image" entry as shorthand for create_kwargs["image"].
+    if "image" not in create_kwargs and "image" in image_tool_kwargs:
+        create_kwargs["image"] = image_tool_kwargs["image"]
+    execute_kwargs = dict(image_tool_kwargs.get("execute_kwargs") or {})
+    release_kwargs = dict(image_tool_kwargs.get("release_kwargs") or {})
+
+    tool_instance_id: str | None = None
+    messages = _json_ready(list(raw_prompt))
+
+    try:
+        tool_instance_id, _ = await image_tool.create(
+            instance_id=f"{session.session_id}-image_zoom_in_tool",
+            create_kwargs=create_kwargs,
+        )
+        tool_schema = image_tool.get_openai_tool_schema().model_dump(exclude_none=True)
+
+        async with httpx.AsyncClient(timeout=GATEWAY_REQUEST_TIMEOUT_SECONDS) as client:
+            for turn_index in range(max(0, max_turns)):
+                response = await client.post(
+                    f"{session.base_url}/chat/completions",
+                    json={
+                        "model": "deepeyes",
+                        "messages": messages,
+                        "tools": [tool_schema],
+                    },
+                )
+                response.raise_for_status()
+
+                assistant_message = _assistant_message_from_response(response.json())
+                messages.append(dict(assistant_message))
+
+                tool_calls = assistant_message.get("tool_calls") or []
+                # Tool calls made on the final turn are not executed: no
+                # further completion would consume their results.
+                if not tool_calls or turn_index + 1 >= max_turns:
+                    break
+
+                for tool_call in tool_calls:
+                    function = tool_call.get("function") or {}
+                    tool_call_id = tool_call.get("id") or ""
+                    tool_name = function.get("name")
+                    # Only one tool is offered; do not run it on behalf of a
+                    # call that explicitly names a different tool.
+                    if tool_name is not None and tool_name != IMAGE_ZOOM_IN_TOOL_NAME:
+                        messages.append(_unknown_tool_message(tool_call_id=tool_call_id, tool_name=tool_name))
+                        continue
+                    parameters = _parse_tool_arguments(function.get("arguments"))
+                    tool_response, _, _ = await image_tool.execute(
+                        tool_instance_id,
+                        parameters=parameters,
+                        **execute_kwargs,
+                    )
+                    messages.append(
+                        _tool_response_to_openai_tool_message(
+                            tool_call_id=tool_call_id,
+                            tool_response=tool_response,
+                        )
+                    )
+    finally:
+        # Release the tool instance even when a request or tool call fails.
+        if tool_instance_id is not None:
+            await image_tool.release(tool_instance_id, **release_kwargs)
diff --git
a/examples/agent_train/deepeyes_gateway/configs/deepeyes_gateway_grpo.yaml b/examples/agent_train/deepeyes_gateway/configs/deepeyes_gateway_grpo.yaml new file mode 100644 index 0000000..3d1a0c7 --- /dev/null +++ b/examples/agent_train/deepeyes_gateway/configs/deepeyes_gateway_grpo.yaml @@ -0,0 +1,42 @@ +hydra: + searchpath: + - pkg://verl.trainer.config + +defaults: + - ppo_trainer + - _self_ + +data: + max_prompt_length: 2048 + max_response_length: 2048 + return_raw_chat: True + return_multi_modal_inputs: False + custom_cls: + path: pkg://examples.agent_train.deepeyes_gateway.dataset + name: DeepEyesGatewayDataset + +algorithm: + adv_estimator: grpo + +actor_rollout_ref: + hybrid_engine: True + model: + custom_chat_template: "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- if tools %}{{- '<|im_start|>system\\n' }}{%- if messages[0]['role'] == 'system' %}{%- if messages[0]['content'] is string %}{{- messages[0]['content'] }}{%- else %}{{- messages[0]['content'][0]['text'] }}{%- endif %}{%- else %}{{- 'You are a helpful assistant.' 
}}{%- endif %}{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}{%- for tool in tools %}{{- \"\\n\" }}{{- tool | tojson }}{%- endfor %}{{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}{% for message in messages %}{% if message['role'] != 'system' or loop.first == false %}{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{%- elif message.role == \"assistant\" %}{{- '<|im_start|>' + message.role }}{%- if message.content %}{{- '\\n' + message.content }}{%- endif %}{%- for tool_call in message.tool_calls %}{%- if tool_call.function is defined %}{%- set tool_call = tool_call.function %}{%- endif %}{{- '\\n<tool_call>\\n{\"name\": \"' }}{{- tool_call.name }}{{- '\", \"arguments\": ' }}{{- tool_call.arguments | tojson }}{{- '}\\n</tool_call>' }}{%- endfor %}{{- '<|im_end|>\\n' }}{%- elif message.role == \"tool\" %}{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}{{- '<|im_start|>user' }}{%- endif %}{{- '\\n<tool_response>\\n' }}{% if message['content'] is 
string %}{{ message.content }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'text' or 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{- '\\n</tool_response>' }}{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}{{- '<|im_end|>\\n' }}{%- endif %}{%- endif %}{% endif %}{% endfor %}{%- else %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{%- elif message.role == \"assistant\" %}{{- '<|im_start|>' + message.role }}{%- if message.content %}{{- '\\n' + message.content }}{%- endif %}{%- for tool_call in 
message.tool_calls %}{%- if tool_call.function is defined %}{%- set tool_call = tool_call.function %}{%- endif %}{{- '\\n<tool_call>\\n{\"name\": \"' }}{{- tool_call.name }}{{- '\", \"arguments\": ' }}{{- tool_call.arguments | tojson }}{{- '}\\n</tool_call>' }}{%- endfor %}{{- '<|im_end|>\\n' }}{%- elif message.role == \"tool\" %}{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}{{- '<|im_start|>user' }}{%- endif %}{{- '\\n<tool_response>\\n' }}{% if message['content'] is string %}{{ message.content }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'text' or 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{- '\\n</tool_response>' }}{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}{{- '<|im_end|>\\n' }}{%- endif %}{%- endif %}{% endfor %}{%- endif %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" + rollout: + name: sglang + multi_turn: + format: hermes + agent: + agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter + custom: + agent_framework: + agent_runner_fqn: examples.agent_train.deepeyes_gateway.agent_runner.deepeyes_agent_runner + gateway_count: 8 + agent_runner_kwargs: + max_turns: 5 + tool_config_path: examples/agent_train/deepeyes_gateway/configs/image_zoom_in_tool_config.yaml + +reward: + custom_reward_function: + path: pkg://examples.agent_train.deepeyes_gateway.reward + name: compute_score diff --git a/examples/agent_train/deepeyes_gateway/configs/image_zoom_in_tool_config.yaml 
b/examples/agent_train/deepeyes_gateway/configs/image_zoom_in_tool_config.yaml new file mode 100644 index 0000000..b048c17 --- /dev/null +++ b/examples/agent_train/deepeyes_gateway/configs/image_zoom_in_tool_config.yaml @@ -0,0 +1,26 @@ +tools: + - class_name: "verl.tools.image_zoom_in_tool.ImageZoomInTool" + config: + num_workers: 256 + rate_limit: 256 + timeout: 60 + type: native + tool_schema: + type: "function" + function: + name: "image_zoom_in_tool" + description: "Zoom in on a specific region of an image by cropping it based on a bounding box (bbox) and an optional object label." + parameters: + type: "object" + properties: + bbox_2d: + type: "array" + items: + type: "number" + minItems: 4 + maxItems: 4 + description: "The bounding box of the region to zoom in, as [x1, y1, x2, y2], where (x1, y1) is the top-left corner and (x2, y2) is the bottom-right corner." + label: + type: "string" + description: "The name or label of the object in the specified bounding box (optional)." + required: ["bbox_2d"] diff --git a/examples/agent_train/deepeyes_gateway/dataset.py b/examples/agent_train/deepeyes_gateway/dataset.py new file mode 100644 index 0000000..f33e13e --- /dev/null +++ b/examples/agent_train/deepeyes_gateway/dataset.py @@ -0,0 +1,132 @@ +"""Minimal dataset for the DeepEyes gateway recipe. + +Produces ``raw_prompt`` and reward-related fields only. +It does not perform tokenization or vision processing. 
+""" + +from __future__ import annotations + +import copy +import io +import logging +import re + +import torch +from PIL import Image + +from verl.utils.dataset.rl_dataset import RLHFDataset + +logger = logging.getLogger(__name__) + + +class DeepEyesGatewayDataset(RLHFDataset): + """Thin dataset that leaves prompt encoding and vision extraction to the gateway.""" + + def _build_messages(self, example: dict, key: str) -> tuple[list[dict], object | None]: + messages = copy.deepcopy(example[key]) + images = example.get(self.image_key, None) or [] + videos = example.get(self.video_key, None) or [] + first_image = None + image_offset = 0 + video_offset = 0 + + for message in messages: + content = message.get("content") + if isinstance(content, list): + normalized = [] + for part in content: + normalized_part = _normalize_content_part(part) + if ( + first_image is None + and isinstance(normalized_part, dict) + and normalized_part.get("type") in {"image", "image_url"} + ): + first_image = _decode_image_payload(normalized_part.get("image", normalized_part)) + normalized_part = dict(normalized_part) + normalized_part["image"] = first_image + normalized.append(normalized_part) + message["content"] = normalized + continue + if not isinstance(content, str) or ("" not in content and "