# Public names re-exported on demand from the ``infra`` subpackage.
_INFRA_NAMES = frozenset(
    {
        "RolloutController",
        "StalenessManager",
        "TrainController",
        "WorkflowExecutor",
        "current_platform",
        "workflow_context",
    }
)

# Public names re-exported on demand from the ``trainer`` subpackage.
_TRAINER_NAMES = frozenset(
    {
        "DPOTrainer",
        "PPOTrainer",
        "RWTrainer",
        "SFTTrainer",
    }
)

# Lazy attribute name -> submodule (relative to this package) that defines it.
# A single table keeps the lookup logic in ``__getattr__`` in one place.
_LAZY_SOURCES = {
    **dict.fromkeys(_INFRA_NAMES, "infra"),
    **dict.fromkeys(_TRAINER_NAMES, "trainer"),
}


def __getattr__(name: str):
    """Resolve a lazily exported attribute on first access (PEP 562).

    The defining submodule is imported only when one of its exported names
    is requested, so a plain ``import`` of this package does not import
    ``infra`` or ``trainer``. The resolved object is cached in the module
    globals, so this hook runs at most once per name.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The object called ``name`` from the submodule that defines it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    submodule = _LAZY_SOURCES.get(name)
    if submodule is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported here to keep the module top level free of extra imports.
    import importlib

    value = getattr(importlib.import_module(f".{submodule}", __name__), name)
    globals()[name] = value
    return value


def __dir__():
    """List module attributes, including lazy exports not yet resolved."""
    return sorted({*globals(), *_LAZY_SOURCES})
It is intentionally light at import time so that +adding a verb in a follow-up PR does not pull torch / ray / megatron / +sglang / vllm into the parser-construction path. + +The full per-verb design surface is tracked in the upstream design +discussion issue. +""" diff --git a/areal/experimental/cli/commands/__init__.py b/areal/experimental/cli/commands/__init__.py new file mode 100644 index 000000000..988131360 --- /dev/null +++ b/areal/experimental/cli/commands/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: Apache-2.0 diff --git a/areal/experimental/cli/commands/agent/__init__.py b/areal/experimental/cli/commands/agent/__init__.py new file mode 100644 index 000000000..6b8392615 --- /dev/null +++ b/areal/experimental/cli/commands/agent/__init__.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""``areal agent`` — agent service operator console (scaffold). + +Drives an agent service (gateway + router + N (worker, data-proxy) pairs) +for session-centric operator and debugging work. No verbs are implemented +in this scaffold release; this module only reserves the ``areal agent`` +command name and tells the user what is coming. + +The agent CLI is session-centric (not model-centric like ``areal inf``). +Sessions can negotiate an RL session key with a configured inference +service when they start, enabling online RL trajectory tracking. +""" + +from __future__ import annotations + +import argparse + +_DESCRIPTION = """\ +Operate an agent service: gateway + router + (worker, data-proxy) pairs. +Session-centric: the primary unit of interaction is an agent session, +not a model. + +NO VERBS IMPLEMENTED YET. This namespace currently only reserves the +`areal agent ...` command surface. 
def add_parser(subparsers: argparse._SubParsersAction) -> None:
    """Attach the ``agent`` namespace to the given subparser group.

    The namespace parser shows the module-level ``_DESCRIPTION`` verbatim
    (raw formatter, no re-wrapping) and dispatches to ``_handle`` through
    its ``func`` default.
    """
    agent_parser = subparsers.add_parser(
        "agent",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=_DESCRIPTION,
        help="Operate an agent service (scaffold — no verbs yet).",
    )
    agent_parser.set_defaults(func=_handle)


def _handle(_: argparse.Namespace) -> int:
    """Scaffold handler: ignore the parsed arguments and return exit code 0."""
    exit_code = 0
    return exit_code
def add_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``inf`` namespace on the given subparser group.

    Help output uses the module-level ``_DESCRIPTION`` without re-wrapping,
    and ``_handle`` is installed as the namespace's ``func`` default.
    """
    parser_options = {
        "help": "Operate an inference service (scaffold — no verbs yet).",
        "description": _DESCRIPTION,
        "formatter_class": argparse.RawDescriptionHelpFormatter,
    }
    inf_parser = subparsers.add_parser("inf", **parser_options)
    inf_parser.set_defaults(func=_handle)


def _handle(_: argparse.Namespace) -> int:
    """Scaffold handler: the parsed arguments are unused; always returns 0."""
    success = 0
    return success
def add_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``train`` namespace on the given subparser group.

    The namespace's help text is the module-level ``_DESCRIPTION`` rendered
    as-is, and its ``func`` default is ``_handle``.
    """
    train_parser = subparsers.add_parser(
        "train",
        description=_DESCRIPTION,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        help="Submit and observe training jobs (scaffold — no verbs yet).",
    )
    # ``func`` is the attribute the top-level entry point dispatches on.
    train_parser.set_defaults(func=_handle)


def _handle(_: argparse.Namespace) -> int:
    """Scaffold handler: does nothing with the arguments and returns 0."""
    status = 0
    return status
def add_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the hyphenated ``weight-update`` namespace.

    The parser renders the module-level ``_DESCRIPTION`` unchanged and
    routes invocations to ``_handle`` via the ``func`` default.
    """
    namespace_name = "weight-update"
    weight_update_parser = subparsers.add_parser(
        namespace_name,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        help="Diagnose weight-sync state (scaffold — no verbs yet).",
        description=_DESCRIPTION,
    )
    weight_update_parser.set_defaults(func=_handle)


def _handle(_: argparse.Namespace) -> int:
    """Scaffold handler: ignores its argument and returns exit code 0."""
    result = 0
    return result
+""" + +from __future__ import annotations + +import argparse +import sys + +from areal.experimental.cli.commands import agent as cmd_agent +from areal.experimental.cli.commands import inf as cmd_inf +from areal.experimental.cli.commands import train as cmd_train +from areal.experimental.cli.commands import weight_update as cmd_weight_update +from areal.version import __version__ + +_DESCRIPTION = """\ +AReaL operator CLI for the v2 microservice architecture. + +Each namespace drives one of the v2 services. Verbs land incrementally; +no verbs are implemented in this scaffold release — each namespace's +--help describes its planned surface and points at the design discussion. + +Namespaces: + inf Operate an inference service (gateway + router + models) + agent Operate an agent service (gateway + router + sessions) + train Submit and observe training jobs + weight-update Diagnose weight-sync state between train and inference + +Run `areal --help` for what's planned (and what's available +today). State files for each namespace live under ~/.areal//. 
def build_parser() -> argparse.ArgumentParser:
    """Build the ``areal`` argument parser with every namespace registered.

    The root parser carries a ``--version`` flag and a required
    ``NAMESPACE`` subcommand; each namespace module contributes its own
    subparser through ``add_parser``.
    """
    root = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=_DESCRIPTION,
        prog="areal",
    )
    root.add_argument("--version", action="version", version=f"areal {__version__}")
    namespaces = root.add_subparsers(
        metavar="NAMESPACE",
        required=True,
        dest="namespace",
    )
    # Registration order is preserved: inf, agent, train, weight-update.
    for command in (cmd_inf, cmd_agent, cmd_train, cmd_weight_update):
        command.add_parser(namespaces)
    return root


def cli(argv: list[str] | None = None) -> int:
    """Parse *argv* and dispatch to the selected namespace handler.

    Args:
        argv: Argument list without the program name; ``sys.argv[1:]`` is
            used when this is ``None``.

    Returns:
        The handler's result when it is an ``int``, ``0`` for any other
        handler result, or ``2`` (after printing help) when the parsed
        arguments carry no ``func``.
    """
    parser = build_parser()
    raw_args = sys.argv[1:] if argv is None else argv
    parsed = parser.parse_args(raw_args)
    handler = getattr(parsed, "func", None)
    if handler is None:
        parser.print_help()
        return 2
    outcome = handler(parsed)
    if isinstance(outcome, int):
        return int(outcome)
    return 0


if __name__ == "__main__":
    sys.exit(cli())
import json
import os
from pathlib import Path
from typing import Any


def areal_home() -> Path:
    """Return the root state directory (``$AREAL_HOME`` or ``~/.areal``).

    An unset or empty ``$AREAL_HOME`` falls back to ``~/.areal``. The
    directory is created if it doesn't exist. Sub-CLIs derive their own
    subdirectories from here; they should never bypass this function and
    hardcode ``~/.areal`` themselves.
    """
    env = os.environ.get("AREAL_HOME")
    root = Path(env).expanduser() if env else Path.home() / ".areal"
    root.mkdir(parents=True, exist_ok=True)
    return root


def namespace_dir(namespace: str) -> Path:
    """Return ``<areal_home>/<namespace>/`` (created on demand).

    ``namespace`` is the on-disk directory name and follows the CLI
    verb prefix (``inf``, ``agent``, ``train``, ``weight-update``). The
    hyphenated form is used on disk to match the CLI surface exactly.
    """
    d = areal_home() / namespace
    d.mkdir(parents=True, exist_ok=True)
    return d


def pid_alive(pid: int) -> bool:
    """Cheap liveness probe: does *pid* still exist on this host?

    Uses ``kill(pid, 0)``, which delivers no signal but raises
    ``ProcessLookupError`` if the process is gone. Returns ``False`` for
    ``pid <= 0`` and for values too large to be a PID. Note: a live PID
    does not mean the service is healthy — pair this with an HTTP probe
    when callers need real health status.
    """
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but is owned by another user.
        return True
    except OverflowError:
        # Value does not fit the platform's PID type (e.g. a corrupted
        # state file), so no such process can exist.
        return False
    return True


def atomic_write_text(path: Path, content: str) -> None:
    """Write *content* to *path* atomically (write-temp + rename).

    Prevents readers from observing a partially written file when two CLI
    invocations race on the same state file. Each call writes to its own
    uniquely named temporary file next to *path*, so concurrent writers
    never share (and truncate) one another's temp file; the last rename
    wins. The temp file is removed if the write or rename fails.

    Args:
        path: Destination file; parent directories are created as needed.
        content: Text to write, encoded as UTF-8.

    Raises:
        OSError: If the temp file cannot be written or renamed into place.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so os.replace stays a same-filesystem
    # rename; pid + random suffix makes the name unique per writer.
    tmp = path.with_name(f".{path.name}.{os.getpid()}.{os.urandom(4).hex()}.tmp")
    try:
        # "x" refuses to reuse an existing file instead of truncating it.
        with open(tmp, "x", encoding="utf-8") as f:
            f.write(content)
            f.flush()
            # Push the data to disk before the rename publishes the file.
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        try:
            os.unlink(tmp)
        except OSError:
            pass  # Nothing to clean up (never created or already renamed).
        raise


def atomic_write_json(path: Path, data: Any, *, indent: int = 2) -> None:
    """Serialize *data* to JSON (with a trailing newline) and write atomically."""
    atomic_write_text(path, json.dumps(data, indent=indent) + "\n")
+cli = [ + "PyYAML", + "colorlog", +] + +[project.scripts] +areal = "areal.experimental.cli.main:cli" [project.urls] "Homepage" = "https://github.com/areal-project/AReaL" diff --git a/uv.lock b/uv.lock index e468608cc..db1652123 100644 --- a/uv.lock +++ b/uv.lock @@ -400,6 +400,10 @@ dependencies = [ ] [package.optional-dependencies] +cli = [ + { name = "colorlog", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pyyaml", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] cuda = [ { name = "flash-linear-attention", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "kernels", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -482,6 +486,7 @@ requires-dist = [ { name = "claude-agent-sdk" }, { name = "colorama" }, { name = "colorlog" }, + { name = "colorlog", marker = "extra == 'cli'" }, { name = "cookiecutter", specifier = ">2.1.1" }, { name = "datasets", specifier = ">=3.0.0" }, { name = "daytona", marker = "extra == 'sandbox'", specifier = ">=0.167.0" }, @@ -534,6 +539,7 @@ requires-dist = [ { name = "python-debian", specifier = ">=0.1.49" }, { name = "python-dotenv" }, { name = "pyyaml" }, + { name = "pyyaml", marker = "extra == 'cli'" }, { name = "pyzmq" }, { name = "qwen-agent" }, { name = "ray", extras = ["default"] }, @@ -573,7 +579,7 @@ requires-dist = [ { name = "word2number" }, { name = 
"zstandard" }, ] -provides-extras = ["sglang", "tms", "kernels", "megatron", "cuda-train", "cuda", "sandbox"] +provides-extras = ["sglang", "tms", "kernels", "megatron", "cuda-train", "cuda", "sandbox", "cli"] [package.metadata.requires-dev] dev = [ diff --git a/uv.vllm.lock b/uv.vllm.lock index 9e9b587e6..dab9badc0 100644 --- a/uv.vllm.lock +++ b/uv.vllm.lock @@ -448,6 +448,10 @@ dependencies = [ ] [package.optional-dependencies] +cli = [ + { name = "colorlog", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pyyaml", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] cuda = [ { name = "flash-linear-attention", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "kernels", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -528,6 +532,7 @@ requires-dist = [ { name = "claude-agent-sdk" }, { name = "colorama" }, { name = "colorlog" }, + { name = "colorlog", marker = "extra == 'cli'" }, { name = "cookiecutter", specifier = ">2.1.1" }, { name = "datasets", specifier = ">=3.0.0" }, { name = "daytona", marker = "extra == 'sandbox'", specifier = ">=0.167.0" }, @@ -579,6 +584,7 @@ requires-dist = [ { name = "python-debian", specifier = ">=0.1.49" }, { name = "python-dotenv" }, { name = "pyyaml" }, + { name = "pyyaml", marker = "extra == 'cli'" }, { name = "pyzmq" }, { name = "qwen-agent" }, { name = "ray", 
extras = ["default"] }, @@ -619,7 +625,7 @@ requires-dist = [ { name = "word2number" }, { name = "zstandard" }, ] -provides-extras = ["vllm", "tms", "kernels", "megatron", "cuda-train", "cuda", "sandbox"] +provides-extras = ["vllm", "tms", "kernels", "megatron", "cuda-train", "cuda", "sandbox", "cli"] [package.metadata.requires-dev] dev = [