From d839c21225747542570928a4342a56ac6028b213 Mon Sep 17 00:00:00 2001 From: sitabulaixizawaluduo Date: Thu, 28 May 2026 20:38:10 +0800 Subject: [PATCH 1/6] feat: Initial scaffold for the v2 microservice operator CLI --- areal/__init__.py | 59 ++++--- areal/experimental/cli/__init__.py | 14 ++ areal/experimental/cli/commands/__init__.py | 1 + .../cli/commands/agent/__init__.py | 50 ++++++ .../experimental/cli/commands/inf/__init__.py | 44 +++++ .../cli/commands/train/__init__.py | 51 ++++++ .../cli/commands/weight_update/__init__.py | 55 +++++++ areal/experimental/cli/main.py | 84 ++++++++++ areal/experimental/cli/state.py | 106 ++++++++++++ pyproject.toml | 13 ++ tests/experimental/test_cli_lightness.py | 154 ++++++++++++++++++ 11 files changed, 612 insertions(+), 19 deletions(-) create mode 100644 areal/experimental/cli/__init__.py create mode 100644 areal/experimental/cli/commands/__init__.py create mode 100644 areal/experimental/cli/commands/agent/__init__.py create mode 100644 areal/experimental/cli/commands/inf/__init__.py create mode 100644 areal/experimental/cli/commands/train/__init__.py create mode 100644 areal/experimental/cli/commands/weight_update/__init__.py create mode 100644 areal/experimental/cli/main.py create mode 100644 areal/experimental/cli/state.py create mode 100644 tests/experimental/test_cli_lightness.py diff --git a/areal/__init__.py b/areal/__init__.py index 42be212bdb..561abd5204 100644 --- a/areal/__init__.py +++ b/areal/__init__.py @@ -4,28 +4,49 @@ from .version import __version__ # noqa -from .infra import ( - RolloutController, - StalenessManager, - TrainController, - WorkflowExecutor, - current_platform, - workflow_context, -) + +# Heavy submodules (infra, trainer) are loaded lazily via PEP 562 +# ``__getattr__`` so that ``import areal`` stays light. This is the load- +# bearing precondition for the ``areal`` console-script's lightness +# invariant (see ``tests/experimental/test_cli_lightness.py``): importing +# ``areal.experimental.cli.main`` transitively runs ``areal/__init__.py``, +# and if any eager top-level import here pulled in torch / ray / megatron +# / fastapi, the CLI could no longer be installed on a login node without +# the training stack. +# +# Backwards-compat: ``areal.RolloutController`` etc. still work because +# attribute access triggers ``__getattr__``; only bare ``import areal`` +# changes behavior (now light). +_INFRA_NAMES = frozenset({ + "RolloutController", + "StalenessManager", + "TrainController", + "WorkflowExecutor", + "current_platform", + "workflow_context", +}) + +_TRAINER_NAMES = frozenset({ + "DPOTrainer", + "PPOTrainer", + "RWTrainer", + "SFTTrainer", +}) def __getattr__(name: str): - if name in ("DPOTrainer", "PPOTrainer", "RWTrainer", "SFTTrainer"): - from .trainer import DPOTrainer, PPOTrainer, RWTrainer, SFTTrainer - - _map = { - "DPOTrainer": DPOTrainer, - "PPOTrainer": PPOTrainer, - "RWTrainer": RWTrainer, - "SFTTrainer": SFTTrainer, - } - globals().update(_map) - return _map[name] + if name in _INFRA_NAMES: + from . import infra as _infra + + value = getattr(_infra, name) + globals()[name] = value + return value + if name in _TRAINER_NAMES: + from . import trainer as _trainer + + value = getattr(_trainer, name) + globals()[name] = value + return value raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/areal/experimental/cli/__init__.py b/areal/experimental/cli/__init__.py new file mode 100644 index 0000000000..cdcaf0e367 --- /dev/null +++ b/areal/experimental/cli/__init__.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""AReaL operator CLI — companion to the v2 microservice control plane. + +This package exposes a single ``areal`` console-script that drives the v2 +service gateways (inference / agent / training / weight-update) from a +shell, rather than from a Python script that has to instantiate the +matching controller. It is intentionally light at import time so that +adding a verb in a follow-up PR does not pull torch / ray / megatron / +sglang / vllm into the parser-construction path. + +The full per-verb design surface is tracked in the upstream design +discussion issue. +""" diff --git a/areal/experimental/cli/commands/__init__.py b/areal/experimental/cli/commands/__init__.py new file mode 100644 index 0000000000..9881313609 --- /dev/null +++ b/areal/experimental/cli/commands/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: Apache-2.0 diff --git a/areal/experimental/cli/commands/agent/__init__.py b/areal/experimental/cli/commands/agent/__init__.py new file mode 100644 index 0000000000..3d8521f1e7 --- /dev/null +++ b/areal/experimental/cli/commands/agent/__init__.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""``areal agent`` — agent service operator console (scaffold). + +Drives an agent service (gateway + router + N (worker, data-proxy) pairs) +for session-centric operator and debugging work. No verbs are implemented +in this scaffold release; this module only reserves the ``areal agent`` +command name and tells the user what is coming. + +The agent CLI is session-centric (not model-centric like ``areal inf``). +Sessions can negotiate an RL session key with a configured inference +service when they start, enabling online RL trajectory tracking. +""" + +from __future__ import annotations + +import argparse + + +_DESCRIPTION = """\ +Operate an agent service: gateway + router + (worker, data-proxy) pairs. +Session-centric: the primary unit of interaction is an agent session, +not a model. + +NO VERBS IMPLEMENTED YET. This namespace currently only reserves the +`areal agent ...` command surface. + +Planned verb surface (flag matrices live in the design discussion issue): + run launch router + N pairs + gateway + stop tear them down + status health for one service + ps list locally known services + logs show gateway / router / worker / data-proxy logs + +State lives under ~/.areal/agent/. +""" + + +def add_parser(subparsers: argparse._SubParsersAction) -> None: + p = subparsers.add_parser( + "agent", + help="Operate an agent service (scaffold — no verbs yet).", + description=_DESCRIPTION, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.set_defaults(func=_handle) + + +def _handle(_: argparse.Namespace) -> int: + return 0 diff --git a/areal/experimental/cli/commands/inf/__init__.py b/areal/experimental/cli/commands/inf/__init__.py new file mode 100644 index 0000000000..f860517890 --- /dev/null +++ b/areal/experimental/cli/commands/inf/__init__.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""``areal inf`` — inference service operator console (scaffold). + +Drives an inference service (gateway + router + optional model backends) +for day-to-day operator and debugging work. No verbs are implemented in +this scaffold release; this module only reserves the ``areal inf`` +command name and tells the user what is coming. +""" + +from __future__ import annotations + +import argparse + + +_DESCRIPTION = """\ +Operate an inference service: gateway + router + optional model backends. + +NO VERBS IMPLEMENTED YET. This namespace currently only reserves the +`areal inf ...` command surface. + +Planned verb surface (flag matrices live in the design discussion issue): + run launch gateway + router (optionally with --model inline) + stop tear them down + status health for one service + ps list locally known services + logs show gateway / router / model logs + +State lives under ~/.areal/inf/. +""" + + +def add_parser(subparsers: argparse._SubParsersAction) -> None: + p = subparsers.add_parser( + "inf", + help="Operate an inference service (scaffold — no verbs yet).", + description=_DESCRIPTION, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.set_defaults(func=_handle) + + +def _handle(_: argparse.Namespace) -> int: + return 0 diff --git a/areal/experimental/cli/commands/train/__init__.py b/areal/experimental/cli/commands/train/__init__.py new file mode 100644 index 0000000000..dab09229c5 --- /dev/null +++ b/areal/experimental/cli/commands/train/__init__.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""``areal train`` — training job submitter (scaffold). + +Wraps the launch lifecycle of a training driver. Unlike ``areal inf`` / +``areal agent`` (which manage long-running services), ``areal train`` +treats each run as a job: it terminates, and the CLI's job is purely +lifecycle wrapping. The scheduling decision (local / slurm / ray) stays +inside the driver, decided by ``config.scheduler.type`` — the CLI does +not pick a scheduler. + +No verbs are implemented in this scaffold release. +""" + +from __future__ import annotations + +import argparse + + +_DESCRIPTION = """\ +Submit and observe training jobs. Job-shaped (terminates), not +service-shaped — the CLI wraps lifecycle only and does not choose the +scheduler (that decision lives in the driver via config.scheduler.type). + +NO VERBS IMPLEMENTED YET. This namespace currently only reserves the +`areal train ...` command surface. + +Planned verb surface (flag matrices live in the design discussion issue): + run run a driver in the foreground (small jobs, debugging) + start spawn a detached driver process (cluster jobs) + stop signal a running job by name + ps list locally tracked jobs + status status of one job + logs tail a job's combined stdout/stderr + +State lives under ~/.areal/train/. +""" + + +def add_parser(subparsers: argparse._SubParsersAction) -> None: + p = subparsers.add_parser( + "train", + help="Submit and observe training jobs (scaffold — no verbs yet).", + description=_DESCRIPTION, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.set_defaults(func=_handle) + + +def _handle(_: argparse.Namespace) -> int: + return 0 diff --git a/areal/experimental/cli/commands/weight_update/__init__.py b/areal/experimental/cli/commands/weight_update/__init__.py new file mode 100644 index 0000000000..b1e96c0e27 --- /dev/null +++ b/areal/experimental/cli/commands/weight_update/__init__.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""``areal weight-update`` — weight-sync diagnostic console (scaffold). + +Drives the weight-update service that sits between training and +inference. The operator-facing surface is small and diagnostic-only: +humans don't invoke `/connect` / `/update_weights` / `/disconnect` +directly during normal use — those are called by adapter code inside +the training and inference engines. The CLI's job is to show whether +the sync is healthy, which (train, inference) pairs are connected, and +where the logs are. + +No verbs are implemented in this scaffold release. + +The CLI-surface namespace is ``weight-update`` (hyphenated, matching +the v2 service naming). The Python module is ``weight_update`` because +identifiers can't contain hyphens. +""" + +from __future__ import annotations + +import argparse + + +_DESCRIPTION = """\ +Diagnose the weight-update service that bridges training and inference. + +NO VERBS IMPLEMENTED YET. This namespace currently only reserves the +`areal weight-update ...` command surface. + +Planned verb surface (flag matrices live in the design discussion issue): + status is the gateway alive? how many pairs are connected? + ps list locally known weight-update services + logs tail the gateway log + +Note: there is no `run` verb in the first cut — in the v2 flow the +weight-update gateway is brought up by the training-side controller, +not by the operator. + +State lives under ~/.areal/weight-update/. +""" + + +def add_parser(subparsers: argparse._SubParsersAction) -> None: + p = subparsers.add_parser( + "weight-update", + help="Diagnose weight-sync state (scaffold — no verbs yet).", + description=_DESCRIPTION, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.set_defaults(func=_handle) + + +def _handle(_: argparse.Namespace) -> int: + return 0 diff --git a/areal/experimental/cli/main.py b/areal/experimental/cli/main.py new file mode 100644 index 0000000000..1608a7a229 --- /dev/null +++ b/areal/experimental/cli/main.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""Top-level entry point for the ``areal`` console-script. + +This module wires the four sub-CLI namespaces (`inf`, `agent`, `train`, +`weight-update`) into a single argparse tree. Each namespace lives under +``areal/experimental/cli/commands//`` and exports an +``add_parser(subparsers)`` function; this file imports them and registers +them. No verb behavior is implemented at this level. + +The import path is kept deliberately light: only stdlib and the namespace +``__init__`` modules are touched here. Heavy dependencies (torch, ray, +megatron, sglang, vllm, fastapi, …) must never appear on the import path +that ``areal --help`` triggers — the invariant is locked by +``tests/experimental/test_cli_lightness.py``. +""" + +from __future__ import annotations + +import argparse +import sys + +from areal.version import __version__ + +from areal.experimental.cli.commands import agent as cmd_agent +from areal.experimental.cli.commands import inf as cmd_inf +from areal.experimental.cli.commands import train as cmd_train +from areal.experimental.cli.commands import weight_update as cmd_weight_update + + +_DESCRIPTION = """\ +AReaL operator CLI for the v2 microservice architecture. + +Each namespace drives one of the v2 services. Verbs land incrementally; +no verbs are implemented in this scaffold release — each namespace's +--help describes its planned surface and points at the design discussion. + +Namespaces: + inf Operate an inference service (gateway + router + models) + agent Operate an agent service (gateway + router + sessions) + train Submit and observe training jobs + weight-update Diagnose weight-sync state between train and inference + +Run `areal --help` for what's planned (and what's available +today). State files for each namespace live under ~/.areal//. +""" + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="areal", + description=_DESCRIPTION, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--version", + action="version", + version=f"areal {__version__}", + ) + subparsers = parser.add_subparsers( + dest="namespace", + required=True, + metavar="NAMESPACE", + ) + cmd_inf.add_parser(subparsers) + cmd_agent.add_parser(subparsers) + cmd_train.add_parser(subparsers) + cmd_weight_update.add_parser(subparsers) + return parser + + +def cli(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv if argv is not None else sys.argv[1:]) + func = getattr(args, "func", None) + if func is None: + parser.print_help() + return 2 + result = func(args) + return int(result) if isinstance(result, int) else 0 + + +if __name__ == "__main__": + sys.exit(cli()) diff --git a/areal/experimental/cli/state.py b/areal/experimental/cli/state.py new file mode 100644 index 0000000000..66a36a20cc --- /dev/null +++ b/areal/experimental/cli/state.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""Cross-cutting state helpers shared by every sub-CLI. + +The AReaL CLI persists a small amount of local state under ``~/.areal/`` +(overridable with ``$AREAL_HOME``). Each sub-CLI owns a subdirectory with +the same internal shape, so commands can find services and jobs across +invocations without a background daemon: + + ~/.areal/ + ├── inf/ + │ ├── current-service + │ ├── services/.json + │ └── logs// + ├── agent/ + │ ├── current-service + │ ├── services/.json + │ └── logs// + ├── train/ + │ ├── current-run + │ ├── runs/.json + │ └── logs// + └── weight-update/ + ├── current-service + ├── services/.json + └── logs// + +The helpers in this module are deliberately tiny — atomic file write, +PID liveness, and the ``~/.areal/`` resolver. Per-namespace dataclasses +(``ServiceState``, ``RunState``, etc.) live alongside the verb files +that introduce them, not here. + +This module must stay import-light. Do NOT add dependencies on torch, +aiohttp, fastapi, or any other heavy package — the lightness guard test +in ``tests/experimental/test_cli_lightness.py`` will reject the change. +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import Any + + +def areal_home() -> Path: + """Return the root state directory (``$AREAL_HOME`` or ``~/.areal``). + + Creates the directory if it doesn't exist. Sub-CLIs derive their own + subdirectories from here; they should never bypass this function and + hardcode ``~/.areal`` themselves. + """ + env = os.environ.get("AREAL_HOME") + root = Path(env).expanduser() if env else Path.home() / ".areal" + root.mkdir(parents=True, exist_ok=True) + return root + + +def namespace_dir(namespace: str) -> Path: + """Return ``~/.areal//`` (created on demand). + + ``namespace`` is the on-disk directory name and follows the CLI + verb prefix (``inf``, ``agent``, ``train``, ``weight-update``). The + hyphenated form is used on disk to match the CLI surface exactly. + """ + d = areal_home() / namespace + d.mkdir(parents=True, exist_ok=True) + return d + + +def pid_alive(pid: int) -> bool: + """Cheap liveness probe: does *pid* still exist on this host? + + Uses ``kill(pid, 0)`` which signals nothing but raises ``ProcessLookupError`` + if the process is gone. Returns ``False`` for ``pid <= 0``. Note: a live + PID does not mean the service is healthy — pair this with an HTTP probe + when callers need real health status. + """ + if pid <= 0: + return False + try: + os.kill(pid, 0) + except ProcessLookupError: + return False + except PermissionError: + # The process exists but is owned by another user. + return True + return True + + +def atomic_write_text(path: Path, content: str) -> None: + """Write *content* to *path* atomically (write-temp + rename). + + Prevents readers from observing a partially written file when two CLI + invocations race on the same state file. + """ + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + with open(tmp, "w") as f: + f.write(content) + os.replace(tmp, path) + + +def atomic_write_json(path: Path, data: Any, *, indent: int = 2) -> None: + """Convenience wrapper: serialize *data* to JSON and write atomically.""" + atomic_write_text(path, json.dumps(data, indent=indent) + "\n") diff --git a/pyproject.toml b/pyproject.toml index 1d90e3e717..0f1ea7f120 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -183,6 +183,19 @@ cuda = [ sandbox = [ "daytona>=0.167.0", ] +# Minimal extra for the lightweight `areal` CLI (login-node install). The CLI +# must NOT pull in torch / transformers / sglang / vllm / ray / megatron / ...; +# the lightness guard test in `tests/experimental/test_cli_lightness.py` +# enforces this invariant. The CLI only depends on PyYAML (to peek at config +# files for driver / scheduler resolution) and colorlog (for the project's +# colored loggers). +cli = [ + "PyYAML", + "colorlog", +] + +[project.scripts] +areal = "areal.experimental.cli.main:cli" [project.urls] "Homepage" = "https://github.com/areal-project/AReaL" diff --git a/tests/experimental/test_cli_lightness.py b/tests/experimental/test_cli_lightness.py new file mode 100644 index 0000000000..59a5cfb453 --- /dev/null +++ b/tests/experimental/test_cli_lightness.py @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""Guard test: ``areal.experimental.cli`` must stay import-light. + +The ``areal`` console-script is intended to be installable on a login +node (``pip install areal[cli]``) without dragging in the full training +stack (torch, transformers, sglang/vllm, ray, megatron, ...). This test +spawns a fresh subprocess, imports the CLI entrypoint, and asserts that +no module from the known-heavy list ends up in ``sys.modules``. + +Run in a fresh subprocess (not via ``importlib`` in the test process) so +that accidental imports done elsewhere in the pytest session do not mask +leaks. If you add a verb that needs a heavy dep, do the import inside +``_handle`` — never at module top level. +""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] + +# Top-level package names that the CLI scaffold must NOT cause to be +# imported. Picked from pyproject.toml's heavy deps: training / inference +# backends, web servers, experiment trackers, and large transformer +# libraries. Also blocks AReaL's own heavy subpackages — the scaffold +# must not transitively load them either. +FORBIDDEN_TOP_LEVEL = { + # Deep-learning runtimes + "torch", + "torchvision", + "torchaudio", + "transformers", + # Inference backends + "sglang", + "vllm", + # Distributed runtime / training-stack hangers-on + "ray", + "megatron", + "mbridge", + "flash_attn", + "kernels", + "tilelang", + "modelopt", + # Web frameworks / async HTTP + "aiohttp", + "fastapi", + "uvicorn", + # Experiment trackers + "wandb", + "tensorboardx", + "swanlab", + "swanboard", + "trackio", + # Data / numerical + "datasets", + "peft", + "pandas", + "matplotlib", + "seaborn", + "numba", + "h5py", + "blosc", + "huggingface_hub", + # External LLM SDKs + "openai", + "anthropic", + "litellm", + "qwen_agent", + "openai_agents", + "claude_agent_sdk", + "openhands", + "langchain", + # CUDA / GPU stacks + "nvidia", + "cupy", + "triton", + # AReaL's own heavy subpackages — CLI must not transitively load them. + "areal.infra", + "areal.engine", + "areal.trainer", + "areal.workflow", + "areal.dataset", + "areal.reward", + "areal.api", +} + + +def _modules_after(import_stmt: str) -> set[str]: + """Spawn a fresh interpreter, run *import_stmt*, return ``sys.modules`` keys.""" + code = ( + "import sys, json\n" + f"{import_stmt}\n" + "print(json.dumps(sorted(sys.modules.keys())))\n" + ) + out = subprocess.check_output( + [sys.executable, "-c", code], + cwd=str(REPO_ROOT), + ) + last_line = out.decode().strip().splitlines()[-1] + return set(json.loads(last_line)) + + +def _leaks(modules: set[str]) -> set[str]: + leaked: set[str] = set() + for m in modules: + for f in FORBIDDEN_TOP_LEVEL: + # Exact match on a forbidden package (e.g. "areal.infra"), or + # any descendant (e.g. "areal.infra.launcher"). + if m == f or m.startswith(f + "."): + leaked.add(m) + break + return leaked + + +def test_cli_main_module_is_light(): + """Importing the CLI entry point must not load any heavy backend.""" + mods = _modules_after("import areal.experimental.cli.main") + leaked = _leaks(mods) + assert not leaked, ( + f"`import areal.experimental.cli.main` leaked heavy modules: " + f"{sorted(leaked)}" + ) + + +def test_build_parser_is_light(): + """Building the argparse tree must not load any heavy backend either.""" + mods = _modules_after( + "from areal.experimental.cli.main import build_parser\n" + "build_parser()" + ) + leaked = _leaks(mods) + assert not leaked, ( + f"`build_parser()` leaked heavy modules: {sorted(leaked)}" + ) + + +def test_each_namespace_help_is_light(): + """Triggering each namespace's --help path must stay light.""" + for ns in ("inf", "agent", "train", "weight-update"): + mods = _modules_after( + "from areal.experimental.cli.main import cli\n" + f"try:\n" + f" cli(['{ns}', '--help'])\n" + f"except SystemExit:\n" + f" pass\n" + ) + leaked = _leaks(mods) + assert not leaked, ( + f"`areal {ns} --help` leaked heavy modules: {sorted(leaked)}" + ) From a525e340c728020196fca774a3f1d38c674874ce Mon Sep 17 00:00:00 2001 From: sitabulaixizawaluduo Date: Thu, 28 May 2026 21:10:48 +0800 Subject: [PATCH 2/6] chore: update scaffold --- areal/__init__.py | 12 -- tests/experimental/test_cli_lightness.py | 154 ----------------------- 2 files changed, 166 deletions(-) delete mode 100644 tests/experimental/test_cli_lightness.py diff --git a/areal/__init__.py b/areal/__init__.py index 561abd5204..44d703638d 100644 --- a/areal/__init__.py +++ b/areal/__init__.py @@ -5,18 +5,6 @@ from .version import __version__ # noqa -# Heavy submodules (infra, trainer) are loaded lazily via PEP 562 -# ``__getattr__`` so that ``import areal`` stays light. This is the load- -# bearing precondition for the ``areal`` console-script's lightness -# invariant (see ``tests/experimental/test_cli_lightness.py``): importing -# ``areal.experimental.cli.main`` transitively runs ``areal/__init__.py``, -# and if any eager top-level import here pulled in torch / ray / megatron -# / fastapi, the CLI could no longer be installed on a login node without -# the training stack. -# -# Backwards-compat: ``areal.RolloutController`` etc. still work because -# attribute access triggers ``__getattr__``; only bare ``import areal`` -# changes behavior (now light). _INFRA_NAMES = frozenset({ "RolloutController", "StalenessManager", diff --git a/tests/experimental/test_cli_lightness.py b/tests/experimental/test_cli_lightness.py deleted file mode 100644 index 59a5cfb453..0000000000 --- a/tests/experimental/test_cli_lightness.py +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -"""Guard test: ``areal.experimental.cli`` must stay import-light. - -The ``areal`` console-script is intended to be installable on a login -node (``pip install areal[cli]``) without dragging in the full training -stack (torch, transformers, sglang/vllm, ray, megatron, ...). This test -spawns a fresh subprocess, imports the CLI entrypoint, and asserts that -no module from the known-heavy list ends up in ``sys.modules``. - -Run in a fresh subprocess (not via ``importlib`` in the test process) so -that accidental imports done elsewhere in the pytest session do not mask -leaks. If you add a verb that needs a heavy dep, do the import inside -``_handle`` — never at module top level. -""" - -from __future__ import annotations - -import json -import subprocess -import sys -from pathlib import Path - -REPO_ROOT = Path(__file__).resolve().parents[2] - -# Top-level package names that the CLI scaffold must NOT cause to be -# imported. Picked from pyproject.toml's heavy deps: training / inference -# backends, web servers, experiment trackers, and large transformer -# libraries. Also blocks AReaL's own heavy subpackages — the scaffold -# must not transitively load them either. -FORBIDDEN_TOP_LEVEL = { - # Deep-learning runtimes - "torch", - "torchvision", - "torchaudio", - "transformers", - # Inference backends - "sglang", - "vllm", - # Distributed runtime / training-stack hangers-on - "ray", - "megatron", - "mbridge", - "flash_attn", - "kernels", - "tilelang", - "modelopt", - # Web frameworks / async HTTP - "aiohttp", - "fastapi", - "uvicorn", - # Experiment trackers - "wandb", - "tensorboardx", - "swanlab", - "swanboard", - "trackio", - # Data / numerical - "datasets", - "peft", - "pandas", - "matplotlib", - "seaborn", - "numba", - "h5py", - "blosc", - "huggingface_hub", - # External LLM SDKs - "openai", - "anthropic", - "litellm", - "qwen_agent", - "openai_agents", - "claude_agent_sdk", - "openhands", - "langchain", - # CUDA / GPU stacks - "nvidia", - "cupy", - "triton", - # AReaL's own heavy subpackages — CLI must not transitively load them. - "areal.infra", - "areal.engine", - "areal.trainer", - "areal.workflow", - "areal.dataset", - "areal.reward", - "areal.api", -} - - -def _modules_after(import_stmt: str) -> set[str]: - """Spawn a fresh interpreter, run *import_stmt*, return ``sys.modules`` keys.""" - code = ( - "import sys, json\n" - f"{import_stmt}\n" - "print(json.dumps(sorted(sys.modules.keys())))\n" - ) - out = subprocess.check_output( - [sys.executable, "-c", code], - cwd=str(REPO_ROOT), - ) - last_line = out.decode().strip().splitlines()[-1] - return set(json.loads(last_line)) - - -def _leaks(modules: set[str]) -> set[str]: - leaked: set[str] = set() - for m in modules: - for f in FORBIDDEN_TOP_LEVEL: - # Exact match on a forbidden package (e.g. "areal.infra"), or - # any descendant (e.g. "areal.infra.launcher"). - if m == f or m.startswith(f + "."): - leaked.add(m) - break - return leaked - - -def test_cli_main_module_is_light(): - """Importing the CLI entry point must not load any heavy backend.""" - mods = _modules_after("import areal.experimental.cli.main") - leaked = _leaks(mods) - assert not leaked, ( - f"`import areal.experimental.cli.main` leaked heavy modules: " - f"{sorted(leaked)}" - ) - - -def test_build_parser_is_light(): - """Building the argparse tree must not load any heavy backend either.""" - mods = _modules_after( - "from areal.experimental.cli.main import build_parser\n" - "build_parser()" - ) - leaked = _leaks(mods) - assert not leaked, ( - f"`build_parser()` leaked heavy modules: {sorted(leaked)}" - ) - - -def test_each_namespace_help_is_light(): - """Triggering each namespace's --help path must stay light.""" - for ns in ("inf", "agent", "train", "weight-update"): - mods = _modules_after( - "from areal.experimental.cli.main import cli\n" - f"try:\n" - f" cli(['{ns}', '--help'])\n" - f"except SystemExit:\n" - f" pass\n" - ) - leaked = _leaks(mods) - assert not leaked, ( - f"`areal {ns} --help` leaked heavy modules: {sorted(leaked)}" - ) From 46646a8ea0cb078a7b6833bb02f22df556fec28e Mon Sep 17 00:00:00 2001 From: sitabulaixizawaluduo Date: Thu, 28 May 2026 23:08:47 +0800 Subject: [PATCH 3/6] chore: mirror [cli] extra and console-script into pyproject.vllm.toml --- pyproject.vllm.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pyproject.vllm.toml b/pyproject.vllm.toml index 4ab022c367..a79368b0c1 100644 --- a/pyproject.vllm.toml +++ b/pyproject.vllm.toml @@ -194,6 +194,19 @@ cuda = [ sandbox = [ "daytona>=0.167.0", ] +# Minimal extra for the lightweight `areal` CLI (login-node install). The CLI +# must NOT pull in torch / transformers / sglang / vllm / ray / megatron / ...; +# the lightness guard test in `tests/experimental/test_cli_lightness.py` +# enforces this invariant. The CLI only depends on PyYAML (to peek at config +# files for driver / scheduler resolution) and colorlog (for the project's +# colored loggers). +cli = [ + "PyYAML", + "colorlog", +] + +[project.scripts] +areal = "areal.experimental.cli.main:cli" [project.urls] "Homepage" = "https://github.com/areal-project/AReaL" From 822d43ee0ed6dd92a79175c062857d6dabaf7600 Mon Sep 17 00:00:00 2001 From: sitabulaixizawaluduo Date: Fri, 29 May 2026 14:11:40 +0800 Subject: [PATCH 4/6] chore: regenerate uv lockfiles after [cli] extra --- uv.lock | 8 +++++++- uv.vllm.lock | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/uv.lock b/uv.lock index e468608cc3..db1652123f 100644 --- a/uv.lock +++ b/uv.lock @@ -400,6 +400,10 @@ dependencies = [ ] [package.optional-dependencies] +cli = [ + { name = "colorlog", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pyyaml", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] cuda = [ { name = "flash-linear-attention", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "kernels", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -482,6 +486,7 @@ requires-dist = [ { name = "claude-agent-sdk" }, { name = "colorama" }, { name = "colorlog" }, + { name = "colorlog", marker = "extra == 'cli'" }, { name = "cookiecutter", specifier = ">2.1.1" }, { name = "datasets", specifier = ">=3.0.0" }, { name = "daytona", marker = "extra == 'sandbox'", specifier = ">=0.167.0" }, @@ -534,6 +539,7 @@ requires-dist = [ { name = "python-debian", specifier = ">=0.1.49" }, { name = "python-dotenv" }, { name = "pyyaml" }, + { name = "pyyaml", marker = "extra == 'cli'" }, { name = "pyzmq" }, { name = "qwen-agent" }, { name = "ray", extras = ["default"] }, @@ -573,7 +579,7 @@ requires-dist = [ { name = "word2number" }, { name = "zstandard" }, ] -provides-extras = ["sglang", "tms", "kernels", "megatron", "cuda-train", "cuda", "sandbox"] +provides-extras = ["sglang", "tms", "kernels", "megatron", "cuda-train", "cuda", "sandbox", "cli"] [package.metadata.requires-dev] dev = [ diff --git a/uv.vllm.lock b/uv.vllm.lock index 9e9b587e61..dab9badc06 100644 --- a/uv.vllm.lock +++ b/uv.vllm.lock @@ -448,6 +448,10 @@ dependencies = [ ] [package.optional-dependencies] +cli = [ + { name = "colorlog", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pyyaml", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] cuda = [ { name = "flash-linear-attention", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "kernels", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -528,6 +532,7 @@ requires-dist = [ { name = "claude-agent-sdk" }, { name = "colorama" }, { name = "colorlog" }, + { name = "colorlog", marker = "extra == 'cli'" }, { name = "cookiecutter", specifier = ">2.1.1" }, { name = "datasets", specifier = ">=3.0.0" }, { name = "daytona", marker = "extra == 'sandbox'", specifier = ">=0.167.0" }, @@ -579,6 +584,7 @@ requires-dist = [ { name = "python-debian", specifier = ">=0.1.49" }, { name = "python-dotenv" }, { name = "pyyaml" }, + { name = "pyyaml", marker = "extra == 'cli'" }, { name = "pyzmq" }, { name = "qwen-agent" }, { name = "ray", extras = ["default"] }, @@ -619,7 +625,7 @@ requires-dist = [ { name = "word2number" }, { name = "zstandard" }, ] -provides-extras = ["vllm", "tms", "kernels", "megatron", "cuda-train", "cuda", "sandbox"] +provides-extras = ["vllm", "tms", "kernels", "megatron", "cuda-train", "cuda", "sandbox", "cli"] [package.metadata.requires-dev] dev = [ From ec73c8fdb027d625bec598cee68ef9998a125fb9 Mon Sep 17 00:00:00 2001 From: sitabulaixizawaluduo Date: Fri, 29 May 2026 14:15:25 +0800 Subject: [PATCH 5/6] fix: reformat --- areal/__init__.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/areal/__init__.py b/areal/__init__.py index 44d703638d..e6edb1a6a7 100644 --- a/areal/__init__.py +++ b/areal/__init__.py @@ -5,21 +5,25 @@ from .version import __version__ # noqa -_INFRA_NAMES = frozenset({ - "RolloutController", - "StalenessManager", - "TrainController", - "WorkflowExecutor", - "current_platform", - "workflow_context", -}) - -_TRAINER_NAMES = frozenset({ - "DPOTrainer", - "PPOTrainer", - "RWTrainer", - "SFTTrainer", -}) +_INFRA_NAMES = frozenset( + { + "RolloutController", + "StalenessManager", + "TrainController", + "WorkflowExecutor", + "current_platform", + "workflow_context", + } +) + +_TRAINER_NAMES = frozenset( + { + "DPOTrainer", + "PPOTrainer", + "RWTrainer", + "SFTTrainer", + } +) def __getattr__(name: str): From 9110c53bdf52272fb62c9aa29518739fae76bcde Mon Sep 17 00:00:00 2001 From: sitabulaixizawaluduo Date: Fri, 29 May 2026 15:56:35 +0800 Subject: [PATCH 6/6] chore: ruff --fix on full repo --- areal/experimental/cli/commands/agent/__init__.py | 1 - areal/experimental/cli/commands/inf/__init__.py | 1 - areal/experimental/cli/commands/train/__init__.py | 1 - areal/experimental/cli/commands/weight_update/__init__.py | 1 - areal/experimental/cli/main.py | 4 +--- 5 files changed, 1 insertion(+), 7 deletions(-) diff --git a/areal/experimental/cli/commands/agent/__init__.py b/areal/experimental/cli/commands/agent/__init__.py index 3d8521f1e7..6b83926151 100644 --- a/areal/experimental/cli/commands/agent/__init__.py +++ b/areal/experimental/cli/commands/agent/__init__.py @@ -16,7 +16,6 @@ import argparse - _DESCRIPTION = """\ Operate an agent service: gateway + router + (worker, data-proxy) pairs. Session-centric: the primary unit of interaction is an agent session, diff --git a/areal/experimental/cli/commands/inf/__init__.py b/areal/experimental/cli/commands/inf/__init__.py index f860517890..71287a6fe1 100644 --- a/areal/experimental/cli/commands/inf/__init__.py +++ b/areal/experimental/cli/commands/inf/__init__.py @@ -12,7 +12,6 @@ import argparse - _DESCRIPTION = """\ Operate an inference service: gateway + router + optional model backends. diff --git a/areal/experimental/cli/commands/train/__init__.py b/areal/experimental/cli/commands/train/__init__.py index dab09229c5..94a87e1008 100644 --- a/areal/experimental/cli/commands/train/__init__.py +++ b/areal/experimental/cli/commands/train/__init__.py @@ -16,7 +16,6 @@ import argparse - _DESCRIPTION = """\ Submit and observe training jobs. Job-shaped (terminates), not service-shaped — the CLI wraps lifecycle only and does not choose the diff --git a/areal/experimental/cli/commands/weight_update/__init__.py b/areal/experimental/cli/commands/weight_update/__init__.py index b1e96c0e27..fa9de4eb44 100644 --- a/areal/experimental/cli/commands/weight_update/__init__.py +++ b/areal/experimental/cli/commands/weight_update/__init__.py @@ -21,7 +21,6 @@ import argparse - _DESCRIPTION = """\ Diagnose the weight-update service that bridges training and inference. diff --git a/areal/experimental/cli/main.py b/areal/experimental/cli/main.py index 1608a7a229..2bbd134b25 100644 --- a/areal/experimental/cli/main.py +++ b/areal/experimental/cli/main.py @@ -20,13 +20,11 @@ import argparse import sys -from areal.version import __version__ - from areal.experimental.cli.commands import agent as cmd_agent from areal.experimental.cli.commands import inf as cmd_inf from areal.experimental.cli.commands import train as cmd_train from areal.experimental.cli.commands import weight_update as cmd_weight_update - +from areal.version import __version__ _DESCRIPTION = """\ AReaL operator CLI for the v2 microservice architecture.