From a37a980ef7a6eb85ea6a8f807be26fbd947aefea Mon Sep 17 00:00:00 2001
From: Jiachen Zhang <zjc462490@alibaba-inc.com>
Date: Tue, 16 Jun 2026 09:58:14 +0000
Subject: [PATCH 1/2] refactor(cpu-overcommit): extract apply_cpu_overcommit to
 shared module

Move the inline _apply_cpu_overcommit_default from sandbox_api.py to
rock/common/cpu_overcommit.py so both start and restart flows can reuse
the same overcommit logic. The new function takes nacos_provider as an
explicit parameter instead of reading from the module-level global.
---
 rock/admin/entrypoints/sandbox_api.py | 45 +++------------------------
 rock/common/cpu_overcommit.py         | 42 +++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 41 deletions(-)
 create mode 100644 rock/common/cpu_overcommit.py

diff --git a/rock/admin/entrypoints/sandbox_api.py b/rock/admin/entrypoints/sandbox_api.py
index 5a59883461..4acb43d5f3 100644
--- a/rock/admin/entrypoints/sandbox_api.py
+++ b/rock/admin/entrypoints/sandbox_api.py
@@ -1,4 +1,3 @@
-import math
 import re
 import time
 from typing import Annotated, Any
@@ -30,8 +29,6 @@
 )
 from rock.admin.proto.response import SandboxStartResponse
 from rock.common.constants import (
-    CPU_OVERCOMMIT_ALLOWED_KEYS_KEY,
-    CPU_OVERCOMMIT_HEADROOM_KEY,
     EXTRA_ACCELERATOR_TYPES_KEY,
     GET_STATUS_SWITCH,
     KATA_DIND_DISK_SIZE_KEY,
@@ -39,6 +36,7 @@
     SANDBOX_DISK_LIMIT_ROOTFS_KEY,
     SUPPORT_KATA_SWITCH,
 )
+from rock.common.cpu_overcommit import apply_cpu_overcommit
 from rock.common.exception import handle_exceptions
 from rock.common.validation import NonBlankStr
 from rock.config import ImageRegistryMirror
@@ -275,43 +273,6 @@ async def _apply_accelerator_type_validation(config: DockerDeploymentConfig) ->
         )
 
 
-async def _apply_cpu_overcommit_default(config: DockerDeploymentConfig, rock_authorization: str | None) -> None:
-    """Derive limit_cpus from cpus + Nacos headroom when SDK did not set it.
-
-    Formula: limit_cpus = min(2 * cpus, cpus + headroom)
-    - SDK-supplied limit_cpus always wins (function is a no-op in that case).
-    - Grayscale gate driven by Nacos list `cpu_overcommit_allowed_keys`:
-      * key absent from Nacos -> gate is open for every caller (full rollout).
-      * key present as a list  -> only `rock_authorization` values in the list pass.
-      * key present but not a list (misconfigured) -> gate closed.
-    - headroom is read from Nacos key `cpu_overcommit_headroom` (default 0).
-    - headroom <= 0 keeps limit_cpus = None (docker run gets no --cpus flag).
-    """
-    if config.limit_cpus is not None:
-        return
-
-    nacos = sandbox_manager.rock_config.nacos_provider
-    if nacos is None:
-        return
-
-    nacos_config = await nacos.get_config() or {}
-    allowed_keys = nacos_config.get(CPU_OVERCOMMIT_ALLOWED_KEYS_KEY)
-    if allowed_keys is not None and (not isinstance(allowed_keys, list) or rock_authorization not in allowed_keys):
-        return
-
-    raw = nacos_config.get(CPU_OVERCOMMIT_HEADROOM_KEY)
-    try:
-        headroom = float(raw) if raw is not None else 0.0
-    except (TypeError, ValueError):
-        headroom = 0.0
-
-    # Reject NaN / inf so a fat-fingered Nacos edit can't break sandbox startup
-    if not math.isfinite(headroom) or headroom <= 0:
-        return
-
-    config.limit_cpus = min(2 * config.cpus, config.cpus + headroom)
-
-
 @sandbox_router.post("/start")
 @handle_exceptions(error_message="start sandbox failed")
 async def start(request: SandboxStartRequest) -> RockResponse[SandboxStartResponse]:
@@ -335,7 +296,9 @@ async def start_async(
     await _apply_accelerator_type_validation(config)
     await _apply_kata_runtime_switch(config)
     await _apply_kata_disk_size(config)
-    await _apply_cpu_overcommit_default(config, headers.user_info.get("rock_authorization"))
+    await apply_cpu_overcommit(
+        config, sandbox_manager.rock_config.nacos_provider, headers.user_info.get("rock_authorization")
+    )
     await _apply_disk_limits(config)
     await _apply_image_registry_mirror(config)
     sandbox_start_response = await sandbox_manager.start_async(
diff --git a/rock/common/cpu_overcommit.py b/rock/common/cpu_overcommit.py
new file mode 100644
index 0000000000..3b4e134a91
--- /dev/null
+++ b/rock/common/cpu_overcommit.py
@@ -0,0 +1,42 @@
+import math
+
+from rock.common.constants import CPU_OVERCOMMIT_ALLOWED_KEYS_KEY, CPU_OVERCOMMIT_HEADROOM_KEY
+from rock.deployments.config import DockerDeploymentConfig
+
+
+async def apply_cpu_overcommit(
+    config: DockerDeploymentConfig, nacos_provider, rock_authorization: str | None = None
+) -> None:
+    """Derive limit_cpus from cpus + Nacos headroom when not explicitly set.
+
+    Formula: limit_cpus = min(2 * cpus, cpus + headroom)
+    - SDK-supplied limit_cpus always wins (function is a no-op in that case).
+    - Grayscale gate driven by Nacos list `cpu_overcommit_allowed_keys`:
+      * key absent from Nacos -> gate is open for every caller (full rollout).
+      * key present as a list  -> only `rock_authorization` values in the list pass.
+      * key present but not a list (misconfigured) -> gate closed.
+    - headroom is read from Nacos key `cpu_overcommit_headroom` (default 0).
+    - headroom <= 0 keeps limit_cpus = None (docker run gets no --cpus flag).
+    """
+    if config.limit_cpus is not None:
+        return
+
+    if nacos_provider is None:
+        return
+
+    nacos_config = await nacos_provider.get_config() or {}
+    allowed_keys = nacos_config.get(CPU_OVERCOMMIT_ALLOWED_KEYS_KEY)
+    if allowed_keys is not None and (not isinstance(allowed_keys, list) or rock_authorization not in allowed_keys):
+        return
+
+    raw = nacos_config.get(CPU_OVERCOMMIT_HEADROOM_KEY)
+    try:
+        headroom = float(raw) if raw is not None else 0.0
+    except (TypeError, ValueError):
+        headroom = 0.0
+
+    # Reject NaN / inf so a fat-fingered Nacos edit can't break sandbox startup
+    if not math.isfinite(headroom) or headroom <= 0:
+        return
+
+    config.limit_cpus = min(2 * config.cpus, config.cpus + headroom)

From addb51f5962ef7b9fb7e76b0b0948eb22ccc35a1 Mon Sep 17 00:00:00 2001
From: Jiachen Zhang <zjc462490@alibaba-inc.com>
Date: Tue, 16 Jun 2026 09:59:00 +0000
Subject: [PATCH 2/2] feat(restart): support cpu/memory/limit_cpus resource
 updates on restart

- Add limit_cpus to SandboxRestartRequest and restart_async
- Build DockerDeploymentConfig from persisted spec in restart_async,
  apply resource overrides, validate via validate_sandbox_spec
- Reuse apply_cpu_overcommit when cpus changes without explicit limit_cpus
- Simplify on_restart to receive a fully-built config instead of raw
  resource_overrides dict
- Add _docker_update_resources (reuses _cpus/_memory) to apply resource
  changes to stopped containers before docker start
- Add unit tests for _docker_update_resources and restart resource flows
---
 rock/admin/entrypoints/sandbox_api.py         |   7 +-
 rock/admin/proto/request.py                   |   7 +
 rock/deployments/docker.py                    |  11 ++
 rock/sandbox/sandbox_manager.py               |  41 +++++-
 rock/sandbox/sandbox_statemachine.py          |  26 ++--
 .../entrypoints/test_param_validation.py      |  52 +++++++
 .../test_docker_update_resources.py           |  85 +++++++++++
 .../unit/sandbox/test_sandbox_statemachine.py | 135 +++++++++++++++++-
 8 files changed, 341 insertions(+), 23 deletions(-)
 create mode 100644 tests/unit/deployments/test_docker_update_resources.py

diff --git a/rock/admin/entrypoints/sandbox_api.py b/rock/admin/entrypoints/sandbox_api.py
index 4acb43d5f3..dc95eef522 100644
--- a/rock/admin/entrypoints/sandbox_api.py
+++ b/rock/admin/entrypoints/sandbox_api.py
@@ -23,6 +23,7 @@
     SandboxCommand,
     SandboxCreateBashSessionRequest,
     SandboxReadFileRequest,
+    SandboxRestartRequest,
     SandboxStartRequest,
     SandboxWriteFileRequest,
     StartHeaders,
@@ -417,8 +418,10 @@ async def delete(sandbox_id: str = Body(..., embed=True)) -> RockResponse:
 
 @sandbox_router.post("/restart")
 @handle_exceptions(error_message="restart sandbox failed")
-async def restart(sandbox_id: str = Body(..., embed=True)) -> RockResponse[SandboxStartResponse]:
-    result = await sandbox_manager.restart_async(sandbox_id)
+async def restart(request: SandboxRestartRequest) -> RockResponse[SandboxStartResponse]:
+    result = await sandbox_manager.restart_async(
+        request.sandbox_id, cpus=request.cpus, memory=request.memory, limit_cpus=request.limit_cpus
+    )
     return RockResponse(result=result)
 
 
diff --git a/rock/admin/proto/request.py b/rock/admin/proto/request.py
index 99398a27a2..85af50c22e 100644
--- a/rock/admin/proto/request.py
+++ b/rock/admin/proto/request.py
@@ -46,6 +46,13 @@ class SandboxStartRequest(BaseModel):
     """GPU accelerator type (e.g. 'A100', 'V100'). If not specified, any available GPU will be used."""
 
 
+class SandboxRestartRequest(BaseModel):
+    sandbox_id: NonBlankStr
+    cpus: float | None = None
+    memory: str | None = None
+    limit_cpus: float | None = None
+
+
 class SandboxCommand(Command):
     timeout: float | None = 1200
     """The timeout for the command. None means no timeout."""
diff --git a/rock/deployments/docker.py b/rock/deployments/docker.py
index 5b6fe6574f..5844f999a5 100644
--- a/rock/deployments/docker.py
+++ b/rock/deployments/docker.py
@@ -662,6 +662,15 @@ def _docker_create(self, cmd: list[str]) -> None:
             )
             raise
 
+    def _docker_update_resources(self) -> None:
+        """Apply cpu/memory resource updates to a stopped container via docker update."""
+        cmd = ["docker", "update"] + self._cpus() + self._memory() + [self._container_name]
+        logger.info(f"Updating container resources: {' '.join(cmd)}")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            logger.error(f"docker update failed: {result.stderr}")
+            raise RuntimeError(f"docker update failed for {self._container_name}: {result.stderr}")
+
     def _docker_start(self) -> subprocess.Popen:
         """Start a previously-created container with stdout/stderr attached."""
         try:
@@ -706,6 +715,8 @@ async def restart(self):
 
         logger.info(f"Restarting container {self._container_name} with docker start")
 
+        await loop.run_in_executor(executor, self._docker_update_resources)
+
         # Reuse the same Popen-based attached start used by start(), so the
         # restart path also produces a valid self._container_process. Without
         # this, _stop() would skip its `if self._container_process is not None`
diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py
index 1c1ff3070a..454fc21aa4 100644
--- a/rock/sandbox/sandbox_manager.py
+++ b/rock/sandbox/sandbox_manager.py
@@ -26,6 +26,7 @@
 from rock.admin.proto.request import SandboxWriteFileRequest as WriteFileRequest
 from rock.admin.proto.response import SandboxStartResponse, SandboxStatusResponse
 from rock.common.constants import DeleteReason, StopReason
+from rock.common.cpu_overcommit import apply_cpu_overcommit
 from rock.config import RockConfig, RuntimeConfig
 from rock.deployments.config import DeploymentConfig, DockerDeploymentConfig
 from rock.logger import init_logger
@@ -152,7 +153,13 @@ async def start_async(
         )
 
     @monitor_sandbox_operation()
-    async def restart_async(self, sandbox_id: str) -> SandboxStartResponse:
+    async def restart_async(
+        self,
+        sandbox_id: str,
+        cpus: float | None = None,
+        memory: str | None = None,
+        limit_cpus: float | None = None,
+    ) -> SandboxStartResponse:
         sm = await self._get_current_statemachine(sandbox_id)
         if sm is None:
             raise BadRequestRockError(f"Sandbox {sandbox_id} not found")
@@ -161,11 +168,43 @@ async def restart_async(self, sandbox_id: str) -> SandboxStartResponse:
         if state != State.STOPPED:
             raise BadRequestRockError(f"Sandbox {sandbox_id} cannot be restarted: current state is '{state.value}'")
 
+        info = sm.sandbox_info or {}
+        spec = info.get("spec") or {}
+        if spec:
+            restart_config = DockerDeploymentConfig(**spec)
+        else:
+            logger.warning(
+                f"sandbox {sandbox_id} has no spec snapshot; rebuilding config from flat fields with model defaults"
+            )
+            restart_config = DockerDeploymentConfig(
+                container_name=sandbox_id,
+                image=info.get("image") or DockerDeploymentConfig.model_fields["image"].default,
+                memory=info.get("memory") or DockerDeploymentConfig.model_fields["memory"].default,
+                cpus=float(info.get("cpus") or DockerDeploymentConfig.model_fields["cpus"].default),
+            )
+
+        if cpus is not None:
+            restart_config.cpus = cpus
+        if limit_cpus is not None:
+            restart_config.limit_cpus = limit_cpus
+        if memory is not None:
+            restart_config.memory = memory
+        if cpus is not None and limit_cpus is None:
+            if restart_config.limit_cpus is not None:
+                logger.warning(
+                    f"restart {sandbox_id}: cpus changed to {cpus} but limit_cpus not provided, "
+                    f"clearing previous limit_cpus={restart_config.limit_cpus}"
+                )
+                restart_config.limit_cpus = None
+            await apply_cpu_overcommit(restart_config, self.rock_config.nacos_provider)
+        self.validate_sandbox_spec(self.rock_config.runtime, restart_config)
+
         await sm.send(
             "restart",
             sandbox_id=sandbox_id,
             operator=self._operator,
             meta_store=self._meta_store,
+            restart_config=restart_config,
         )
 
         info: SandboxInfo = sm.sandbox_info or {}
diff --git a/rock/sandbox/sandbox_statemachine.py b/rock/sandbox/sandbox_statemachine.py
index 105b2423d9..ba4a301912 100644
--- a/rock/sandbox/sandbox_statemachine.py
+++ b/rock/sandbox/sandbox_statemachine.py
@@ -100,29 +100,19 @@ async def on_alive(self, sandbox_id: str, meta_store, sandbox_info: SandboxInfo)
             sandbox_info["start_time"] = get_iso8601_timestamp()
         await meta_store.update(sandbox_id, sandbox_info)
 
-    async def on_restart(self, sandbox_id: str, operator, meta_store) -> None:
+    async def on_restart(
+        self,
+        sandbox_id: str,
+        operator,
+        meta_store,
+        restart_config: DockerDeploymentConfig,
+    ) -> None:
         info = self.sandbox_info or {}
 
         host_ip = info.get("host_ip")
         if not host_ip:
             raise BadRequestRockError(f"Sandbox {sandbox_id} has no host_ip; cannot pin restart to original node")
 
-        # Prefer the spec snapshot (DockerDeploymentConfig.model_dump persisted to
-        # the DB at start time) so the new actor wraps the existing container with
-        # the exact same config.
-        spec = info.get("spec") or {}
-        if spec:
-            restart_config = DockerDeploymentConfig(**spec)
-        else:
-            logger.warning(
-                f"sandbox {sandbox_id} has no spec snapshot; rebuilding config from flat fields with model defaults"
-            )
-            restart_config = DockerDeploymentConfig(
-                container_name=sandbox_id,
-                image=info.get("image") or DockerDeploymentConfig.model_fields["image"].default,
-                memory=info.get("memory") or DockerDeploymentConfig.model_fields["memory"].default,
-                cpus=float(info.get("cpus") or DockerDeploymentConfig.model_fields["cpus"].default),
-            )
         timeout_info = SandboxTimeoutHelper.make_timeout_info(restart_config.auto_clear_time)
 
         logger.info(f"restart sandbox {sandbox_id} (pin host_ip={host_ip})")
@@ -135,6 +125,8 @@ async def on_restart(self, sandbox_id: str, operator, meta_store) -> None:
         # fields (spec/status) won't pollute the alive key.
         new_info = dict(info)
         new_info["state"] = RockState.PENDING
+        new_info["cpus"] = restart_config.cpus
+        new_info["memory"] = restart_config.memory
         new_info.pop("stop_time", None)
         await meta_store.update(sandbox_id, new_info)
         await meta_store.update_timeout(sandbox_id, timeout_info)
diff --git a/tests/unit/admin/entrypoints/test_param_validation.py b/tests/unit/admin/entrypoints/test_param_validation.py
index 5f80600278..f14778ea03 100644
--- a/tests/unit/admin/entrypoints/test_param_validation.py
+++ b/tests/unit/admin/entrypoints/test_param_validation.py
@@ -255,3 +255,55 @@ async def test_gem_close_empty_sandbox_id(gem_app):
     async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
         _assert_failed(await client.post("/close", json={"sandbox_id": ""}))
         _assert_failed(await client.post("/close", json={"sandbox_id": "   "}))
+
+
+# --- restart backward compatibility ---
+
+
+@pytest.mark.asyncio
+async def test_restart_old_sdk_only_sandbox_id(sandbox_app):
+    """Old SDK sends only {"sandbox_id": "xxx"} — must still work."""
+    app, mock_manager = sandbox_app
+    mock_manager.restart_async = AsyncMock(
+        return_value=SandboxStartResponse(sandbox_id="sb-1", host_name="h", host_ip="1.2.3.4")
+    )
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+        resp = await client.post("/restart", json={"sandbox_id": "sb-1"})
+        assert resp.status_code == 200
+        assert resp.json()["status"] == "Success"
+        mock_manager.restart_async.assert_called_once_with("sb-1", cpus=None, memory=None, limit_cpus=None)
+
+
+@pytest.mark.asyncio
+async def test_restart_new_sdk_with_resources(sandbox_app):
+    """New SDK sends {"sandbox_id": "xxx", "cpus": 4, "memory": "8g"}."""
+    app, mock_manager = sandbox_app
+    mock_manager.restart_async = AsyncMock(
+        return_value=SandboxStartResponse(sandbox_id="sb-1", host_name="h", host_ip="1.2.3.4")
+    )
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+        resp = await client.post("/restart", json={"sandbox_id": "sb-1", "cpus": 4, "memory": "8g"})
+        assert resp.status_code == 200
+        assert resp.json()["status"] == "Success"
+        mock_manager.restart_async.assert_called_once_with("sb-1", cpus=4.0, memory="8g", limit_cpus=None)
+
+
+@pytest.mark.asyncio
+async def test_restart_with_limit_cpus(sandbox_app):
+    """New SDK sends {"sandbox_id": "xxx", "cpus": 2, "limit_cpus": 4}."""
+    app, mock_manager = sandbox_app
+    mock_manager.restart_async = AsyncMock(
+        return_value=SandboxStartResponse(sandbox_id="sb-1", host_name="h", host_ip="1.2.3.4")
+    )
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+        resp = await client.post("/restart", json={"sandbox_id": "sb-1", "cpus": 2, "limit_cpus": 4})
+        assert resp.status_code == 200
+        assert resp.json()["status"] == "Success"
+        mock_manager.restart_async.assert_called_once_with("sb-1", cpus=2.0, memory=None, limit_cpus=4.0)
+
+
+@pytest.mark.asyncio
+async def test_restart_empty_sandbox_id(sandbox_app):
+    app, _ = sandbox_app
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+        _assert_failed(await client.post("/restart", json={"sandbox_id": ""}))
diff --git a/tests/unit/deployments/test_docker_update_resources.py b/tests/unit/deployments/test_docker_update_resources.py
new file mode 100644
index 0000000000..b3eae1c779
--- /dev/null
+++ b/tests/unit/deployments/test_docker_update_resources.py
@@ -0,0 +1,85 @@
+"""Unit tests for DockerDeployment._docker_update_resources()."""
+
+from unittest.mock import patch
+
+import pytest
+
+from rock.deployments.config import DockerDeploymentConfig
+from rock.deployments.docker import DockerDeployment
+
+
+@pytest.fixture
+def _patch_validator():
+    with patch("rock.deployments.docker.DockerSandboxValidator"):
+        yield
+
+
+@pytest.mark.usefixtures("_patch_validator")
+class TestDockerUpdateResources:
+    """Tests for _docker_update_resources command construction and error handling."""
+
+    def _make_deployment(self, **kwargs) -> DockerDeployment:
+        config = DockerDeploymentConfig(container_name="test-sandbox", **kwargs)
+        return DockerDeployment.from_config(config)
+
+    def test_without_limit_cpus(self):
+        deployment = self._make_deployment(cpus=2, memory="4g")
+        with patch("rock.deployments.docker.subprocess.run") as mock_run:
+            mock_run.return_value.returncode = 0
+            deployment._docker_update_resources()
+
+        cmd = mock_run.call_args[0][0]
+        assert cmd == [
+            "docker", "update",
+            "--cpus=2.0",
+            "--memory=4g",
+            "--memory-swap=4g",
+            "test-sandbox",
+        ]
+
+    def test_with_limit_cpus(self):
+        deployment = self._make_deployment(cpus=2, limit_cpus=4, memory="4g")
+        with patch("rock.deployments.docker.subprocess.run") as mock_run:
+            mock_run.return_value.returncode = 0
+            deployment._docker_update_resources()
+
+        cmd = mock_run.call_args[0][0]
+        assert cmd == [
+            "docker", "update",
+            "--cpu-shares=2048",
+            "--cpus=4.0",
+            "--memory=4g",
+            "--memory-swap=4g",
+            "test-sandbox",
+        ]
+
+    def test_fractional_cpus_shares(self):
+        deployment = self._make_deployment(cpus=0.5, limit_cpus=1.5, memory="2g")
+        with patch("rock.deployments.docker.subprocess.run") as mock_run:
+            mock_run.return_value.returncode = 0
+            deployment._docker_update_resources()
+
+        cmd = mock_run.call_args[0][0]
+        assert "--cpu-shares=512" in cmd
+        assert "--cpus=1.5" in cmd
+
+    def test_failure_raises_runtime_error(self):
+        deployment = self._make_deployment(cpus=2, memory="4g")
+        with patch("rock.deployments.docker.subprocess.run") as mock_run:
+            mock_run.return_value.returncode = 1
+            mock_run.return_value.stderr = "no such container"
+            with pytest.raises(RuntimeError, match="docker update failed"):
+                deployment._docker_update_resources()
+
+    def test_consistent_with_cpus_method(self):
+        """_docker_update_resources cpu flags should match _cpus() output."""
+        for cpus, limit_cpus in [(2, None), (2, 4), (0.5, 1.5)]:
+            deployment = self._make_deployment(cpus=cpus, limit_cpus=limit_cpus, memory="4g")
+            with patch("rock.deployments.docker.subprocess.run") as mock_run:
+                mock_run.return_value.returncode = 0
+                deployment._docker_update_resources()
+
+            update_cmd = mock_run.call_args[0][0]
+            cpu_flags_from_update = [f for f in update_cmd if "cpu" in f.lower()]
+            cpu_flags_from_cpus = deployment._cpus()
+            assert cpu_flags_from_update == cpu_flags_from_cpus
diff --git a/tests/unit/sandbox/test_sandbox_statemachine.py b/tests/unit/sandbox/test_sandbox_statemachine.py
index 54f93b4952..461fe583b0 100644
--- a/tests/unit/sandbox/test_sandbox_statemachine.py
+++ b/tests/unit/sandbox/test_sandbox_statemachine.py
@@ -239,12 +239,21 @@ async def test_stop_time_always_written_even_when_start_failed(self, mock_operat
 }
 
 
+def _default_restart_config(**overrides):
+    from rock.deployments.config import DockerDeploymentConfig
+
+    kwargs = dict(_VALID_RESTART_INFO["spec"])
+    kwargs.update(overrides)
+    return DockerDeploymentConfig(**kwargs)
+
+
 class TestRestartTransitions:
     def _restart_kwargs(self, meta_store=None):
         return dict(
             sandbox_id="sb",
             operator=AsyncMock(),
             meta_store=meta_store or AsyncMock(),
+            restart_config=_default_restart_config(),
         )
 
     @pytest.mark.asyncio
@@ -276,7 +285,7 @@ class TestOnRestart:
     def mock_meta_store(self):
         return AsyncMock()
 
-    async def _send_restart(self, mock_meta_store, sandbox_info=None):
+    async def _send_restart(self, mock_meta_store, sandbox_info=None, config=None):
         info = sandbox_info if sandbox_info is not None else dict(_VALID_RESTART_INFO)
         sm = await SandboxStateMachine.from_state_value(State.STOPPED, sandbox_info=info)
         await sm.send(
@@ -284,6 +293,7 @@ async def _send_restart(self, mock_meta_store, sandbox_info=None):
             sandbox_id="sb-1",
             operator=AsyncMock(),
             meta_store=mock_meta_store,
+            restart_config=config or _default_restart_config(),
         )
 
     @pytest.mark.asyncio
@@ -300,15 +310,134 @@ async def test_updates_state_to_pending(self, mock_meta_store):
 
     @pytest.mark.asyncio
     async def test_writes_timeout_built_from_spec(self, mock_meta_store):
-        # auto_clear_time_minutes=30 in spec → make_timeout_info uses 30
         await self._send_restart(mock_meta_store)
         mock_meta_store.update_timeout.assert_awaited_once()
         sandbox_id, timeout_info = mock_meta_store.update_timeout.call_args[0]
         assert sandbox_id == "sb-1"
-        # SandboxTimeoutHelper.make_timeout_info stores auto_clear_time as the env-var key
         assert any("30" == str(v) for v in timeout_info.values())
 
 
+# ---------------------------------------------------------------------------
+# on_restart resource overrides
+# ---------------------------------------------------------------------------
+
+
+class TestOnRestartResourceOverrides:
+    @pytest.fixture
+    def mock_meta_store(self):
+        return AsyncMock()
+
+    async def _send_restart(self, mock_meta_store, config=None, sandbox_info=None):
+        info = sandbox_info if sandbox_info is not None else dict(_VALID_RESTART_INFO)
+        if "spec" in info:
+            info = {**info, "spec": dict(info["spec"])}
+        sm = await SandboxStateMachine.from_state_value(State.STOPPED, sandbox_info=info)
+        mock_operator = AsyncMock()
+        await sm.send(
+            "restart",
+            sandbox_id="sb-1",
+            operator=mock_operator,
+            meta_store=mock_meta_store,
+            restart_config=config or _default_restart_config(),
+        )
+        return mock_operator
+
+    @pytest.mark.asyncio
+    async def test_config_passed_to_operator(self, mock_meta_store):
+        config = _default_restart_config(cpus=4)
+        mock_operator = await self._send_restart(mock_meta_store, config=config)
+        restart_config = mock_operator.restart.call_args[0][0]
+        assert restart_config.cpus == 4
+
+    @pytest.mark.asyncio
+    async def test_no_change_uses_original_config(self, mock_meta_store):
+        mock_operator = await self._send_restart(mock_meta_store)
+        restart_config = mock_operator.restart.call_args[0][0]
+        assert restart_config.cpus == 1
+        assert restart_config.memory == "2g"
+
+    @pytest.mark.asyncio
+    async def test_limit_cpus_passed_to_operator(self, mock_meta_store):
+        config = _default_restart_config(cpus=4, limit_cpus=6)
+        mock_operator = await self._send_restart(mock_meta_store, config=config)
+        restart_config = mock_operator.restart.call_args[0][0]
+        assert restart_config.cpus == 4
+        assert restart_config.limit_cpus == 6
+
+    @pytest.mark.asyncio
+    async def test_update_persists_cpus_and_memory(self, mock_meta_store):
+        config = _default_restart_config(cpus=8, memory="32g")
+        await self._send_restart(mock_meta_store, config=config)
+        mock_meta_store.update.assert_awaited_once()
+        updated_info = mock_meta_store.update.call_args[0][1]
+        assert updated_info["cpus"] == 8
+        assert updated_info["memory"] == "32g"
+        assert updated_info["state"] == State.PENDING
+
+
+# ---------------------------------------------------------------------------
+# restart resource validation (reuses SandboxManager.validate_sandbox_spec)
+# ---------------------------------------------------------------------------
+
+
+class TestRestartResourceValidation:
+    @pytest.fixture
+    def sandbox_manager(self):
+        from unittest.mock import MagicMock
+
+        from rock.config import RockConfig, RuntimeConfig, StandardSpec
+        from rock.sandbox.sandbox_manager import SandboxManager
+
+        mgr = MagicMock()
+        mgr.rock_config = MagicMock(spec=RockConfig)
+        mgr.rock_config.runtime = MagicMock(spec=RuntimeConfig)
+        mgr.rock_config.runtime.max_allowed_spec = StandardSpec(cpus=16, memory="64g")
+        mgr.validate_sandbox_spec = SandboxManager.validate_sandbox_spec.__get__(mgr)
+        return mgr
+
+    def _make_config(self, cpus=2, memory="8g"):
+        from rock.deployments.config import DockerDeploymentConfig
+
+        return DockerDeploymentConfig(container_name="sb-1", image="python:3.11", cpus=cpus, memory=memory)
+
+    def test_cpus_within_limit_passes(self, sandbox_manager):
+        sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(cpus=8))
+
+    def test_cpus_exceeds_limit_raises(self, sandbox_manager):
+        from rock.sdk.common.exceptions import BadRequestRockError
+
+        with pytest.raises(BadRequestRockError, match="exceed the maximum allowed"):
+            sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(cpus=32))
+
+    def test_memory_within_limit_passes(self, sandbox_manager):
+        sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(memory="32g"))
+
+    def test_memory_exceeds_limit_raises(self, sandbox_manager):
+        from rock.sdk.common.exceptions import BadRequestRockError
+
+        with pytest.raises(BadRequestRockError, match="exceed the maximum allowed"):
+            sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(memory="128g"))
+
+    def test_invalid_memory_format_raises(self, sandbox_manager):
+        from rock.sdk.common.exceptions import BadRequestRockError
+
+        with pytest.raises(BadRequestRockError, match="Invalid memory size"):
+            sandbox_manager.validate_sandbox_spec(
+                sandbox_manager.rock_config.runtime, self._make_config(memory="not_a_size")
+            )
+
+    def test_both_within_limit_passes(self, sandbox_manager):
+        sandbox_manager.validate_sandbox_spec(
+            sandbox_manager.rock_config.runtime, self._make_config(cpus=16, memory="64g")
+        )
+
+    def test_cpus_at_boundary_passes(self, sandbox_manager):
+        sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(cpus=16))
+
+    def test_memory_at_boundary_passes(self, sandbox_manager):
+        sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(memory="64g"))
+
+
 # ---------------------------------------------------------------------------
 # delete transitions
 # ---------------------------------------------------------------------------