From a37a980ef7a6eb85ea6a8f807be26fbd947aefea Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Tue, 16 Jun 2026 09:58:14 +0000 Subject: [PATCH 1/2] refactor(cpu-overcommit): extract apply_cpu_overcommit to shared module Move the inline _apply_cpu_overcommit_default from sandbox_api.py to rock/common/cpu_overcommit.py so both start and restart flows can reuse the same overcommit logic. The new function takes nacos_provider as an explicit parameter instead of reading from the module-level global. --- rock/admin/entrypoints/sandbox_api.py | 45 +++------------------------ rock/common/cpu_overcommit.py | 42 +++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 41 deletions(-) create mode 100644 rock/common/cpu_overcommit.py diff --git a/rock/admin/entrypoints/sandbox_api.py b/rock/admin/entrypoints/sandbox_api.py index 5a59883461..4acb43d5f3 100644 --- a/rock/admin/entrypoints/sandbox_api.py +++ b/rock/admin/entrypoints/sandbox_api.py @@ -1,4 +1,3 @@ -import math import re import time from typing import Annotated, Any @@ -30,8 +29,6 @@ ) from rock.admin.proto.response import SandboxStartResponse from rock.common.constants import ( - CPU_OVERCOMMIT_ALLOWED_KEYS_KEY, - CPU_OVERCOMMIT_HEADROOM_KEY, EXTRA_ACCELERATOR_TYPES_KEY, GET_STATUS_SWITCH, KATA_DIND_DISK_SIZE_KEY, @@ -39,6 +36,7 @@ SANDBOX_DISK_LIMIT_ROOTFS_KEY, SUPPORT_KATA_SWITCH, ) +from rock.common.cpu_overcommit import apply_cpu_overcommit from rock.common.exception import handle_exceptions from rock.common.validation import NonBlankStr from rock.config import ImageRegistryMirror @@ -275,43 +273,6 @@ async def _apply_accelerator_type_validation(config: DockerDeploymentConfig) -> ) -async def _apply_cpu_overcommit_default(config: DockerDeploymentConfig, rock_authorization: str | None) -> None: - """Derive limit_cpus from cpus + Nacos headroom when SDK did not set it. - - Formula: limit_cpus = min(2 * cpus, cpus + headroom) - - SDK-supplied limit_cpus always wins (function is a no-op in that case). - - Grayscale gate driven by Nacos list `cpu_overcommit_allowed_keys`: - * key absent from Nacos -> gate is open for every caller (full rollout). - * key present as a list -> only `rock_authorization` values in the list pass. - * key present but not a list (misconfigured) -> gate closed. - - headroom is read from Nacos key `cpu_overcommit_headroom` (default 0). - - headroom <= 0 keeps limit_cpus = None (docker run gets no --cpus flag). - """ - if config.limit_cpus is not None: - return - - nacos = sandbox_manager.rock_config.nacos_provider - if nacos is None: - return - - nacos_config = await nacos.get_config() or {} - allowed_keys = nacos_config.get(CPU_OVERCOMMIT_ALLOWED_KEYS_KEY) - if allowed_keys is not None and (not isinstance(allowed_keys, list) or rock_authorization not in allowed_keys): - return - - raw = nacos_config.get(CPU_OVERCOMMIT_HEADROOM_KEY) - try: - headroom = float(raw) if raw is not None else 0.0 - except (TypeError, ValueError): - headroom = 0.0 - - # Reject NaN / inf so a fat-fingered Nacos edit can't break sandbox startup - if not math.isfinite(headroom) or headroom <= 0: - return - - config.limit_cpus = min(2 * config.cpus, config.cpus + headroom) - - @sandbox_router.post("/start") @handle_exceptions(error_message="start sandbox failed") async def start(request: SandboxStartRequest) -> RockResponse[SandboxStartResponse]: @@ -335,7 +296,9 @@ async def start_async( await _apply_accelerator_type_validation(config) await _apply_kata_runtime_switch(config) await _apply_kata_disk_size(config) - await _apply_cpu_overcommit_default(config, headers.user_info.get("rock_authorization")) + await apply_cpu_overcommit( + config, sandbox_manager.rock_config.nacos_provider, headers.user_info.get("rock_authorization") + ) await _apply_disk_limits(config) await _apply_image_registry_mirror(config) sandbox_start_response = await sandbox_manager.start_async( diff --git a/rock/common/cpu_overcommit.py b/rock/common/cpu_overcommit.py new file mode 100644 index 0000000000..3b4e134a91 --- /dev/null +++ b/rock/common/cpu_overcommit.py @@ -0,0 +1,42 @@ +import math + +from rock.common.constants import CPU_OVERCOMMIT_ALLOWED_KEYS_KEY, CPU_OVERCOMMIT_HEADROOM_KEY +from rock.deployments.config import DockerDeploymentConfig + + +async def apply_cpu_overcommit( + config: DockerDeploymentConfig, nacos_provider, rock_authorization: str | None = None +) -> None: + """Derive limit_cpus from cpus + Nacos headroom when not explicitly set. + + Formula: limit_cpus = min(2 * cpus, cpus + headroom) + - SDK-supplied limit_cpus always wins (function is a no-op in that case). + - Grayscale gate driven by Nacos list `cpu_overcommit_allowed_keys`: + * key absent from Nacos -> gate is open for every caller (full rollout). + * key present as a list -> only `rock_authorization` values in the list pass. + * key present but not a list (misconfigured) -> gate closed. + - headroom is read from Nacos key `cpu_overcommit_headroom` (default 0). + - headroom <= 0 keeps limit_cpus = None (docker run gets no --cpus flag). + """ + if config.limit_cpus is not None: + return + + if nacos_provider is None: + return + + nacos_config = await nacos_provider.get_config() or {} + allowed_keys = nacos_config.get(CPU_OVERCOMMIT_ALLOWED_KEYS_KEY) + if allowed_keys is not None and (not isinstance(allowed_keys, list) or rock_authorization not in allowed_keys): + return + + raw = nacos_config.get(CPU_OVERCOMMIT_HEADROOM_KEY) + try: + headroom = float(raw) if raw is not None else 0.0 + except (TypeError, ValueError): + headroom = 0.0 + + # Reject NaN / inf so a fat-fingered Nacos edit can't break sandbox startup + if not math.isfinite(headroom) or headroom <= 0: + return + + config.limit_cpus = min(2 * config.cpus, config.cpus + headroom) From addb51f5962ef7b9fb7e76b0b0948eb22ccc35a1 Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Tue, 16 Jun 2026 09:59:00 +0000 Subject: [PATCH 2/2] feat(restart): support cpu/memory/limit_cpus resource updates on restart - Add limit_cpus to SandboxRestartRequest and restart_async - Build DockerDeploymentConfig from persisted spec in restart_async, apply resource overrides, validate via validate_sandbox_spec - Reuse apply_cpu_overcommit when cpus changes without explicit limit_cpus - Simplify on_restart to receive a fully-built config instead of raw resource_overrides dict - Add _docker_update_resources (reuses _cpus/_memory) to apply resource changes to stopped containers before docker start - Add unit tests for _docker_update_resources and restart resource flows --- rock/admin/entrypoints/sandbox_api.py | 7 +- rock/admin/proto/request.py | 7 + rock/deployments/docker.py | 11 ++ rock/sandbox/sandbox_manager.py | 41 +++++- rock/sandbox/sandbox_statemachine.py | 26 ++-- .../entrypoints/test_param_validation.py | 52 +++++++ .../test_docker_update_resources.py | 85 +++++++++++ .../unit/sandbox/test_sandbox_statemachine.py | 135 +++++++++++++++++- 8 files changed, 341 insertions(+), 23 deletions(-) create mode 100644 tests/unit/deployments/test_docker_update_resources.py diff --git a/rock/admin/entrypoints/sandbox_api.py b/rock/admin/entrypoints/sandbox_api.py index 4acb43d5f3..dc95eef522 100644 --- a/rock/admin/entrypoints/sandbox_api.py +++ b/rock/admin/entrypoints/sandbox_api.py @@ -23,6 +23,7 @@ SandboxCommand, SandboxCreateBashSessionRequest, SandboxReadFileRequest, + SandboxRestartRequest, SandboxStartRequest, SandboxWriteFileRequest, StartHeaders, @@ -417,8 +418,10 @@ async def delete(sandbox_id: str = Body(..., embed=True)) -> RockResponse: @sandbox_router.post("/restart") @handle_exceptions(error_message="restart sandbox failed") -async def restart(sandbox_id: str = Body(..., embed=True)) -> RockResponse[SandboxStartResponse]: - result = await sandbox_manager.restart_async(sandbox_id) +async def restart(request: SandboxRestartRequest) -> RockResponse[SandboxStartResponse]: + result = await sandbox_manager.restart_async( + request.sandbox_id, cpus=request.cpus, memory=request.memory, limit_cpus=request.limit_cpus + ) return RockResponse(result=result) diff --git a/rock/admin/proto/request.py b/rock/admin/proto/request.py index 99398a27a2..85af50c22e 100644 --- a/rock/admin/proto/request.py +++ b/rock/admin/proto/request.py @@ -46,6 +46,13 @@ class SandboxStartRequest(BaseModel): """GPU accelerator type (e.g. 'A100', 'V100'). If not specified, any available GPU will be used.""" +class SandboxRestartRequest(BaseModel): + sandbox_id: NonBlankStr + cpus: float | None = None + memory: str | None = None + limit_cpus: float | None = None + + class SandboxCommand(Command): timeout: float | None = 1200 """The timeout for the command. None means no timeout.""" diff --git a/rock/deployments/docker.py b/rock/deployments/docker.py index 5b6fe6574f..5844f999a5 100644 --- a/rock/deployments/docker.py +++ b/rock/deployments/docker.py @@ -662,6 +662,15 @@ def _docker_create(self, cmd: list[str]) -> None: ) raise + def _docker_update_resources(self) -> None: + """Apply cpu/memory resource updates to a stopped container via docker update.""" + cmd = ["docker", "update"] + self._cpus() + self._memory() + [self._container_name] + logger.info(f"Updating container resources: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + logger.error(f"docker update failed: {result.stderr}") + raise RuntimeError(f"docker update failed for {self._container_name}: {result.stderr}") + def _docker_start(self) -> subprocess.Popen: """Start a previously-created container with stdout/stderr attached.""" try: @@ -706,6 +715,8 @@ async def restart(self): logger.info(f"Restarting container {self._container_name} with docker start") + await loop.run_in_executor(executor, self._docker_update_resources) + # Reuse the same Popen-based attached start used by start(), so the # restart path also produces a valid self._container_process. Without # this, _stop() would skip its `if self._container_process is not None` diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 1c1ff3070a..454fc21aa4 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -26,6 +26,7 @@ from rock.admin.proto.request import SandboxWriteFileRequest as WriteFileRequest from rock.admin.proto.response import SandboxStartResponse, SandboxStatusResponse from rock.common.constants import DeleteReason, StopReason +from rock.common.cpu_overcommit import apply_cpu_overcommit from rock.config import RockConfig, RuntimeConfig from rock.deployments.config import DeploymentConfig, DockerDeploymentConfig from rock.logger import init_logger @@ -152,7 +153,13 @@ async def start_async( ) @monitor_sandbox_operation() - async def restart_async(self, sandbox_id: str) -> SandboxStartResponse: + async def restart_async( + self, + sandbox_id: str, + cpus: float | None = None, + memory: str | None = None, + limit_cpus: float | None = None, + ) -> SandboxStartResponse: sm = await self._get_current_statemachine(sandbox_id) if sm is None: raise BadRequestRockError(f"Sandbox {sandbox_id} not found") @@ -161,11 +168,43 @@ async def restart_async(self, sandbox_id: str) -> SandboxStartResponse: if state != State.STOPPED: raise BadRequestRockError(f"Sandbox {sandbox_id} cannot be restarted: current state is '{state.value}'") + info = sm.sandbox_info or {} + spec = info.get("spec") or {} + if spec: + restart_config = DockerDeploymentConfig(**spec) + else: + logger.warning( + f"sandbox {sandbox_id} has no spec snapshot; rebuilding config from flat fields with model defaults" + ) + restart_config = DockerDeploymentConfig( + container_name=sandbox_id, + image=info.get("image") or DockerDeploymentConfig.model_fields["image"].default, + memory=info.get("memory") or DockerDeploymentConfig.model_fields["memory"].default, + cpus=float(info.get("cpus") or DockerDeploymentConfig.model_fields["cpus"].default), + ) + + if cpus is not None: + restart_config.cpus = cpus + if limit_cpus is not None: + restart_config.limit_cpus = limit_cpus + if memory is not None: + restart_config.memory = memory + if cpus is not None and limit_cpus is None: + if restart_config.limit_cpus is not None: + logger.warning( + f"restart {sandbox_id}: cpus changed to {cpus} but limit_cpus not provided, " + f"clearing previous limit_cpus={restart_config.limit_cpus}" + ) + restart_config.limit_cpus = None + await apply_cpu_overcommit(restart_config, self.rock_config.nacos_provider) + self.validate_sandbox_spec(self.rock_config.runtime, restart_config) + await sm.send( "restart", sandbox_id=sandbox_id, operator=self._operator, meta_store=self._meta_store, + restart_config=restart_config, ) info: SandboxInfo = sm.sandbox_info or {} diff --git a/rock/sandbox/sandbox_statemachine.py b/rock/sandbox/sandbox_statemachine.py index 105b2423d9..ba4a301912 100644 --- a/rock/sandbox/sandbox_statemachine.py +++ b/rock/sandbox/sandbox_statemachine.py @@ -100,29 +100,19 @@ async def on_alive(self, sandbox_id: str, meta_store, sandbox_info: SandboxInfo) sandbox_info["start_time"] = get_iso8601_timestamp() await meta_store.update(sandbox_id, sandbox_info) - async def on_restart(self, sandbox_id: str, operator, meta_store) -> None: + async def on_restart( + self, + sandbox_id: str, + operator, + meta_store, + restart_config: DockerDeploymentConfig, + ) -> None: info = self.sandbox_info or {} host_ip = info.get("host_ip") if not host_ip: raise BadRequestRockError(f"Sandbox {sandbox_id} has no host_ip; cannot pin restart to original node") - # Prefer the spec snapshot (DockerDeploymentConfig.model_dump persisted to - # the DB at start time) so the new actor wraps the existing container with - # the exact same config. - spec = info.get("spec") or {} - if spec: - restart_config = DockerDeploymentConfig(**spec) - else: - logger.warning( - f"sandbox {sandbox_id} has no spec snapshot; rebuilding config from flat fields with model defaults" - ) - restart_config = DockerDeploymentConfig( - container_name=sandbox_id, - image=info.get("image") or DockerDeploymentConfig.model_fields["image"].default, - memory=info.get("memory") or DockerDeploymentConfig.model_fields["memory"].default, - cpus=float(info.get("cpus") or DockerDeploymentConfig.model_fields["cpus"].default), - ) timeout_info = SandboxTimeoutHelper.make_timeout_info(restart_config.auto_clear_time) logger.info(f"restart sandbox {sandbox_id} (pin host_ip={host_ip})") @@ -135,6 +125,8 @@ async def on_restart(self, sandbox_id: str, operator, meta_store) -> None: # fields (spec/status) won't pollute the alive key. new_info = dict(info) new_info["state"] = RockState.PENDING + new_info["cpus"] = restart_config.cpus + new_info["memory"] = restart_config.memory new_info.pop("stop_time", None) await meta_store.update(sandbox_id, new_info) await meta_store.update_timeout(sandbox_id, timeout_info) diff --git a/tests/unit/admin/entrypoints/test_param_validation.py b/tests/unit/admin/entrypoints/test_param_validation.py index 5f80600278..f14778ea03 100644 --- a/tests/unit/admin/entrypoints/test_param_validation.py +++ b/tests/unit/admin/entrypoints/test_param_validation.py @@ -255,3 +255,55 @@ async def test_gem_close_empty_sandbox_id(gem_app): async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: _assert_failed(await client.post("/close", json={"sandbox_id": ""})) _assert_failed(await client.post("/close", json={"sandbox_id": " "})) + + +# --- restart backward compatibility --- + + +@pytest.mark.asyncio +async def test_restart_old_sdk_only_sandbox_id(sandbox_app): + """Old SDK sends only {"sandbox_id": "xxx"} — must still work.""" + app, mock_manager = sandbox_app + mock_manager.restart_async = AsyncMock( + return_value=SandboxStartResponse(sandbox_id="sb-1", host_name="h", host_ip="1.2.3.4") + ) + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + resp = await client.post("/restart", json={"sandbox_id": "sb-1"}) + assert resp.status_code == 200 + assert resp.json()["status"] == "Success" + mock_manager.restart_async.assert_called_once_with("sb-1", cpus=None, memory=None, limit_cpus=None) + + +@pytest.mark.asyncio +async def test_restart_new_sdk_with_resources(sandbox_app): + """New SDK sends {"sandbox_id": "xxx", "cpus": 4, "memory": "8g"}.""" + app, mock_manager = sandbox_app + mock_manager.restart_async = AsyncMock( + return_value=SandboxStartResponse(sandbox_id="sb-1", host_name="h", host_ip="1.2.3.4") + ) + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + resp = await client.post("/restart", json={"sandbox_id": "sb-1", "cpus": 4, "memory": "8g"}) + assert resp.status_code == 200 + assert resp.json()["status"] == "Success" + mock_manager.restart_async.assert_called_once_with("sb-1", cpus=4.0, memory="8g", limit_cpus=None) + + +@pytest.mark.asyncio +async def test_restart_with_limit_cpus(sandbox_app): + """New SDK sends {"sandbox_id": "xxx", "cpus": 2, "limit_cpus": 4}.""" + app, mock_manager = sandbox_app + mock_manager.restart_async = AsyncMock( + return_value=SandboxStartResponse(sandbox_id="sb-1", host_name="h", host_ip="1.2.3.4") + ) + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + resp = await client.post("/restart", json={"sandbox_id": "sb-1", "cpus": 2, "limit_cpus": 4}) + assert resp.status_code == 200 + assert resp.json()["status"] == "Success" + mock_manager.restart_async.assert_called_once_with("sb-1", cpus=2.0, memory=None, limit_cpus=4.0) + + +@pytest.mark.asyncio +async def test_restart_empty_sandbox_id(sandbox_app): + app, _ = sandbox_app + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + _assert_failed(await client.post("/restart", json={"sandbox_id": ""})) diff --git a/tests/unit/deployments/test_docker_update_resources.py b/tests/unit/deployments/test_docker_update_resources.py new file mode 100644 index 0000000000..b3eae1c779 --- /dev/null +++ b/tests/unit/deployments/test_docker_update_resources.py @@ -0,0 +1,85 @@ +"""Unit tests for DockerDeployment._docker_update_resources().""" + +from unittest.mock import patch + +import pytest + +from rock.deployments.config import DockerDeploymentConfig +from rock.deployments.docker import DockerDeployment + + +@pytest.fixture +def _patch_validator(): + with patch("rock.deployments.docker.DockerSandboxValidator"): + yield + + +@pytest.mark.usefixtures("_patch_validator") +class TestDockerUpdateResources: + """Tests for _docker_update_resources command construction and error handling.""" + + def _make_deployment(self, **kwargs) -> DockerDeployment: + config = DockerDeploymentConfig(container_name="test-sandbox", **kwargs) + return DockerDeployment.from_config(config) + + def test_without_limit_cpus(self): + deployment = self._make_deployment(cpus=2, memory="4g") + with patch("rock.deployments.docker.subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + deployment._docker_update_resources() + + cmd = mock_run.call_args[0][0] + assert cmd == [ + "docker", "update", + "--cpus=2.0", + "--memory=4g", + "--memory-swap=4g", + "test-sandbox", + ] + + def test_with_limit_cpus(self): + deployment = self._make_deployment(cpus=2, limit_cpus=4, memory="4g") + with patch("rock.deployments.docker.subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + deployment._docker_update_resources() + + cmd = mock_run.call_args[0][0] + assert cmd == [ + "docker", "update", + "--cpu-shares=2048", + "--cpus=4.0", + "--memory=4g", + "--memory-swap=4g", + "test-sandbox", + ] + + def test_fractional_cpus_shares(self): + deployment = self._make_deployment(cpus=0.5, limit_cpus=1.5, memory="2g") + with patch("rock.deployments.docker.subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + deployment._docker_update_resources() + + cmd = mock_run.call_args[0][0] + assert "--cpu-shares=512" in cmd + assert "--cpus=1.5" in cmd + + def test_failure_raises_runtime_error(self): + deployment = self._make_deployment(cpus=2, memory="4g") + with patch("rock.deployments.docker.subprocess.run") as mock_run: + mock_run.return_value.returncode = 1 + mock_run.return_value.stderr = "no such container" + with pytest.raises(RuntimeError, match="docker update failed"): + deployment._docker_update_resources() + + def test_consistent_with_cpus_method(self): + """_docker_update_resources cpu flags should match _cpus() output.""" + for cpus, limit_cpus in [(2, None), (2, 4), (0.5, 1.5)]: + deployment = self._make_deployment(cpus=cpus, limit_cpus=limit_cpus, memory="4g") + with patch("rock.deployments.docker.subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + deployment._docker_update_resources() + + update_cmd = mock_run.call_args[0][0] + cpu_flags_from_update = [f for f in update_cmd if "cpu" in f.lower()] + cpu_flags_from_cpus = deployment._cpus() + assert cpu_flags_from_update == cpu_flags_from_cpus diff --git a/tests/unit/sandbox/test_sandbox_statemachine.py b/tests/unit/sandbox/test_sandbox_statemachine.py index 54f93b4952..461fe583b0 100644 --- a/tests/unit/sandbox/test_sandbox_statemachine.py +++ b/tests/unit/sandbox/test_sandbox_statemachine.py @@ -239,12 +239,21 @@ async def test_stop_time_always_written_even_when_start_failed(self, mock_operat } +def _default_restart_config(**overrides): + from rock.deployments.config import DockerDeploymentConfig + + kwargs = dict(_VALID_RESTART_INFO["spec"]) + kwargs.update(overrides) + return DockerDeploymentConfig(**kwargs) + + class TestRestartTransitions: def _restart_kwargs(self, meta_store=None): return dict( sandbox_id="sb", operator=AsyncMock(), meta_store=meta_store or AsyncMock(), + restart_config=_default_restart_config(), ) @pytest.mark.asyncio @@ -276,7 +285,7 @@ class TestOnRestart: def mock_meta_store(self): return AsyncMock() - async def _send_restart(self, mock_meta_store, sandbox_info=None): + async def _send_restart(self, mock_meta_store, sandbox_info=None, config=None): info = sandbox_info if sandbox_info is not None else dict(_VALID_RESTART_INFO) sm = await SandboxStateMachine.from_state_value(State.STOPPED, sandbox_info=info) await sm.send( @@ -284,6 +293,7 @@ async def _send_restart(self, mock_meta_store, sandbox_info=None): sandbox_id="sb-1", operator=AsyncMock(), meta_store=mock_meta_store, + restart_config=config or _default_restart_config(), ) @pytest.mark.asyncio @@ -300,15 +310,134 @@ async def test_updates_state_to_pending(self, mock_meta_store): @pytest.mark.asyncio async def test_writes_timeout_built_from_spec(self, mock_meta_store): - # auto_clear_time_minutes=30 in spec → make_timeout_info uses 30 await self._send_restart(mock_meta_store) mock_meta_store.update_timeout.assert_awaited_once() sandbox_id, timeout_info = mock_meta_store.update_timeout.call_args[0] assert sandbox_id == "sb-1" - # SandboxTimeoutHelper.make_timeout_info stores auto_clear_time as the env-var key assert any("30" == str(v) for v in timeout_info.values()) +# --------------------------------------------------------------------------- +# on_restart resource overrides +# --------------------------------------------------------------------------- + + +class TestOnRestartResourceOverrides: + @pytest.fixture + def mock_meta_store(self): + return AsyncMock() + + async def _send_restart(self, mock_meta_store, config=None, sandbox_info=None): + info = sandbox_info if sandbox_info is not None else dict(_VALID_RESTART_INFO) + if "spec" in info: + info = {**info, "spec": dict(info["spec"])} + sm = await SandboxStateMachine.from_state_value(State.STOPPED, sandbox_info=info) + mock_operator = AsyncMock() + await sm.send( + "restart", + sandbox_id="sb-1", + operator=mock_operator, + meta_store=mock_meta_store, + restart_config=config or _default_restart_config(), + ) + return mock_operator + + @pytest.mark.asyncio + async def test_config_passed_to_operator(self, mock_meta_store): + config = _default_restart_config(cpus=4) + mock_operator = await self._send_restart(mock_meta_store, config=config) + restart_config = mock_operator.restart.call_args[0][0] + assert restart_config.cpus == 4 + + @pytest.mark.asyncio + async def test_no_change_uses_original_config(self, mock_meta_store): + mock_operator = await self._send_restart(mock_meta_store) + restart_config = mock_operator.restart.call_args[0][0] + assert restart_config.cpus == 1 + assert restart_config.memory == "2g" + + @pytest.mark.asyncio + async def test_limit_cpus_passed_to_operator(self, mock_meta_store): + config = _default_restart_config(cpus=4, limit_cpus=6) + mock_operator = await self._send_restart(mock_meta_store, config=config) + restart_config = mock_operator.restart.call_args[0][0] + assert restart_config.cpus == 4 + assert restart_config.limit_cpus == 6 + + @pytest.mark.asyncio + async def test_update_persists_cpus_and_memory(self, mock_meta_store): + config = _default_restart_config(cpus=8, memory="32g") + await self._send_restart(mock_meta_store, config=config) + mock_meta_store.update.assert_awaited_once() + updated_info = mock_meta_store.update.call_args[0][1] + assert updated_info["cpus"] == 8 + assert updated_info["memory"] == "32g" + assert updated_info["state"] == State.PENDING + + +# --------------------------------------------------------------------------- +# restart resource validation (reuses SandboxManager.validate_sandbox_spec) +# --------------------------------------------------------------------------- + + +class TestRestartResourceValidation: + @pytest.fixture + def sandbox_manager(self): + from unittest.mock import MagicMock + + from rock.config import RockConfig, RuntimeConfig, StandardSpec + from rock.sandbox.sandbox_manager import SandboxManager + + mgr = MagicMock() + mgr.rock_config = MagicMock(spec=RockConfig) + mgr.rock_config.runtime = MagicMock(spec=RuntimeConfig) + mgr.rock_config.runtime.max_allowed_spec = StandardSpec(cpus=16, memory="64g") + mgr.validate_sandbox_spec = SandboxManager.validate_sandbox_spec.__get__(mgr) + return mgr + + def _make_config(self, cpus=2, memory="8g"): + from rock.deployments.config import DockerDeploymentConfig + + return DockerDeploymentConfig(container_name="sb-1", image="python:3.11", cpus=cpus, memory=memory) + + def test_cpus_within_limit_passes(self, sandbox_manager): + sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(cpus=8)) + + def test_cpus_exceeds_limit_raises(self, sandbox_manager): + from rock.sdk.common.exceptions import BadRequestRockError + + with pytest.raises(BadRequestRockError, match="exceed the maximum allowed"): + sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(cpus=32)) + + def test_memory_within_limit_passes(self, sandbox_manager): + sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(memory="32g")) + + def test_memory_exceeds_limit_raises(self, sandbox_manager): + from rock.sdk.common.exceptions import BadRequestRockError + + with pytest.raises(BadRequestRockError, match="exceed the maximum allowed"): + sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(memory="128g")) + + def test_invalid_memory_format_raises(self, sandbox_manager): + from rock.sdk.common.exceptions import BadRequestRockError + + with pytest.raises(BadRequestRockError, match="Invalid memory size"): + sandbox_manager.validate_sandbox_spec( + sandbox_manager.rock_config.runtime, self._make_config(memory="not_a_size") + ) + + def test_both_within_limit_passes(self, sandbox_manager): + sandbox_manager.validate_sandbox_spec( + sandbox_manager.rock_config.runtime, self._make_config(cpus=16, memory="64g") + ) + + def test_cpus_at_boundary_passes(self, sandbox_manager): + sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(cpus=16)) + + def test_memory_at_boundary_passes(self, sandbox_manager): + sandbox_manager.validate_sandbox_spec(sandbox_manager.rock_config.runtime, self._make_config(memory="64g")) + + # --------------------------------------------------------------------------- # delete transitions # ---------------------------------------------------------------------------