Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
65a11fd
add release note 120
zhongwen666 Feb 3, 2026
9157582
Revert "add release note 120"
zhongwen666 Feb 3, 2026
dba7590
Merge branch 'alibaba:master' into master
zhongwen666 Feb 3, 2026
b14eac6
Merge branch 'alibaba:master' into master
zhongwen666 Feb 3, 2026
d5e799c
Merge branch 'alibaba:master' into master
zhongwen666 Feb 5, 2026
b84e5a2
Merge branch 'alibaba:master' into master
zhongwen666 Feb 10, 2026
3085eed
Merge branch 'alibaba:master' into master
zhongwen666 Feb 11, 2026
6d9a4ca
Merge branch 'alibaba:master' into master
zhongwen666 Feb 12, 2026
8d6e6ca
Merge branch 'alibaba:master' into master
zhongwen666 Feb 24, 2026
acff0c0
Merge branch 'alibaba:master' into master
zhongwen666 Feb 26, 2026
f24c268
Merge branch 'alibaba:master' into master
zhongwen666 Feb 27, 2026
0a11dbf
Merge branch 'alibaba:master' into master
zhongwen666 Feb 28, 2026
5dc5453
Merge branch 'alibaba:master' into master
zhongwen666 Mar 3, 2026
cdef141
Merge branch 'alibaba:master' into master
zhongwen666 Mar 3, 2026
d6dc02e
Merge branch 'alibaba:master' into master
zhongwen666 Mar 3, 2026
6382e4e
Merge branch 'alibaba:master' into master
zhongwen666 Mar 4, 2026
a4bd676
Merge branch 'alibaba:master' into master
zhongwen666 Mar 4, 2026
f020bd7
Merge branch 'alibaba:master' into master
zhongwen666 Mar 10, 2026
9d4ceeb
Merge branch 'alibaba:master' into master
zhongwen666 Mar 12, 2026
af5f29d
Merge branch 'alibaba:master' into master
zhongwen666 Mar 12, 2026
62ccd5d
Merge branch 'alibaba:master' into master
zhongwen666 Mar 12, 2026
1d2d62c
Merge branch 'alibaba:master' into master
zhongwen666 Mar 16, 2026
c10dce0
Merge branch 'alibaba:master' into master
zhongwen666 Mar 17, 2026
8fb0f3b
Merge branch 'alibaba:master' into master
zhongwen666 Mar 23, 2026
4d70da7
Merge branch 'alibaba:master' into master
zhongwen666 Mar 24, 2026
e27cc81
Merge branch 'alibaba:master' into master
zhongwen666 Mar 27, 2026
56f6000
Merge branch 'alibaba:master' into master
zhongwen666 Mar 27, 2026
c654a33
Merge branch 'alibaba:master' into master
zhongwen666 Mar 27, 2026
c2594ea
Merge branch 'alibaba:master' into master
zhongwen666 Mar 27, 2026
46172a3
Merge branch 'alibaba:master' into master
zhongwen666 Apr 1, 2026
f345d31
Merge branch 'alibaba:master' into master
zhongwen666 Apr 17, 2026
31bd9f3
Merge branch 'alibaba:master' into master
zhongwen666 Apr 23, 2026
c3ef4ce
Merge branch 'alibaba:master' into master
zhongwen666 May 8, 2026
40f2de2
Merge branch 'alibaba:master' into master
zhongwen666 May 9, 2026
171d150
Merge branch 'alibaba:master' into master
zhongwen666 May 15, 2026
43d0aa7
Merge branch 'alibaba:master' into master
zhongwen666 May 15, 2026
ed7abca
Merge branch 'alibaba:master' into master
zhongwen666 May 18, 2026
f7987f2
Merge branch 'alibaba:master' into master
zhongwen666 May 18, 2026
023b1e4
Merge branch 'alibaba:master' into master
zhongwen666 May 21, 2026
b34a2ee
Merge branch 'alibaba:master' into master
zhongwen666 May 27, 2026
370457e
Merge branch 'alibaba:master' into master
zhongwen666 Jun 1, 2026
53dc4ad
Merge branch 'alibaba:master' into master
zhongwen666 Jun 1, 2026
27913c7
Merge branch 'alibaba:master' into master
zhongwen666 Jun 2, 2026
45317cd
Merge branch 'alibaba:master' into master
zhongwen666 Jun 9, 2026
8fceba9
Merge branch 'alibaba:master' into master
zhongwen666 Jun 11, 2026
68c779b
Merge branch 'alibaba:master' into master
zhongwen666 Jun 16, 2026
5ecae9f
Merge branch 'alibaba:master' into master
zhongwen666 Jun 22, 2026
a5a6fff
Merge branch 'alibaba:master' into master
zhongwen666 Jun 25, 2026
c81ca1c
add time
zhongwen666 Jun 25, 2026
19322dc
test: add log
zhongwen666 Jun 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 51 additions & 6 deletions rock/admin/entrypoints/sandbox_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@

logger = init_logger(__name__)


def _log_duration(phase: str, stage: str, start: float):
duration = time.perf_counter() - start
logger.info(f"[{phase}] {stage} took {duration:.3f} s")

_MIRROR_PROBE_CACHE: dict[str, tuple[bool, float]] = {}
_MIRROR_PROBE_TTL_SECONDS = 60.0

Expand Down Expand Up @@ -389,20 +394,48 @@ async def start_async(
request: SandboxStartRequest,
headers: Annotated[StartHeaders, Depends()],
) -> RockResponse[SandboxStartResponse]:
total_start = time.perf_counter()
config = DockerDeploymentConfig.from_request(request)

t0 = time.perf_counter()
await _apply_accelerator_type_validation(config)
_log_duration("start_async_timing", "Apply accelerator type validation", t0)

t0 = time.perf_counter()
await _apply_kata_runtime_switch(config)
await _apply_kata_disk_size(config)
_log_duration("start_async_timing", "Apply kata runtime switch + disk size", t0)

t0 = time.perf_counter()
await _apply_image_os_profile(config)
_log_duration("start_async_timing", "Apply image OS profile", t0)

t0 = time.perf_counter()
await _apply_timeout_defaults(config)
await _apply_cpu_overcommit_default(config, headers.user_info.get("rock_authorization"))
_log_duration("start_async_timing", "Apply timeout defaults + CPU overcommit", t0)

t0 = time.perf_counter()
await _apply_disk_limits(config)
_log_duration("start_async_timing", "Apply disk limits", t0)

t0 = time.perf_counter()
await _apply_image_registry_mirror(config)
_log_duration("start_async_timing", "Apply image registry mirror", t0)

t0 = time.perf_counter()
sandbox_start_response = await sandbox_manager.start_async(
config,
user_info=headers.user_info,
cluster_info=headers.cluster_info,
)
_log_duration("start_async_timing", "sandbox_manager.start_async", t0)

total_duration = time.perf_counter() - total_start
logger.info(
f"[start_async_timing] [{sandbox_start_response.sandbox_id}] "
f"API start_async total took {total_duration:.3f} s"
)
return RockResponse(result=sandbox_start_response)


Expand All @@ -427,15 +460,27 @@ async def get_sandbox_statistics(sandbox_id: NonBlankStr):
@sandbox_router.get("/get_status")
@handle_exceptions(error_message="get sandbox status failed")
async def get_status(sandbox_id: NonBlankStr, include_all_states: bool = False):
total_start = time.perf_counter()

# TODO: do judgement inside operator
if (
t0 = time.perf_counter()
use_v2 = (
sandbox_manager.rock_config.nacos_provider is not None
and await sandbox_manager.rock_config.nacos_provider.get_switch_status(GET_STATUS_SWITCH)
):
return RockResponse(
result=await sandbox_manager.get_status_v2(sandbox_id, include_all_states=include_all_states)
)
return RockResponse(result=await sandbox_manager.get_status(sandbox_id, include_all_states=include_all_states))
)
_log_duration("get_status_timing", f"[{sandbox_id}] Nacos switch check", t0)

t0 = time.perf_counter()
if use_v2:
result = await sandbox_manager.get_status_v2(sandbox_id, include_all_states=include_all_states)
else:
result = await sandbox_manager.get_status(sandbox_id, include_all_states=include_all_states)
_log_duration("get_status_timing", f"[{sandbox_id}] sandbox_manager.get_status", t0)

total_duration = time.perf_counter() - total_start
logger.info(f"[get_status_timing] [{sandbox_id}] API get_status total took {total_duration:.3f} s")

return RockResponse(result=result)


@sandbox_router.post("/execute")
Expand Down
19 changes: 16 additions & 3 deletions rock/admin/metrics/decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,18 @@ def monitor_sandbox_operation(
sandbox_id_position: int = None,
sandbox_id_param: str = None,
metric_prefix: str = "request",
skip_user_info: bool = False,
):
"""Method decorator: Monitor specific methods"""
"""Method decorator: Monitor specific methods

Parameters
----------
skip_user_info:
If True, skip the Redis lookup for user_id/experiment_id/namespace.
Use this for operations like ``start_async`` where the sandbox does
not yet exist in Redis — the lookup would always return None and
only add latency under load.
"""

def decorator(f):
if asyncio.iscoroutinefunction(f):
Expand All @@ -179,8 +189,11 @@ async def wrapper(self, *args, **kwargs):
args, kwargs, extract_sandbox_id, sandbox_id_position, sandbox_id_param
)

meta_store = getattr(self, "_meta_store", None)
user_id, experiment_id, namespace = await _get_user_info(meta_store, sandbox_id)
if skip_user_info:
user_id, experiment_id, namespace = "default", "default", "default"
else:
meta_store = getattr(self, "_meta_store", None)
user_id, experiment_id, namespace = await _get_user_info(meta_store, sandbox_id)

# Build attributes
attributes = _build_attributes(op_name, sandbox_id, f, user_id, experiment_id, namespace)
Expand Down
2 changes: 1 addition & 1 deletion rock/admin/metrics/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def export_with_logging(metrics_data, *args, **kwargs):
t0 = time.perf_counter()
result = original_export(metrics_data, *args, **kwargs)
elapsed_ms = (time.perf_counter() - t0) * 1000.0
logger.info(
logger.debug(
"OTLP export metric_prefix=%s endpoint=%s data_points=%d duration_ms=%.1f result=%s",
self.metric_prefix,
endpoint,
Expand Down
68 changes: 68 additions & 0 deletions rock/sandbox/operator/ray.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import time

import ray

Expand Down Expand Up @@ -60,7 +61,9 @@ async def submit(self, config: DockerDeploymentConfig, user_info: dict = {}) ->
async with self._ray_service.get_ray_rwlock().read_lock():
sandbox_id = config.container_name
logger.info(f"[{sandbox_id}] start_async params:{json.dumps(config.model_dump(), indent=2)}")
t0 = time.perf_counter()
sandbox_actor: SandboxActor = await self.create_actor(config)
logger.info(f"[startup_timing] [{sandbox_id}] Ray create_actor " f"took {time.perf_counter() - t0:.3f} s")
sandbox_actor.set_metrics_endpoint.remote(self._runtime_config.metrics_endpoint)
sandbox_actor.set_user_defined_tags.remote(self._runtime_config.user_defined_tags)
sandbox_actor.start.remote()
Expand All @@ -71,7 +74,11 @@ async def submit(self, config: DockerDeploymentConfig, user_info: dict = {}) ->
sandbox_actor.set_user_id.remote(user_id)
sandbox_actor.set_experiment_id.remote(experiment_id)
sandbox_actor.set_namespace.remote(namespace)
t0 = time.perf_counter()
sandbox_info: SandboxInfo = await self._ray_service.async_ray_get(sandbox_actor.sandbox_info.remote())
logger.info(
f"[startup_timing] [{sandbox_id}] Ray sandbox_info.remote() " f"took {time.perf_counter() - t0:.3f} s"
)
sandbox_info["user_id"] = user_id
sandbox_info["experiment_id"] = experiment_id
sandbox_info["namespace"] = namespace
Expand All @@ -82,34 +89,95 @@ async def submit(self, config: DockerDeploymentConfig, user_info: dict = {}) ->

async def get_status(self, sandbox_id: str) -> SandboxInfo | None:
if self.use_rocklet():
t0 = time.perf_counter()
sandbox_info: SandboxInfo = await build_sandbox_from_redis(self._redis_provider, sandbox_id)
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] Redis build_sandbox_from_redis "
f"took {time.perf_counter() - t0:.3f} s"
)
if sandbox_info is None:
return None

host_ip = sandbox_info.get("host_ip")

t0 = time.perf_counter()
remote_status = await self.get_remote_status(sandbox_id, host_ip)
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] HTTP get_remote_status "
f"took {time.perf_counter() - t0:.3f} s"
)

t0 = time.perf_counter()
is_alive = await self._check_alive_status(sandbox_id, host_ip, remote_status)
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] HTTP check_alive_status "
f"took {time.perf_counter() - t0:.3f} s"
)

# TODO: sink update state according to is_alive logic into SandboxInfo
if is_alive:
sandbox_info["state"] = State.RUNNING
sandbox_info.update(remote_status.to_dict())

return sandbox_info
async with self._ray_service.get_ray_rwlock().read_lock():
total_start = time.perf_counter()
try:
t0 = time.perf_counter()
actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id))
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] Ray get_actor "
f"took {time.perf_counter() - t0:.3f} s"
)
except (ValueError, Exception):
logger.debug(f"Actor for sandbox {sandbox_id} not found, returning None")
return None

t0 = time.perf_counter()
sandbox_info: SandboxInfo = await self._ray_service.async_ray_get(actor.sandbox_info.remote())
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] Ray sandbox_info.remote() "
f"took {time.perf_counter() - t0:.3f} s"
)

t0 = time.perf_counter()
remote_status: ServiceStatus = await self._ray_service.async_ray_get(actor.get_status.remote())
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] Ray get_status.remote() "
f"took {time.perf_counter() - t0:.3f} s"
)

sandbox_info["phases"] = {name: phase.to_dict() for name, phase in remote_status.phases.items()}
sandbox_info["port_mapping"] = remote_status.get_port_mapping()

t0 = time.perf_counter()
alive = await self._ray_service.async_ray_get(actor.is_alive.remote())
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] Ray is_alive.remote() "
f"took {time.perf_counter() - t0:.3f} s"
)

# TODO: sink update state according to is_alive logic into SandboxInfo
if alive.is_alive:
sandbox_info["state"] = State.RUNNING
if not self._redis_provider:
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] ray get_status total "
f"took {time.perf_counter() - total_start:.3f} s"
)
return sandbox_info

t0 = time.perf_counter()
redis_info = await self.get_sandbox_info_from_redis(sandbox_id)
logger.info(
f"[operator_get_status_timing] [{sandbox_id}] Redis get_sandbox_info "
f"took {time.perf_counter() - t0:.3f} s"
)

logger.info(
f"[operator_get_status_timing] [{sandbox_id}] ray get_status total "
f"took {time.perf_counter() - total_start:.3f} s"
)
if redis_info:
redis_info.update(sandbox_info)
return redis_info
Expand Down
46 changes: 30 additions & 16 deletions rock/sandbox/sandbox_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,11 @@ async def refresh_aes_key(self):
async def _check_sandbox_exists_in_redis(self, config: DeploymentConfig):
if isinstance(config, DockerDeploymentConfig) and config.container_name:
sandbox_id = config.container_name
if await self._meta_store.exists(sandbox_id):
self._meta_store._redis.log_pool_detailed_status()
t0 = time.perf_counter()
exists = await self._meta_store.exists(sandbox_id)
logger.info(f"[startup_timing] [{config.image}] Redis exists check took {time.perf_counter() - t0:.3f} s")
if exists:
raise BadRequestRockError(f"Sandbox {sandbox_id} already exists")

def _setup_sandbox_actor_metadata(self, sandbox_actor: SandboxActor, user_info: UserInfo) -> None:
Expand All @@ -116,10 +120,11 @@ async def _build_sandbox_info_metadata(
sandbox_info["state"] = State.PENDING
sandbox_info["create_time"] = get_iso8601_timestamp()

@monitor_sandbox_operation()
@monitor_sandbox_operation(skip_user_info=True)
async def start_async(
self, config: DeploymentConfig, user_info: UserInfo = {}, cluster_info: ClusterInfo = {}
) -> SandboxStartResponse:
total_start = time.perf_counter()
await self._check_sandbox_exists_in_redis(config)
self.validate_sandbox_spec(self.rock_config.runtime, config)
with StageTimer("startup_timing", f"[{config.image}] Init config", logger):
Expand All @@ -134,17 +139,20 @@ async def start_async(
)
docker_deployment_config.cpus = self.rock_config.runtime.standard_spec.cpus
docker_deployment_config.memory = self.rock_config.runtime.standard_spec.memory
with StageTimer("startup_timing", f"[{sandbox_id}] Operator submit", logger):
with StageTimer("startup_timing", f"[{sandbox_id}] Ray operator submit", logger):
sandbox_info: SandboxInfo = await self._operator.submit(docker_deployment_config, user_info)
await self._build_sandbox_info_metadata(sandbox_info, user_info, cluster_info)
timeout_info = SandboxTimeoutHelper.make_timeout_info(docker_deployment_config.auto_clear_time)
with StageTimer("startup_timing", f"[{sandbox_id}] Meta store create", logger):
with StageTimer("startup_timing", f"[{sandbox_id}] Build sandbox metadata", logger):
await self._build_sandbox_info_metadata(sandbox_info, user_info, cluster_info)
timeout_info = SandboxTimeoutHelper.make_timeout_info(docker_deployment_config.auto_clear_time)
with StageTimer("startup_timing", f"[{sandbox_id}] Meta store create (Redis+DB)", logger):
await self._meta_store.create(
sandbox_id,
sandbox_info,
timeout_info=timeout_info,
deployment_config=docker_deployment_config,
)
total_duration = time.perf_counter() - total_start
logger.info(f"[startup_timing] [{sandbox_id}] start_async total took {total_duration:.3f} s")
return SandboxStartResponse(
sandbox_id=sandbox_id,
host_name=sandbox_info.get("host_name"),
Expand Down Expand Up @@ -272,22 +280,27 @@ async def commit(self, sandbox_id, image_tag: str, username: str, password: str)

@monitor_sandbox_operation()
async def get_status(self, sandbox_id, include_all_states: bool = False) -> SandboxStatusResponse:
# get status from meta_store
sm = await self._get_current_statemachine(sandbox_id)
total_start = time.perf_counter()

# get status from meta_store (Redis + DB fallback)
with StageTimer("get_status_timing", f"[{sandbox_id}] Meta store get (Redis+DB)", logger):
sm = await self._get_current_statemachine(sandbox_id)
if sm is None:
raise BadRequestRockError(f"Sandbox {sandbox_id} not found")

# update status from operator
# update status from operator (Ray)
is_alive = False
operator_sandbox_info: SandboxInfo | None = await self._operator.get_status(sandbox_id=sandbox_id)
if operator_sandbox_info is not None:
is_alive = operator_sandbox_info.get("state") == State.RUNNING
if sm.current_state.value == State.PENDING and is_alive:
await sm.send(
"alive", sandbox_id=sandbox_id, meta_store=self._meta_store, sandbox_info=operator_sandbox_info
)
with StageTimer("get_status_timing", f"[{sandbox_id}] Meta store alive update (Redis+DB)", logger):
await sm.send(
"alive", sandbox_id=sandbox_id, meta_store=self._meta_store, sandbox_info=operator_sandbox_info
)
if operator_sandbox_info.get("state") in (State.PENDING, State.RUNNING):
await self._refresh_timeout(sandbox_id)
with StageTimer("get_status_timing", f"[{sandbox_id}] Redis refresh timeout", logger):
await self._refresh_timeout(sandbox_id)

# compat with legacy get_status behavior by default (include_all_states == False),
# raise 'not found' if not on pending or running status.
Expand All @@ -299,6 +312,9 @@ async def get_status(self, sandbox_id, include_all_states: bool = False) -> Sand
else:
sandbox_info = sm.sandbox_info

total_duration = time.perf_counter() - total_start
logger.info(f"[get_status_timing] [{sandbox_id}] get_status total took {total_duration:.3f} s")

return SandboxStatusResponse(
sandbox_id=sandbox_id,
status=sandbox_info.get("phases"),
Expand Down Expand Up @@ -436,6 +452,4 @@ def validate_sandbox_spec(self, runtime_config: RuntimeConfig, deployment_config
parse_size_to_bytes(deployment_config.disk_limit_rootfs)
except ValueError as e:
logger.warning(f"Invalid disk_limit_rootfs size: {deployment_config.disk_limit_rootfs}", exc_info=e)
raise BadRequestRockError(
f"Invalid disk_limit_rootfs size: {deployment_config.disk_limit_rootfs}"
)
raise BadRequestRockError(f"Invalid disk_limit_rootfs size: {deployment_config.disk_limit_rootfs}")
13 changes: 12 additions & 1 deletion rock/sandbox/sandbox_meta_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,21 @@ async def create(
DB-only fields the caller may carry (e.g. ``spec`` / ``status`` from a
prior DB-fallback read) cannot leak into the alive key.
"""
import time

redis_payload = pick_sandbox_info_fields(sandbox_info)
t0 = time.perf_counter()
await self._redis.json_set(alive_sandbox_key(sandbox_id), "$", redis_payload)
if timeout_info is not None:
await self._redis.json_set(timeout_sandbox_key(sandbox_id), "$", timeout_info)
logger.info(
f"[startup_timing] [{sandbox_id}] Meta store create Redis " f"took {time.perf_counter() - t0:.3f} s"
)
self._redis.log_pool_detailed_status()

t0 = time.perf_counter()
await self._db.create(sandbox_id, sandbox_info, deployment_config)
logger.info(f"[startup_timing] [{sandbox_id}] Meta store create DB " f"took {time.perf_counter() - t0:.3f} s")

@monitor_metastore_operation
async def update(self, sandbox_id: str, sandbox_info: SandboxInfo) -> None:
Expand Down Expand Up @@ -140,7 +149,9 @@ async def get(self, sandbox_id: str, check_db: bool = False) -> SandboxInfo | No

async def exists(self, sandbox_id: str) -> bool:
"""Return ``True`` when the Redis alive key exists for ``sandbox_id``."""
return await self.get(sandbox_id) is not None
# Use EXISTS command instead of JSON.GET for better performance
key = alive_sandbox_key(sandbox_id)
return await self._redis.exists(key)

@monitor_metastore_operation
async def get_timeout(self, sandbox_id: str) -> dict[str, str] | None:
Expand Down
Loading
Loading