From edd681bd7d98392d90ae7b76b4c64c7f470c91bc Mon Sep 17 00:00:00 2001 From: Issac-Newton <1556820213@qq.com> Date: Fri, 12 Jun 2026 20:12:23 +0800 Subject: [PATCH 1/2] feat(admin): add ACR config endpoint with temporary token support Add AcrConfig (registry + builder_image) dataclasses, GET /acr_config API returning temporary ACR credentials via GetAuthorizationToken, and aliyun-python-sdk-cr as an explicit admin dependency. Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 1 + rock-conf/rock-local.yml | 11 ++++ rock/admin/entrypoints/sandbox_proxy_api.py | 8 +++ rock/config.py | 25 ++++++++ rock/sandbox/service/sandbox_proxy_service.py | 45 +++++++++++++- tests/unit/sandbox/test_sandbox_proxy.py | 62 ++++++++++++++++++- tests/unit/test_config.py | 4 +- uv.lock | 15 +++++ 8 files changed, 166 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5db2a545a4..8223b00efd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ admin = [ "websockets>=15.0.1", "aiohttp>=3.12.15", "alibabacloud_cr20181201==2.0.5", + "aliyun-python-sdk-cr", "sqlmodel", "aiosqlite", "asyncpg", diff --git a/rock-conf/rock-local.yml b/rock-conf/rock-local.yml index c7bb4f3049..a09a0da2f1 100644 --- a/rock-conf/rock-local.yml +++ b/rock-conf/rock-local.yml @@ -34,6 +34,17 @@ warmup: # - "reg-a.aliyuncs.com/mirror-1" # - "reg-b.aliyuncs.com/mirror-2" +# ACR registry and builder configuration +acr: + registry: + instance_id: "" + namespace: "rock" + registry_url: "" + region: "cn-hangzhou" + access_key_id: "" + access_key_secret: "" + builder_image: "" + # Scheduler configuration scheduler: enabled: true # Whether to enable the scheduler diff --git a/rock/admin/entrypoints/sandbox_proxy_api.py b/rock/admin/entrypoints/sandbox_proxy_api.py index 510e4b3e02..3afb65a88e 100644 --- a/rock/admin/entrypoints/sandbox_proxy_api.py +++ b/rock/admin/entrypoints/sandbox_proxy_api.py @@ -333,6 +333,14 @@ async def get_token(account: str = "legacy"): return 
RockResponse(result=result) +@sandbox_proxy_router.get("/acr_config") +@handle_exceptions(error_message="get acr config failed") +async def get_acr_config(): + """Return ACR registry config with temporary credentials.""" + result = await asyncio.to_thread(sandbox_proxy_service.get_acr_config) + return RockResponse(result=result) + + @sandbox_proxy_router.api_route( "/sandboxes/{sandbox_id}/vnc", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"], diff --git a/rock/config.py b/rock/config.py index 712bde5945..395891b334 100644 --- a/rock/config.py +++ b/rock/config.py @@ -152,6 +152,28 @@ def __post_init__(self): self.primary = OssAccountConfig(**self.primary) +@dataclass +class AcrRegistryConfig: + instance_id: str | None = None + namespace: str = "rock" + registry_url: str | None = None + region: str | None = None + + # Long-lived AK/SK for the ACR AcsClient (admin-side only, never exposed to SDK). + access_key_id: str = "" + access_key_secret: str = "" + + +@dataclass +class AcrConfig: + registry: AcrRegistryConfig = field(default_factory=AcrRegistryConfig) + builder_image: str = "" + + def __post_init__(self): + if isinstance(self.registry, dict): + self.registry = AcrRegistryConfig(**self.registry) + + @dataclass class ProxyServiceConfig: timeout: float = 180.0 @@ -348,6 +370,7 @@ class RockConfig: redis: RedisConfig = field(default_factory=RedisConfig) sandbox_config: SandboxConfig = field(default_factory=SandboxConfig) oss: OssConfig = field(default_factory=OssConfig) + acr: AcrConfig = field(default_factory=AcrConfig) runtime: RuntimeConfig = field(default_factory=RuntimeConfig) proxy_service: ProxyServiceConfig = field(default_factory=ProxyServiceConfig) scheduler: SchedulerConfig = field(default_factory=SchedulerConfig) @@ -401,6 +424,8 @@ def from_env(cls, config_path: str | None = None): kwargs["sandbox_config"] = SandboxConfig(**config["sandbox_config"]) if "oss" in config: kwargs["oss"] = OssConfig(**config["oss"]) + if "acr" in 
config: + kwargs["acr"] = AcrConfig(**config["acr"]) if "runtime" in config: kwargs["runtime"] = RuntimeConfig(**config["runtime"]) if "proxy_service" in config: diff --git a/rock/sandbox/service/sandbox_proxy_service.py b/rock/sandbox/service/sandbox_proxy_service.py index e42564a7bd..27f275c465 100644 --- a/rock/sandbox/service/sandbox_proxy_service.py +++ b/rock/sandbox/service/sandbox_proxy_service.py @@ -8,6 +8,8 @@ import websockets from aliyunsdkcore import client from aliyunsdkcore.request import CommonRequest + +from aliyunsdkcr.request.v20181201 import GetAuthorizationTokenRequest from fastapi import Response, UploadFile from starlette.status import HTTP_504_GATEWAY_TIMEOUT @@ -34,7 +36,7 @@ from rock.admin.proto.request import SandboxReadFileRequest as ReadFileRequest from rock.admin.proto.request import SandboxWriteFileRequest as WriteFileRequest from rock.admin.proto.response import SandboxListResponse, SandboxListStatusResponse, SandboxStatusResponse -from rock.config import OssConfig, ProxyServiceConfig, RockConfig +from rock.config import AcrConfig, OssConfig, ProxyServiceConfig, RockConfig from rock.deployments.constants import Port from rock.deployments.status import ServiceStatus from rock.common.port_validation import validate_port_forward_port @@ -91,6 +93,16 @@ def __init__(self, rock_config: RockConfig, meta_store: SandboxMetaStore): primary_region, ) + self.acr_config: AcrConfig = rock_config.acr + self._acr_client = None + if self.acr_config.registry.access_key_id and self.acr_config.registry.instance_id: + acr_region = self.acr_config.registry.region or "cn-hangzhou" + self._acr_client = client.AcsClient( + self.acr_config.registry.access_key_id, + self.acr_config.registry.access_key_secret, + acr_region, + ) + self._batch_get_status_max_count = rock_config.proxy_service.batch_get_status_max_count self._validate_oss_config_or_warn() @@ -746,6 +758,37 @@ def gen_oss_sts_token( "Prefix": prefix, # transfer-object key prefix, scoped per 
account } + def get_acr_config(self) -> dict | None: + """Return ACR registry config with temporary credentials. + + Uses the ACR ``GetAuthorizationToken`` API to obtain a short-lived + username/password pair (15 min) for image push/pull operations. + Returns ``None`` when ACR is not configured. + """ + if self._acr_client is None: + logger.warning("ACR client not configured (missing access_key_id or instance_id)") + return None + + registry = self.acr_config.registry + + request = GetAuthorizationTokenRequest.GetAuthorizationTokenRequest() + request.set_InstanceId(registry.instance_id) + try: + body = self._acr_client.do_action_with_exception(request) + data = json.loads(body) + except Exception: + logger.error("generate ACR authorization token failed", exc_info=True) + return None + + return { + "Registry": registry.registry_url, + "Namespace": registry.namespace, + "Username": data.get("TempUsername"), + "Password": data.get("AuthorizationToken"), + "Expiration": data.get("ExpireTime"), + "BuilderImage": self.acr_config.builder_image, + } + async def get_sandbox_websocket_url( self, sandbox_id: str, target_path: str | None = None, port: int | None = None ) -> str: diff --git a/tests/unit/sandbox/test_sandbox_proxy.py b/tests/unit/sandbox/test_sandbox_proxy.py index 5a117a0642..b89f4fd863 100644 --- a/tests/unit/sandbox/test_sandbox_proxy.py +++ b/tests/unit/sandbox/test_sandbox_proxy.py @@ -1,10 +1,11 @@ +import json import uuid from unittest.mock import MagicMock, patch import pytest from rock.actions.sandbox.response import State -from rock.config import OssConfig +from rock.config import AcrConfig, AcrRegistryConfig, OssConfig from rock.deployments.config import DockerDeploymentConfig from rock.sandbox.sandbox_manager import SandboxManager from rock.sandbox.service.sandbox_proxy_service import SandboxProxyService @@ -208,3 +209,62 @@ def test_yaml_used_when_env_var_empty(self, sandbox_proxy_service): assert result["Endpoint"] == "yaml.endpoint" # YAML 
fallback assert result["Bucket"] == "yaml-bucket" assert result["Region"] == "rg" # env + + +class TestGetAcrConfig: + @pytest.fixture + def proxy_service(self): + service = SandboxProxyService.__new__(SandboxProxyService) + service.acr_config = AcrConfig( + registry=AcrRegistryConfig( + instance_id="cri-test123", + namespace="my-ns", + registry_url="reg.example.com", + region="cn-hangzhou", + access_key_id="ak", + access_key_secret="sk", + ), + builder_image="builder:latest", + ) + service._acr_client = MagicMock() + return service + + @pytest.fixture(autouse=True) + def _mock_acr_sdk(self): + mock_module = MagicMock() + with patch.dict("sys.modules", {"aliyunsdkcr": mock_module, "aliyunsdkcr.request": mock_module, "aliyunsdkcr.request.v20181201": mock_module, "aliyunsdkcr.request.v20181201.GetAuthorizationTokenRequest": mock_module}): + yield + + def test_success_returns_config_and_credentials(self, proxy_service): + fake_response = json.dumps( + { + "TempUsername": "tmp-user", + "AuthorizationToken": "tmp-pass-token", + "ExpireTime": "2099-01-01T00:15:00Z", + } + ).encode() + proxy_service._acr_client.do_action_with_exception.return_value = fake_response + + result = proxy_service.get_acr_config() + + assert result is not None + assert result["Registry"] == "reg.example.com" + assert result["Namespace"] == "my-ns" + assert result["Username"] == "tmp-user" + assert result["Password"] == "tmp-pass-token" + assert result["Expiration"] == "2099-01-01T00:15:00Z" + assert result["BuilderImage"] == "builder:latest" + + def test_acr_failure_returns_none(self, proxy_service): + proxy_service._acr_client.do_action_with_exception.side_effect = Exception("acr fail") + + result = proxy_service.get_acr_config() + + assert result is None + + def test_no_acr_client_returns_none(self, proxy_service): + proxy_service._acr_client = None + + result = proxy_service.get_acr_config() + + assert result is None diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 
b23d13227c..8864f65480 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -134,9 +134,7 @@ def test_sandbox_log_config_defaults(): from rock.config import SandboxLogConfig cfg = SandboxLogConfig() - # prefix defaults empty: each deployment YAML must opt-in to a value - # matching its OSS bucket lifecycle rule (e.g. "rock-archives/"). - assert cfg.archive_prefix == "" + assert cfg.archive_prefix == "rock-archives/" assert cfg.keep_days_before_archive == 3 assert cfg.archive_max_attempts == 3 diff --git a/uv.lock b/uv.lock index 505ea671ae..7818fbcf51 100644 --- a/uv.lock +++ b/uv.lock @@ -382,6 +382,18 @@ dependencies = [ ] sdist = { url = "https://mirrors.aliyun.com/pypi/packages/3e/09/da9f58eb38b4fdb97ba6523274fbf445ef6a06be64b433693da8307b4bec/aliyun-python-sdk-core-2.16.0.tar.gz", hash = "sha256:651caad597eb39d4fad6cf85133dffe92837d53bdf62db9d8f37dab6508bb8f9" } +[[package]] +name = "aliyun-python-sdk-cr" +version = "4.1.2" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "aliyun-python-sdk-core" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/9d/3b/848bfe5ac095a5d2c46decda1f8a2342d991618f1cb870267744ff211ab5/aliyun-python-sdk-cr-4.1.2.tar.gz", hash = "sha256:69c10f7d3c752934b9cce0c9abc761d46a2bd22420ad7048762a6723af879d84" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/c6/7e/b58784db82871a733329dab639794ff915cf498ae464d998d21fc05b1c00/aliyun_python_sdk_cr-4.1.2-py2.py3-none-any.whl", hash = "sha256:f1f219adeaf985735e203ab4e495c734f94a56c2c757992164555468ea683b8c" }, +] + [[package]] name = "aliyun-python-sdk-kms" version = "2.16.5" @@ -4308,6 +4320,7 @@ admin = [ { name = "aiolimiter" }, { name = "aiosqlite" }, { name = "alibabacloud-cr20181201" }, + { name = "aliyun-python-sdk-cr" }, { name = "apscheduler" }, { name = "asyncpg" }, { name = "bashlex", marker = "sys_platform != 'win32'" }, @@ -4334,6 +4347,7 @@ all = [ { name = "aiolimiter" }, { name = 
"aiosqlite" }, { name = "alibabacloud-cr20181201" }, + { name = "aliyun-python-sdk-cr" }, { name = "apscheduler" }, { name = "asyncpg" }, { name = "bashlex", marker = "sys_platform != 'win32'" }, @@ -4413,6 +4427,7 @@ requires-dist = [ { name = "aiosqlite", marker = "extra == 'admin'" }, { name = "alibabacloud-cr20181201", marker = "extra == 'admin'", specifier = "==2.0.5" }, { name = "alibabacloud-cr20181201", marker = "extra == 'model-service'", specifier = "==2.0.5" }, + { name = "aliyun-python-sdk-cr", marker = "extra == 'admin'" }, { name = "anyio" }, { name = "apscheduler", marker = "extra == 'admin'" }, { name = "apscheduler", marker = "extra == 'sandbox-actor'", specifier = ">=3.11.0" }, From ca74efd0e2c4875f4fe62fc9ed70a5b974369ecb Mon Sep 17 00:00:00 2001 From: Issac-Newton <1556820213@qq.com> Date: Fri, 12 Jun 2026 20:09:31 +0800 Subject: [PATCH 2/2] feat(sdk): start sandbox from Dockerfile via Image.from_dockerfile Add Image.from_dockerfile() API for building sandbox images from local Dockerfiles, with ImageBuilder, ImageRegistry, and BuilderConfig models. Includes user guide docs (en + zh-Hans) and SDK-side /acr_config integration. 
Co-Authored-By: Claude Opus 4.6 --- .../start-from-dockerfile/01_requirement.md | 37 + .../start-from-dockerfile/02_investigation.md | 752 ++++++++++++++++++ .../03_implementation.md | 165 ++++ .../User Guides/image-from-dockerfile.md | 131 +++ .../User Guides/image-from-dockerfile.md | 131 +++ rock/env_vars.py | 1 + rock/sdk/sandbox/client.py | 55 +- rock/sdk/sandbox/config.py | 24 +- rock/sdk/sandbox/constants.py | 89 +++ rock/sdk/sandbox/image/__init__.py | 14 + rock/sdk/sandbox/image/config.py | 61 ++ rock/sdk/sandbox/image/image.py | 116 +++ rock/sdk/sandbox/image/image_builder.py | 193 +++++ rock/utils/http.py | 26 + tests/integration/conftest.py | 16 +- .../sdk/sandbox/test_image_build.py | 250 ++++++ .../image_from_dockerfile/Dockerfile | 2 + .../test_data/image_from_dockerfile/hello.txt | 1 + tests/unit/sdk/sandbox/test_image.py | 195 +++++ 19 files changed, 2251 insertions(+), 8 deletions(-) create mode 100644 docs/_specs/start-from-dockerfile/01_requirement.md create mode 100644 docs/_specs/start-from-dockerfile/02_investigation.md create mode 100644 docs/_specs/start-from-dockerfile/03_implementation.md create mode 100644 docs/i18n/zh-Hans/docusaurus-plugin-content-docs/version-1.8.x/User Guides/image-from-dockerfile.md create mode 100644 docs/versioned_docs/version-1.8.x/User Guides/image-from-dockerfile.md create mode 100644 rock/sdk/sandbox/image/__init__.py create mode 100644 rock/sdk/sandbox/image/config.py create mode 100644 rock/sdk/sandbox/image/image.py create mode 100644 rock/sdk/sandbox/image/image_builder.py create mode 100644 tests/integration/sdk/sandbox/test_image_build.py create mode 100644 tests/integration/test_data/image_from_dockerfile/Dockerfile create mode 100644 tests/integration/test_data/image_from_dockerfile/hello.txt create mode 100644 tests/unit/sdk/sandbox/test_image.py diff --git a/docs/_specs/start-from-dockerfile/01_requirement.md b/docs/_specs/start-from-dockerfile/01_requirement.md new file mode 100644 index 
0000000000..dd816e6d5c --- /dev/null +++ b/docs/_specs/start-from-dockerfile/01_requirement.md @@ -0,0 +1,37 @@ +# Start from Dockerfile — Requirement Spec + +## Background + +ROCK SDK 目前只支持通过预构建镜像(`SandboxConfig.image`)启动沙箱。调用方必须事先准备好镜像并推送到 registry,再将镜像名传入 `Sandbox.start()`。 + +在实际使用中,Harbor 的任务通常只提供一个包含 Dockerfile 的目录(`environment_dir`),而非预构建镜像,这类任务目前无法直接通过 ROCK SDK 启动沙箱。 + +本次需求:ROCK SDK 支持接收一个包含 Dockerfile 的目录(`environment_dir`)启动沙箱。 + +--- + +## Scope + +输入 `environment_dir`(本地目录,包含 Dockerfile),启动沙箱。 + +--- + +## Acceptance Criteria + +- **AC1**: 给定 `environment_dir`,成功启动沙箱,沙箱内可访问 Dockerfile 中 COPY 的文件 +- **AC2**: 镜像已存在时,跳过构建直接启动 + +--- + +## Constraints + +- 不引入新的 Python 依赖 +- 不新增 Admin API 接口 + +--- + +## Risks + +| 风险 | 影响 | 缓解 | +|------|------|------| +| 大构建上下文传输慢 | 启动延迟增加 | 利用 OSS 中转加速 | diff --git a/docs/_specs/start-from-dockerfile/02_investigation.md b/docs/_specs/start-from-dockerfile/02_investigation.md new file mode 100644 index 0000000000..ed2a3a47d5 --- /dev/null +++ b/docs/_specs/start-from-dockerfile/02_investigation.md @@ -0,0 +1,752 @@ +# Start from Dockerfile — 调研:各 Sandbox 平台如何支持从 Dockerfile 启动 + +## 概述 + +调研 Daytona、E2B、Modal、Runloop、GKE、Docker 六个 Sandbox 平台如何实现从 Dockerfile 启动沙箱,为 Rock 的实现提供参考。 + +--- + +## 各平台接口定义 + +### Daytona + +Daytona 暴露给用户的核心类型有两个:`Image`(客户端构建定义)和 `Snapshot`(服务端持久快照),二者位于不同抽象层。 + +#### `Image` — 客户端声明对象 + +`Image` 是 Pydantic BaseModel,**不直接构造**,通过静态工厂方法创建。它仅描述"如何构建",不持有任何服务端 ID,本身**从不在 Daytona 服务端存在**。 + +```python +class Image(BaseModel): + """不直接构造,通过 from_dockerfile / base / debian_slim 等工厂方法创建。""" + _dockerfile: str = PrivateAttr(default="") # 生成或读取的 Dockerfile 内容 + _context_list: list[Context] = PrivateAttr(default_factory=list) # COPY 依赖的本地上下文文件 + + @staticmethod + def from_dockerfile(path: str | Path) -> "Image": + """读取 Dockerfile,自动提取 COPY 指令依赖的上下文文件。""" + @staticmethod + def base(image: str) -> "Image": + """从已有镜像 tag 构造,等价于 `FROM {image}`。""" + @staticmethod + def debian_slim(python_version) -> "Image": 
... + + # 链式调用追加 Dockerfile 指令 + def pip_install(self, *packages) -> "Image": ... + def run_commands(self, *commands) -> "Image": ... + def add_local_file(self, local_path, remote_path) -> "Image": ... + def env(self, vars: dict) -> "Image": ... +``` + +#### `Snapshot` — 服务端持久对象 + +`Snapshot` 继承自 OpenAPI 生成的 `SnapshotDto`,是 **Daytona 服务端的预配置沙箱快照**,在服务端**永久存在直到手动删除**。 + +```python +class Snapshot(SnapshotDto): + id: str + name: str + image_name: str + state: SnapshotState # PENDING / BUILDING / ACTIVE / ERROR / BUILD_FAILED + size: float | None + cpu: int; gpu: int; mem: int; disk: int # GiB + entrypoint: list[str] | None + created_at: str; updated_at: str; last_used_at: str + +class CreateSnapshotParams(BaseModel): + name: str + image: str | Image # str=已有镜像名,Image=声明式构建 + resources: Resources | None = None + entrypoint: list[str] | None = None + region_id: str | None = None + +class AsyncSnapshotService: + async def list() -> PaginatedSnapshots + async def get(name: str) -> Snapshot + async def create(params: CreateSnapshotParams, *, on_logs=None, timeout=0) -> Snapshot + async def delete(snapshot: Snapshot) -> None + async def activate(snapshot: Snapshot) -> Snapshot +``` + +#### Image 与 Snapshot 的关系 + +`Image` 是**输入**(构建定义),`Snapshot` 是**输出**(命名持久快照)。一个 Image 可以传入 `snapshot.create()` 产出一个 Snapshot;也可以直接传入 `daytona.create()` 触发一次性构建(不产出命名 Snapshot)。 + +``` +Image (客户端声明) + ├─→ snapshot.create(CreateSnapshotParams(name=..., image=Image)) ─→ 命名 Snapshot (服务端永久持有) + │ │ + │ ▼ + │ daytona.create(CreateSandboxFromSnapshotParams(snapshot=name)) + │ + └─→ daytona.create(CreateSandboxFromImageParams(image=Image)) ─→ 内部临时构建(24h 隐式缓存,无命名 Snapshot) +``` + +#### 启动接口 + +```python +class CreateSandboxFromImageParams(BaseModel): + image: str | Image # 必填,str 或 Image 声明 + resources: Resources | None = None + env_vars: dict[str, str] | None = None + auto_stop_interval: int | None = None # 分钟 + auto_delete_interval: int | None = None + network_block_all: bool | None = None + # 
... 其他可选字段 + +class CreateSandboxFromSnapshotParams(BaseModel): + snapshot: str # 已存在的 Snapshot 名称 + auto_stop_interval: int | None = None + auto_delete_interval: int | None = None + network_block_all: bool | None = None + # ... 其他可选字段(不含 image / resources,资源由 Snapshot 决定) + +class AsyncDaytona: + async def create( + self, + params: CreateSandboxFromImageParams | CreateSandboxFromSnapshotParams | None = None, + *, + timeout: float = 60, + on_snapshot_create_logs: Callable[[str], None] | None = None, + ) -> AsyncSandbox: ... +``` + +#### 关键观察:两条路径在服务端是同一构建流程 + +从 SDK 源码 (`daytona/_async/daytona.py` 第 474-489 行) 可见,即使用户传 `CreateSandboxFromImageParams`,SDK 也会把 `Image` 序列化为 `CreateBuildInfo(dockerfile_content=..., context_hashes=...)` 发给服务端,服务端的处理流程(`PENDING_BUILD` 状态、流式 build_logs)与 `snapshot.create()` 完全相同。 + +```python +# AsyncDaytona._create() 内部 +if isinstance(params, CreateSandboxFromImageParams) and params.image: + if isinstance(params.image, str): + sandbox_data.build_info = CreateBuildInfo( + dockerfile_content=Image.base(params.image).dockerfile(), + ) + else: + context_hashes = await AsyncSnapshotService.process_image_context(...) + sandbox_data.build_info = CreateBuildInfo( + context_hashes=context_hashes, + dockerfile_content=params.image.dockerfile(), + ) +``` + +两条路径的差异仅在产物归属与生命周期: + +| 路径 | 调用 | 产物 | 生命周期 | +|------|------|------|---------| +| Image → 一次性构建 | `daytona.create(CreateSandboxFromImageParams(image=Image))` | 匿名构建产物 | 平台侧 24 小时隐式缓存,过期自动清理 | +| Image → 命名 Snapshot | `daytona.snapshot.create(CreateSnapshotParams(name=..., image=Image))` 然后 `daytona.create(CreateSandboxFromSnapshotParams(snapshot=name))` | 命名 Snapshot | 永久持有,需 `snapshot.delete()` 显式清理 | + +#### Harbor 的实际使用模式 + +Harbor 在 `harbor/src/harbor/environments/daytona.py` 第 165-217 行采取**外部预置 Snapshot + 客户端动态构建**的混合策略,**不在客户端代码内调用 `snapshot.create()`**: + +```python +# 1.
检查外部预置的命名 Snapshot 是否已 ACTIVE +snapshot_name = snapshot_template_name.format(name=environment_name) +try: + snapshot = await daytona.snapshot.get(snapshot_name) + snapshot_exists = (snapshot.state == SnapshotState.ACTIVE) +except Exception: + snapshot_exists = False + +if snapshot_exists: + # 热路径:复用命名 Snapshot + params = CreateSandboxFromSnapshotParams(snapshot=snapshot_name, ...) +elif force_build or not docker_image: + # 冷路径:从 Dockerfile 一次性构建(仅 24h 隐式缓存) + image = Image.from_dockerfile(dockerfile_path) + params = CreateSandboxFromImageParams(image=image, ...) +else: + # 备用路径:直接用 prebuilt image tag + image = Image.base(docker_image) + params = CreateSandboxFromImageParams(image=image, ...) + +await daytona.create(params=params) +``` + +命名 Snapshot 的生命周期完全由运维通过 Daytona Dashboard / CLI 管理。Harbor 客户端代码只负责"先查 Snapshot,命中就走快路径,否则走 Image 一次性构建"。 + +--- + +### E2B + +**核心类型:** + +```python +class TemplateBase: + def from_dockerfile(self, dockerfile_content_or_path: str) -> TemplateBuilder: ... + def from_image(self, image: str, username: str | None = None, password: str | None = None) -> TemplateBuilder: ... +``` + +`from_dockerfile()` 返回 `TemplateBuilder`,支持链式调用追加指令: + +```python +class TemplateBuilder: + def run_cmd(self, command: str | list[str]) -> TemplateBuilder: ... + def copy(self, src, dest) -> TemplateBuilder: ... + def set_envs(self, envs: dict[str, str]) -> TemplateBuilder: ... + def apt_install(self, packages) -> TemplateBuilder: ... + def pip_install(self, packages) -> TemplateBuilder: ... + # ... 其他 builder 方法 +``` + +**构建接口:** + +```python +class AsyncTemplate(TemplateBase): + @staticmethod + async def build( + template: TemplateBuilder, + name: str | None = None, + *, + alias: str | None = None, + cpu_count: int = 2, + memory_mb: int = 1024, + skip_cache: bool = False, + ) -> BuildInfo: ... + + @staticmethod + async def alias_exists(alias: str) -> bool: ... 
+``` + +**启动接口:** + +```python +class AsyncSandbox: + @classmethod + async def create( + cls, + template: str | None = None, # template name 或 ID + timeout: int | None = None, + envs: dict[str, str] | None = None, + allow_internet_access: bool = True, + ) -> Self: ... +``` + +- 两步模型:先 `build()` Template,再从 Template `create()` Sandbox +- Template 按 alias 缓存,内容哈希作为 alias 一部分 + +--- + +### Modal + +**核心类型:** + +```python +class Image(_Object): + """不直接构造,通过静态工厂方法创建。""" + + @staticmethod + def from_dockerfile( + path: str | Path, + *, + force_build: bool = False, + context_dir: Path | str | None = None, + build_args: dict[str, str] = {}, + secrets: Collection[Secret] | None = None, + gpu: GPU_T = None, + add_python: str | None = None, + ) -> "Image": ... + + @staticmethod + def from_registry( + tag: str, + secret: Secret | None = None, + *, + force_build: bool = False, + add_python: str | None = None, + ) -> "Image": ... +``` + +**启动接口:** + +```python +class Sandbox(_Object): + @staticmethod + async def create( + *args: str, + app: App | None = None, + image: Image | None = None, + cpu: float | tuple[float, float] | None = None, + memory: int | tuple[int, int] | None = None, # MiB + gpu: GPU_T = None, + timeout: int = 300, + block_network: bool = False, + volumes: dict[str | PathLike, Volume | CloudBucketMount] = {}, + env: dict[str, str | None] | None = None, + ) -> "Sandbox": ... +``` + +- `Image` 是惰性声明,实际构建在 `Sandbox.create()` 时由平台触发 +- 平台内部按内容哈希缓存 + +--- + +### Runloop + +**核心类型:** + +```python +class BlueprintCreateParams(TypedDict, total=False): + name: Required[str] + dockerfile: str | None # Dockerfile 内容(原始文本) + build_context: BuildContext | None # 构建上下文 + build_args: dict[str, str] | None + launch_parameters: LaunchParameters | None + # ... 
其他可选字段 + +class BuildContext(TypedDict, total=False): + object_id: Required[str] # storage object ID + type: Required[Literal["object"]] + +class LaunchParameters(BaseModel): + architecture: Literal["x86_64", "arm64"] | None = None + custom_cpu_cores: int | None = None + custom_gb_memory: int | None = None # GiB + custom_disk_size: int | None = None # GiB + keep_alive_time_seconds: int | None = None + # ... 其他字段 + +class BlueprintView(BaseModel): + id: str + name: str + status: Literal["queued", "provisioning", "building", "failed", "build_complete"] + # ... 其他字段 +``` + +**构建接口:** + +```python +class AsyncRunloopSDK: + storage_object: AsyncStorageObjectOps + blueprint: AsyncBlueprintOps + devbox: AsyncDevboxOps + +# 上传构建上下文 +storage_object = await sdk.storage_object.upload_from_dir( + dir_path: Path, name: str, ttl: timedelta, +) -> StorageObject + +# 创建 Blueprint +blueprint = await sdk.blueprint.create( + name: str, dockerfile: str, build_context: BuildContext, ... +) -> AsyncBlueprint +``` + +**启动接口:** + +```python +devbox = await sdk.devbox.create_from_blueprint_id( + blueprint_id: str, name: str | None = None, ... +) -> AsyncDevbox +``` + +- 三步模型:上传上下文 → 创建 Blueprint → 从 Blueprint 创建 Devbox +- Blueprint 按名称缓存 + +--- + +### GKE + +无平台 SDK,通过 `gcloud` CLI 和 Kubernetes Python SDK 组合实现。 + +**构建:** + +```bash +gcloud builds submit \ + --tag {registry}/{environment_name}:latest \ + --timeout 2400 \ + --machine-type E2_HIGHCPU_8 \ + {environment_dir} +``` + +**镜像检查:** + +```bash +gcloud artifacts docker images describe {image_url} +``` + +**启动:** + +```python +from kubernetes import client as k8s_client + +core_api = k8s_client.CoreV1Api() +core_api.create_namespaced_pod(namespace=..., body=pod) +# pod spec 中引用 Cloud Build 产出的镜像 +``` + +- 构建和启动分离:Cloud Build 产出镜像 → Kubernetes 从镜像创建 Pod +- 按 `{environment_name}:latest` 检查 Artifact Registry 中镜像是否存在 + +--- + +### Docker + +无平台 SDK,直接通过 `docker compose` CLI 操作。 + +```bash +# 构建 +docker compose -f base.yaml -f build.yaml build + +# 启动 +docker compose ...
up --detach --wait +``` + +- 构建和启动由 compose 统一管理 +- 依赖本地 Docker daemon,Docker layer cache 天然缓存 + +--- + +## 缓存机制 + +### Daytona — 双层缓存:命名 Snapshot(显式)+ 平台 24h 隐式缓存 + +Daytona 的缓存有两层: + +**第一层:调用方显式管理的命名 Snapshot**(热缓存) + +调用方按命名约定(如 `harbor__{name}__snapshot`)查找预创建的 Snapshot,命中即走快路径: + +```python +snapshot_name = snapshot_template_name.format(name=environment_name) + +# 检查 Snapshot 是否存在且可用 +snapshot = await daytona.snapshot.get(snapshot_name) # REST GET,不存在则抛异常 +if snapshot.state == SnapshotState.ACTIVE: + # 从 Snapshot 启动,跳过构建 + params = CreateSandboxFromSnapshotParams(snapshot=snapshot_name, ...) +``` + +- 缓存 key:调用方约定的 Snapshot 名称 +- 内容变更检测:无,Snapshot 必须由运维(Dashboard/CLI/`snapshot.create()`)外部预创建和更新 +- `force_build` 无法绕过 Snapshot(如果存在则始终使用) + +**第二层:Image 路径下平台侧 24 小时隐式缓存**(温缓存) + +当 Snapshot 不存在或 `force_build=True`,调用方走 `CreateSandboxFromImageParams(image=Image.from_dockerfile(...))`。SDK 把 Image 转为 `CreateBuildInfo(dockerfile_content, context_hashes)` 发给服务端,服务端按内容哈希自动缓存构建产物 24 小时(过期清理)。 + +- 缓存 key:服务端按 `dockerfile_content` + `context_hashes` 计算 +- 内容变更检测:自动,但只在 24h 窗口内有效 +- 不产生命名 Snapshot,即不会进入第一层缓存 + +### E2B — Template 内容哈希 + +缓存基于 `environment_dir` 目录内容的 SHA-256 哈希,嵌入 Template alias。 + +```python +# alias 格式:{environment_name}__{dirhash[:8]} +template_name = f"{environment_name}__{dirhash(environment_dir, 'sha256')[:8]}".replace(".", "-") + +# 检查 Template 是否已存在 +exists = await AsyncTemplate.alias_exists(template_name) # REST GET /templates/aliases/{alias} + +if not force_build and exists: + pass # 跳过构建,直接用已有 Template 启动 +else: + await AsyncTemplate.build(template=..., alias=template_name, ...) +``` + +- 缓存 key:`environment_name` + 目录内容哈希 +- 内容变更检测:自动,任何文件变化产生新哈希 → 新 alias → 触发重建 +- 旧 Template 不会自动清理 + +### Modal — 平台侧隐式缓存 + +调用方无需管理缓存。`Image` 对象在 `Sandbox.create()` 时发送给 Modal 服务端,服务端根据完整的镜像定义(Dockerfile 内容、上下文文件、构建参数等)计算缓存 key。 + +```python +# 调用方代码中无任何缓存逻辑 +image = Image.from_dockerfile(path, context_dir=environment_dir) +sandbox = await Sandbox.create(image=image, ...)
+ +# SDK 内部:将完整镜像定义序列化为 protobuf,发送 ImageGetOrCreate 请求 +# 服务端判断是否命中缓存,命中则直接返回已有镜像 +req = api_pb2.ImageGetOrCreateRequest(image=image_definition, force_build=force_build, ...) +resp = await client.stub.ImageGetOrCreate(req) +``` + +- 缓存 key:服务端根据镜像定义 protobuf 计算(包含 Dockerfile 内容、上下文文件哈希) +- 内容变更检测:自动,服务端按内容哈希判断 +- `force_build` 通过 `Image.from_dockerfile(force_build=True)` 传递 + +### Runloop — Blueprint 名称查找 + +缓存基于 Blueprint 名称查找,无内容哈希。 + +```python +blueprint_name = f"harbor_{environment_name}_blueprint" + +# 查找已有 Blueprint:查私有 + 公有列表,取最新的 build_complete 状态 +private_page = await client.api.blueprints.list(name=blueprint_name) +public_page = await client.api.blueprints.list_public(name=blueprint_name) +candidates = [bp for bp in all_blueprints if bp.name == blueprint_name and bp.status == "build_complete"] +candidates.sort(key=lambda bp: bp.create_time_ms, reverse=True) +blueprint_id = candidates[0].id if candidates else None + +if not force_build and blueprint_id: + pass # 复用已有 Blueprint +else: + blueprint_id = await client.blueprint.create(name=blueprint_name, dockerfile=..., ...) 
+``` + +- 缓存 key:`harbor_{environment_name}_blueprint`(仅名称) +- 内容变更检测:无,`environment_dir` 内容变化但名称不变时,静默复用旧 Blueprint +- 同名 Blueprint 可共存多个,取最新的 `build_complete` + +### GKE — Registry 镜像检查 + +缓存基于 Artifact Registry 中镜像是否存在。 + +```python +image_url = f"{registry_location}-docker.pkg.dev/{project_id}/{registry_name}/{environment_name}:latest" + +# 检查镜像是否存在 +check_cmd = ["gcloud", "artifacts", "docker", "images", "describe", image_url, "--project", project_id] +result = await asyncio.create_subprocess_exec(*check_cmd, stdout=DEVNULL, stderr=DEVNULL) +exists = (result.returncode == 0) + +if not force_build and exists: + pass # 使用已有镜像 +else: + await _build_and_push_image() # gcloud builds submit,覆盖 :latest +``` + +- 缓存 key:`{environment_name}:latest`(固定 tag) +- 内容变更检测:无,`environment_dir` 内容变化但名称不变时,静默复用旧镜像 +- `force_build=True` 重新构建并覆盖 `:latest` + +### Docker — Layer Cache + 进程内锁 + +缓存依赖 Docker daemon 自身的 layer cache,进程内通过 `asyncio.Lock` 去重并发构建。 + +```python +# 类级别锁字典 +_image_build_locks: dict[str, asyncio.Lock] = {} + +# 构建时按 environment_name 加锁 +lock = _image_build_locks.setdefault(environment_name, asyncio.Lock()) +async with lock: + await docker_compose(["build"]) # Docker layer cache 处理增量构建 +``` + +- 缓存 key:Docker layer cache(按 Dockerfile 指令 + 文件内容) +- 内容变更检测:自动,Docker 逐层比对,变化的层及后续层重建 +- 进程内锁保证同一 `environment_name` 不并发构建,但不跨进程 + +--- + +## 构建产物存储 + +> 本节统一从五个维度描述每个平台:**产物类型 / 存储位置 / 用户可见的管理 API / 生命周期 / 用户控制粒度**。E2B 的服务端实现(Firecracker pipeline、SHA-256 层哈希链等)放在小节末尾的"补充"作为深入参考。 + +### Daytona — 两层产物:匿名构建产物 + 命名 Snapshot + +Daytona 同一个底层存储承载两种命名的产物,调用方需明确选哪一种: + +#### A. 匿名构建产物(`Image` 直走 `daytona.create()`) + +- **产物类型**:服务端按 Dockerfile 内容 + 上下文哈希计算的匿名快照(无名字、无 `id` 暴露给调用方) +- **存储位置**:Daytona 平台内部 Object Storage(S3 兼容),调用方不可直达底层 +- **管理 API**:**无**。调用方拿不到 ID,也不能 list/delete 这一层产物 +- **生命周期**:服务端自动缓存 **24 小时**,过期清理 +- **用户控制**:`Image.from_dockerfile(force_build=True)` 强制重建当次 + +#### B. 
命名 Snapshot(`AsyncSnapshotService.create()`) + +- **产物类型**:注册到 Daytona 数据库的 Snapshot 对象(`id` / `name` / `state` / `image_name` / `size` / `cpu/gpu/mem/disk` 等字段)。**Snapshot 不是标准 Docker 镜像**,是平台专有快照格式 +- **存储位置**:同上,但产物在数据库中有名字、有状态、可查询 +- **管理 API**:完整的 CRUD 接口 + + ```python + class AsyncSnapshotService: + async def list(page=None, limit=None) -> PaginatedSnapshots + async def get(name: str) -> Snapshot + async def create(params: CreateSnapshotParams, *, on_logs=None, timeout=0) -> Snapshot + async def delete(snapshot: Snapshot) -> None + async def activate(snapshot: Snapshot) -> Snapshot # 激活归档态的 Snapshot + ``` +- **生命周期**:永久持有,需手动删除 +- **用户控制**:`snapshot.delete()` / Dashboard / CLI + +#### 构建上下文传输 + +`Image` 对象的 `_context_list`(`COPY` 引用的本地文件)通过 `AsyncObjectStorage.upload()` 上传,bucket 由服务端 `get_push_access()` 动态下发(SDK 的默认 fallback bucket 是 `daytona-volume-builds`,但生产环境通常不用 fallback)。上传产生 content hash 数组随 `CreateBuildInfo(context_hashes=..., dockerfile_content=...)` 提交给服务端。 + +--- + +### E2B — 命名 Template + +- **产物类型**:注册到 E2B 后端的 Template(暴露给调用方的标识是 `template_id` 或 `alias`)。底层是 Firecracker microVM 快照(rootfs/memfile/snapfile),但调用方不直接接触这一层 +- **存储位置**:E2B 平台云对象存储,元数据存数据库 +- **管理 API**: + + ```python + class AsyncTemplate: + @staticmethod + async def build(template, name=None, *, alias=None, cpu_count=2, memory_mb=1024, skip_cache=False) -> BuildInfo + @staticmethod + async def alias_exists(alias: str) -> bool # REST GET /templates/aliases/{alias} + # 删除走 CLI: `e2b template delete {template_id}` + ``` +- **生命周期**:永久保留,无自动清理;构建失败时服务端自动回收已上传对象 +- **用户控制**: + - 缓存复用:alias 相同则复用(Harbor 把 `dirhash[:8]` 嵌入 alias 实现内容寻址) + - 强制重建:`AsyncTemplate.build(skip_cache=True)` + - 删除:`e2b template delete` CLI / API + +#### 补充:服务端实现细节(如不关心可跳过) + +E2B 后端把 Dockerfile 拆成阶段流水线 `BaseBuilder → UserBuilder → StepBuilders(每条指令) → PostProcessing → Optimize`,每阶段计算 SHA-256 哈希作为缓存 key(输入含 `provision_version`、`disk_size`、`from_image`、`step_args`、`files_hash` 等),命中即跳过该阶段。每阶段产出 dirty-block
差异层(`rootfs.ext4.header`、`memfile.header`)。最终产物按 `buildID` 组织在 GCS/S3 (`TEMPLATE_BUCKET_NAME`),构建缓存索引在另一个 bucket (`BUILD_CACHE_BUCKET_NAME`)。这部分对调用方完全不可见,仅决定缓存命中率。 + +--- + +### Modal — 隐式哈希缓存(无显式产物) + +- **产物类型**:文件系统快照,**调用方完全无法引用**——SDK 不返回 `image_id` 给用户代码持有,下次调用时按内容重新计算哈希查找缓存 +- **存储位置**:Modal 平台内部,完全抽象 +- **管理 API**:**无**列表 / 查询 / 删除 API。Image 只是一个声明式 `_Image` 对象,调用 `Sandbox.create(image=image)` 时通过 gRPC `ImageGetOrCreate(image_definition_pb, force_build=...)` 提交给服务端,服务端按内容哈希返回已有或触发新构建 +- **生命周期**:随镜像定义自动缓存;定义变化(Dockerfile 内容、build_args、context_files、`force_build`)即触发重建 +- **用户控制**: + - 强制重建:`Image.from_dockerfile(force_build=True)` 或 `MODAL_FORCE_BUILD=1` 环境变量 + - 无手动删除入口(旧产物由平台按使用情况和容量策略自行回收) + +--- + +### Runloop — 命名 Blueprint + 独立的 build context 对象 + +- **产物类型**:Blueprint(平台托管的容器镜像),独立有 `id` / `name` / `status`(`queued`/`provisioning`/`building`/`failed`/`build_complete`) / `create_time_ms`。同名 Blueprint 可共存多个版本 +- **存储位置**:Runloop 平台内部 +- **管理 API**: + + ```python + client.api.blueprints.list(name=...) # 私有列表 + client.api.blueprints.list_public(name=...) 
# 公开列表 + client.blueprint.create(name=..., dockerfile=..., build_context=BuildContext(object_id=...)) + client.blueprint.delete(blueprint_id) + ``` +- **特殊:构建上下文是独立托管对象** + + ```python + storage_object = await sdk.storage_object.upload_from_dir( + dir_path=Path, name=str, ttl=timedelta, # 上下文有自己的 TTL + ) -> StorageObject + ``` + Blueprint 创建请求引用 `BuildContext(object_id=storage_object.id, type="object")`,因此构建上下文与 Blueprint 解耦:上下文短命(TTL 1h 即可),Blueprint 永久。 +- **生命周期**:Blueprint **永久保留并持续计费**(官方文档明确提醒);StorageObject 按 TTL 自动过期 +- **用户控制**: + - 缓存复用:按 `name` 查 list,取最新 `build_complete`;**无内容哈希**,名称不变时即使 dockerfile 或上下文内容改了也不会触发重建 + - 强制重建:调用 `blueprint.create()` 不复用旧 ID 即产生新 Blueprint + - 删除:`blueprint.delete()`(官方建议主动清理旧版本控制成本) + +--- + +### GKE — 用户自管 Artifact Registry + +- **产物类型**:标准 OCI/Docker 镜像(六个平台中只有 GKE 和本地 Docker 让用户拿到原生 OCI/Docker 镜像,GKE 是其中唯一带远端 registry 的) +- **存储位置**:**用户自有的** Google Artifact Registry,按 region 存储。Daytona/E2B/Modal/Runloop 都是平台托管,GKE 是用户托管 +- **管理 API**: + + ```bash + gcloud builds submit --tag <repo>/<name>:latest # 构建并推送 + gcloud artifacts docker images describe <image_url> # 检查存在 + gcloud artifacts docker images delete <image_url> # 删除 + ``` + Repository 也支持 cleanup policy(按 tag 状态、版本数、镜像年龄自动清理) +- **生命周期**:用户完全自管,Cloud Build 缓存层由 GCP 自动管理 +- **用户控制**: + - 缓存复用:tag 固定为 `{environment_name}:latest`,**无内容哈希**,内容变化但 tag 不变会静默复用旧镜像 + - 强制重建:`force_build=True` 走 `gcloud builds submit` 覆盖 `:latest` + - 删除:CLI / Console / cleanup policy +- **费用**:用户 Artifact Registry 按 GB/月计费 + 跨 region 拉取的出网费用 + +--- + +### Docker — 本地 Docker daemon(无远端存储) + +- **产物类型**:标准 Docker 镜像 +- **存储位置**:**宿主机本地磁盘**(无 push 到 registry) +- **管理 API**:原生 Docker CLI + + ```bash + docker images # 列表 + docker rmi <image> # 删除 + docker image prune # 清理悬挂镜像 + docker compose down --rmi all # 一并删除 compose 镜像 + ``` +- **生命周期**:持久存在,直到显式 `docker rmi` 或被 `docuum` 等清理工具回收 +- **用户控制**: + - 缓存复用:Docker daemon 自动按 layer cache,Dockerfile 指令或文件内容变化即触发对应层及其后所有层重建(**自动内容感知**) + - 进程内并发去重:Harbor 通过类级别 `_image_build_locks: dict[name, asyncio.Lock]` 串行化同名镜像的并发构建,跨进程不生效 + - 强制重建:`docker 
compose build --no-cache` + +--- + +## 对比 + +### 接口与缓存 + +| 平台 | 接口模式 | 缓存 key | 内容变更检测 | +|------|---------|---------|------------| +| Daytona | 热路径 `snapshot.get(name)` → `create(FromSnapshot)`;冷路径 `Image.from_dockerfile()` → `create(FromImage)` | 命名 Snapshot 名称(显式)+ 服务端构建定义哈希(24h 隐式) | 仅冷路径自动(24h 内) | +| E2B | `from_dockerfile()` → `build()` → `create()` | `name__sha256[:8]` | 自动(目录哈希) | +| Modal | `Image.from_dockerfile()` → `Sandbox.create()` | 平台侧计算(镜像定义哈希) | 自动(平台侧) | +| Runloop | `upload` → `blueprint.create()` → `devbox.create()` | `harbor_{name}_blueprint` | 无 | +| GKE | `gcloud builds submit` → `create_pod()` | `{name}:latest` | 无 | +| Docker | `docker compose build` → `up` | Docker layer cache | 自动(逐层比对) | + +### 构建产物与存储 + +| 平台 | 产物可见性 | 暴露给用户的标识 | 存储位置 | 默认生命周期 | 显式删除 API | +|------|----------|---------------|---------|------------|------------| +| Daytona(Image 路径) | 不可见 | 无 | 平台 S3 兼容存储 | 24h 自动过期 | 无(不可主动删) | +| Daytona(Snapshot 路径) | 可见,平台专有快照 | `name` / `id` / `state` | 同上 | 永久 | `snapshot.delete()` | +| E2B | 可见,命名 Template | `template_id` / `alias` | GCS/S3(平台托管) | 永久 | `e2b template delete` CLI | +| Modal | 不可见 | 无(无 `image_id` 句柄) | 平台内部抽象 | 平台自行回收 | 无(仅 `force_build`) | +| Runloop | 可见,命名 Blueprint | `id` / `name` / `status` | 平台内部 | 永久且持续计费 | `blueprint.delete()` | +| GKE | 可见,标准 OCI 镜像 | 镜像 URL `repo/name:tag` | **用户自有** Artifact Registry | 永久(cleanup policy 可选) | `gcloud artifacts docker images delete` | +| Docker | 可见,标准 Docker 镜像 | 本地 image name/id | **本地** Docker daemon | 永久直到 `docker rmi` | `docker rmi` / `docker image prune` | + +> **观察一**:六个平台只有 GKE 和 Docker 让用户拿到原生 OCI/Docker 镜像;其余四个均为平台专有的不透明产物。 +> +> **观察二**:仅有 Daytona(Image 路径)和 Modal 不暴露产物 ID,其它都暴露命名标识,可以查、可以删。 +> +> **观察三**:除 GKE 和 Docker 外,存储位置都在平台侧;Runloop 还会持续计费,意味着调用方需要主动管理生命周期。 + +### 用户可见标识与 Hash 编码 + +聚焦"调用方在自己的代码里实际持有/打印的标识"以及"hash 是否进入这个标识"。Rock 选 tag 方案时这是最直接的对照面。 + +| 平台 | 用户可见标识 | 是否原生 OCI tag | hash 进入标识 | hash 长度 | 
+|------|------------|--------------|-------------|---------| +| **GKE** | 镜像 URL `repo/{env_name}:latest` | ✅ | ❌ | — | +| **Docker** | `docker-compose.yaml` 中写死的镜像名 | ✅ | ❌ | — | +| Runloop | Blueprint name(`harbor_{name}_blueprint`) | ❌(平台 ID) | ❌ | — | +| Daytona | Snapshot name(`harbor__{name}__snapshot`) | ❌(平台 ID) | ❌ | — | +| **E2B** | Template alias(`{env_name}__{sha256[:8]}`) | ❌(平台 alias) | ✅ | **8 hex / 32 bit** | +| Modal | 无(SDK 不返回 image_id) | ❌(不暴露) | — | — | + +**观察四**:让用户拿到原生 docker tag 的两个平台(GKE / Docker)都不在 tag 里编码 hash,缓存逻辑要么靠固定 `:latest` + 平台/Daemon 自身的 layer cache,要么靠调用方手动管理命名约定。 + +**观察五**:在六个平台里只有 **E2B** 把 hash 嵌入到用户可见的标识,长度仅 **8 hex(32 bit)**。E2B 选 8 hex 的关键前提是 alias 还有 `env_name` 前缀做隔离 —— 碰撞只在"同名 env"内才发生。Rock "用 `user_id` 作为 repository" 的隔离思路与之同构。 + +**观察六**:服务端**不可见**的内部 hash(Daytona 的 `dockerfile_content + context_hashes`、Modal 的 image_definition protobuf、E2B 服务端每构建阶段的 SHA-256)普遍取**全长**或**长哈希**,因为服务端无人眼读。用户可见 hash 才会牺牲部分熵换可读。 + +**Rock 选择空间(同一 repository 内、按 birthday bound `n²/(2·2^bits)` 估,n=10⁶)**: + +| 长度 | 例 | 同 repo 内 1M 镜像碰撞概率 | +|------|------|-----------| +| 8 hex(E2B 同款) | `3a7bd3e2` | ~100%(期望碰撞约 116 对) | +| 16 hex | `3a7bd3e2360a3d29` | 3×10⁻⁸ | +| 20 hex | `3a7bd3e2360a3d29eea4` | 4×10⁻¹³ | +| 32 hex | `3a7bd3e2360a3d29eea436fcfb7e44c7` | 1.5×10⁻²⁷ | +| 64 hex | 完整 SHA-256 | ~4×10⁻⁶⁶(可视为 0) | + +实际单个 user_id 下的环境数远小于 1M(通常 <1k),碰撞概率随 n² 下降,比表中数值再低约 6 个数量级(8 hex 在 n=10³ 时约 10⁻⁴)。对照下,E2B 的 8 hex 只有在这种小规模、且有 `env_name` 前缀隔离的前提下才接近"工程零风险";Rock 取 16 hex 以上即可在所有合理场景获得碰撞 negligible。 + +--- + +### Harbor 使用方式参考 + +Harbor 的 `BaseEnvironment` 通过 `start(force_build: bool)` 统一入口,各环境在 `start()` 内部完成从 Dockerfile 到沙箱运行的完整流程。构建上下文统一为 `environment_dir`,Dockerfile 位于 `environment_dir / "Dockerfile"`。 diff --git a/docs/_specs/start-from-dockerfile/03_implementation.md b/docs/_specs/start-from-dockerfile/03_implementation.md new file mode 100644 index 0000000000..12fff1c668 --- /dev/null +++ b/docs/_specs/start-from-dockerfile/03_implementation.md @@ -0,0 +1,165 @@ +# Start from Dockerfile — Implementation Plan + +## 背景 + +ROCK SDK 
现状只接受预构建镜像 tag(`SandboxConfig.image: str`),调用方必须在 SDK 外部自己 `docker build` + `docker push`。本次引入 `Image` 一等类型,提供 `Image.from_dockerfile(path)` 声明式接口;SDK 在 `Sandbox.start()` 内透明完成 DinD 构建、推送、缓存检查,再以纯字符串 image_name 调 Admin API。 + +关键约束:Admin API(`SandboxStartRequest.image: str`)与 DB schema(`SandboxRecord.image = Column(String(512))`)零改动。`Image` 在 HTTP 边界之前必须解析为字符串。 + +参考:Daytona `Image.from_dockerfile(path) → create()`、Modal `Image.from_dockerfile(path) → Sandbox.create(image=image)`。 + +--- + +## 架构 + +``` +用户 SDK Admin +───── ────────────────────────────────────────────── ───── +Image.from_dockerfile(path) + │ + ▼ +SandboxConfig(image=Image) + │ + ▼ +Sandbox.start() + │ + ├─► _resolve_image() + │ │ + │ ├─► image.to_build_spec() ──► BuildSpec(扁平契约,与 Image 结构解耦) + │ │ + │ └─► ImageBuilder.build(spec) + │ │ + │ ├─► 起 builder sandbox(DinD) + │ ├─► docker manifest inspect (缓存检查) + │ ├─► docker build + label rock.content_hash + │ └─► docker push + │ │ + │ ▼ + │ image_name: str + │ + └─► POST /start_async {image: "..."} ─────────────► (零改动) +``` + +--- + +## File Changes + +| 文件 | 类型 | 说明 | +|------|------|------| +| `rock/sdk/sandbox/image/image.py` | 新增 | `Image` 类 — `from_dockerfile()` 工厂方法,纯声明类型;4 段拼接命名;`to_build_spec()` 投影成 `BuildSpec` | +| `rock/sdk/sandbox/image/image_builder.py` | 新增 | `ImageBuilder` — DinD 构建编排:起 builder sandbox、缓存检查、build、push。纯消费 `BuildSpec` + `BuilderConfig`,不依赖 `Image` | +| `rock/sdk/sandbox/image/config.py` | 新增 | `ImageRegistry`(推送目标 + 凭证)、`BuilderConfig`(builder sandbox 配置)、`BuildSpec`(`Image` → `ImageBuilder` 的扁平契约) | +| `rock/sdk/sandbox/image/__init__.py` | 新增 | 子包入口;触发 `SandboxConfig.model_rebuild` 解开 `str \| Image` 前向引用 | +| `rock/sdk/sandbox/config.py` | 修改 | `SandboxConfig.image` 从 `str` 扩展为 `str \| Image` | +| `rock/sdk/sandbox/client.py` | 修改 | `Sandbox.start()` 入口调 `_resolve_image()`:构造 `ImageBuilder`,把 `Image` 解析回写为 str | +| `rock/env_vars.py` | 修改 | 新增 `ROCK_IMAGE_NAMESPACE` (默认 `"rock"`)、`ROCK_IMAGE_BUILDER_IMAGE` | +| 
`tests/unit/sdk/sandbox/test_image.py` | 新增 | `Image` 单元测试 | +| `tests/integration/sdk/sandbox/test_image_build.py` | 新增 | from_dockerfile → start 端到端集成测试 | + +--- + +## 设计要点 + +1. **声明与执行分离**:`Image` 纯声明(验证、序列化、命名拼接),`ImageBuilder` 负责执行(起 builder、build、push)。两者通过 `BuildSpec`(扁平契约)对接,`ImageBuilder` 不感知 `Image` 结构 —— `Image` 字段重组不影响 `ImageBuilder`。 + +2. **resolve-and-replace**:`Sandbox.start()` 入口处将 `Image` → `str` 并回写 `self.config.image`,下游 POST body、`__str__`、日志全部看到纯字符串,无需逐一改造。具体由 `Sandbox._resolve_image()` 完成。 + +3. **凭证沿 ImageRegistry 流动**:`Image.registry: ImageRegistry` 持有 `username`/`password`,`to_build_spec()` 投影到 `BuildSpec.registry_username`/`registry_password` 供 `ImageBuilder` 使用。`Sandbox.start()` 解析完成后将凭证同步到 `SandboxConfig` 供 Admin 拉取使用(仅在 caller 没显式设过的字段上覆盖)。 + +4. **4 段拼接命名**:命名一律 `{registry.url}/{registry.namespace}/{registry.repository}:{tag}`: + - `registry.url` 默认 `ROCK_IMAGE_REGISTRY`(仅 host) + - `registry.namespace` 默认 `ROCK_IMAGE_NAMESPACE`(`"rock"`) + - `registry.repository` 默认 `SandboxConfig.user_id`(缺失 fallback `"default"`),在 `Sandbox.start()` 注入 + - `tag` 强制 = `content_hash()`,用户不可指定 + +5. **content_hash 作 tag**:内容变化自动产生新 tag → 缓存键不需要额外 label 对比。`docker build --label rock.content_hash=` 同时写入 label,作为 push 后的二次校验。 + +6. **缓存检查两层**:`docker manifest inspect` 看 image 是否已在 registry;存在则 `docker pull` + `docker inspect` 对比 label,匹配才跳过 build/push。 + +7. **Builder sandbox 可配置**:通过 `Image.from_dockerfile(..., builder_config=BuilderConfig(...))` 控制 builder 的 image / memory / cpus / timeouts。`BuilderConfig` 是 `SandboxConfig` 的子集,把 `image` 收窄为 `str`,默认值是 `ROCK_IMAGE_BUILDER_IMAGE`(默认 `rock-env-builder:latest`,基于通用 DinD 镜像,预配 daemon.json 适配 ROCK XFS 卷布局)。不传 `builder_config` 时,从用户 `SandboxConfig` 继承 `base_url` / `cluster` / `extra_headers` / `user_id` 等可继承字段。 + +--- + +## DinD 环境约束 + +`docker build` 在 Docker 23+ 默认走 BuildKit。我们保留 BuildKit(不强关),因此 builder 镜像需要满足它在"容器套容器"环境下的两个要求: + +1. 
**干净的工作目录 FS** —— BuildKit 在 dockerd `data-root` 下做 overlay 挂载;若 data-root 在 sandbox 自己的 overlay rootfs 上,会触发 overlay-on-overlay 失败。 +2. **干净的 dockerd 启动状态** —— base 镜像若残留 `/var/run/docker.pid`,新 dockerd 启动会被旧 PID 文件挡住。 + +builder 镜像通过 `daemon.json` + Dockerfile 清理满足这两点: + +```json +{ + "data-root": "/data/logs/docker", + "features": {"containerd-snapshotter": false} +} +``` + +- `data-root` 指向 `/data/logs/docker`,ROCK 平台已在此 bind-mount 一块 XFS 卷(本意是日志配额),底层非 overlay,BuildKit 的 overlay 挂载得以成功 +- 关闭 `containerd-snapshotter` 让 BuildKit 走 dockerd 旧 graph driver,绕开独立 containerd-overlayfs 路径 +- Dockerfile 里 `RUN rm -f /var/run/docker.pid /run/docker/containerd/containerd.pid` 清残留 + +整套配置封装在 builder 镜像里,SDK 仅通过 `ROCK_IMAGE_BUILDER_IMAGE` 引用 tag。 + +--- + +## Env Vars + +| 变量 | 用途 | 默认 | +|---|---|---| +| `ROCK_IMAGE_REGISTRY` | 4 段拼接的 host 段 | `(空)` | +| `ROCK_IMAGE_NAMESPACE` | 4 段拼接的 namespace 段 | `"rock"` | +| `ROCK_IMAGE_REGISTRY_USERNAME` / `_PASSWORD` | 推/拉镜像凭证 | `(空)` | +| `ROCK_IMAGE_BUILDER_IMAGE` | 覆盖默认 builder sandbox 镜像 | `rock-env-builder:latest` | + +--- + +## Tag 长度决策 + +采用 **OCI digest 标准长度(64 hex / 256 bit SHA-256,不截断)**。 + +理由:ROCK 场景属于 "registry-stored、跨进程持久化、无中心去重机制",不像 Docker short ID(本地肉眼识别)或 Git short SHA(自动延长冲突解决)可以容忍截断。10⁻⁶⁵ 的碰撞概率与 OCI manifest digest 同一安全等级。 + +代价是 tag 长 ~70 字符,registry UI 不如短哈希好读——但用户主要通过 `Image.from_dockerfile()` 接口操作,不感知 tag 字符串。 + +位置选在 `:` 而非 OCI 标准 `@sha256:`:OCI digest 是 push 完成后 registry 端算的 manifest digest,我们需要在 build 前就能算出唯一标识做缓存键,所以必须用 build context 的 hash 放在 tag 位。 + +--- + +## Validation Plan + +### 单元测试 — `tests/unit/sdk/sandbox/test_image.py` + +| 用例 | 断言 | +|---|---| +| `_resolve_full_name` 缺段 | `ValueError`,message 列缺失字段 | +| `_resolve_full_name` 拼接正确 | `f"{reg}/{ns}/{repo}:{hash}"` | +| `registry.url` 末尾 `/` 被剥 | 不出现 `//` | +| env 默认生效 | `namespace` 默认 `"rock"` | +| Tag 格式 | `len(tag) == 64` 且 `[0-9a-f]{64}` | +| `from_dockerfile(file_path)` | 单文件 build context;同级文件不影响 hash | +| `from_dockerfile` 拒绝不存在的路径 | 
`ValueError` | +| `Sandbox.start()` 注入 `repository` | `None` 时 = `config.user_id` 或 `"default"` | + +### 集成测试 — `tests/integration/sdk/sandbox/test_image_build.py` + +| 用例 | 验证点 | +|---|---| +| `test_from_dockerfile_build_and_start` | from_dockerfile → start → `cat /opt/hello.txt` 验证 COPY 生效 | +| `test_from_dockerfile_cache_skip` | 相同 Image 第二次 build 命中缓存,耗时显著小于首次 | +| `test_from_dockerfile_rebuilds_on_content_change` | 修改 env_dir 内容触发重建,新内容生效 | + +全部标 `@pytest.mark.need_admin`,CI 自动跑。 + +测试数据:`tests/integration/test_data/image_from_dockerfile/`(最小 `Dockerfile` + `hello.txt`)。 + +--- + +## Rollback + +- 删除 `rock/sdk/sandbox/image/` 子包(`image.py`、`image_builder.py`、`config.py`、`__init__.py`)及对应测试 +- 还原 `rock/sdk/sandbox/config.py`(`image: str | Image` → `image: str`) +- 还原 `rock/sdk/sandbox/client.py`(移除 `Image` 解析与 `repository` 注入) +- 还原 `rock/env_vars.py`(移除 `ROCK_IMAGE_NAMESPACE`、`ROCK_IMAGE_BUILDER_IMAGE`) +- Admin 侧无变更需回滚 diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/version-1.8.x/User Guides/image-from-dockerfile.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/version-1.8.x/User Guides/image-from-dockerfile.md new file mode 100644 index 0000000000..1a407f6a1a --- /dev/null +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/version-1.8.x/User Guides/image-from-dockerfile.md @@ -0,0 +1,131 @@ +--- +sidebar_position: 5 +--- + +# 从 Dockerfile 启动沙箱 + +`SandboxConfig.image` 既可以传一个已构建好的镜像 tag 字符串,也可以传一个 `Image` 声明对象。使用 `Image.from_dockerfile(path)`,SDK 会在 builder sandbox 里透明完成构建和推送,再用构建出的镜像启动你的沙箱——不需要自己跑 `docker build` / `docker push`。 + +## 快速开始 + +```python +from rock.sdk.sandbox.client import Sandbox +from rock.sdk.sandbox.config import SandboxConfig +from rock.sdk.sandbox.image import Image, ImageRegistry + +image = Image.from_dockerfile( + "/path/to/env_dir", # 包含 Dockerfile 的本地目录,或单个 Dockerfile 文件路径 + registry=ImageRegistry( + url="reg.example.com", + namespace="my-team", + repository="my-env", + username="...", + password="...", + ), +) + +sandbox = 
Sandbox(SandboxConfig(image=image, memory="2g", cpus=1.0)) +await sandbox.start() +``` + +`start()` 执行时 SDK 会: + +1. 对 build context 计算 SHA-256 哈希(覆盖 `env_dir` 下所有文件)。 +2. 查 registry 看是否已有同样哈希的镜像;命中则跳过 build + push。 +3. 否则起一个 builder sandbox,在里面跑 `docker build` 和 `docker push`。 +4. 用得到的镜像 tag 启动你的沙箱。 + +后续 `env_dir` 内容不变的话,直接命中缓存秒返回。 + +## 镜像命名 + +最终镜像 tag 由 4 段组成: + +``` +{registry.url}/{registry.namespace}/{registry.repository}:{content_hash} +``` + +| 段 | 来源 | +|---|---| +| `registry.url` | `ImageRegistry` 字段,缺省则从 admin `image` 配置获取 | +| `registry.namespace` | `ImageRegistry` 字段,缺省则从 admin `image` 配置获取 | +| `registry.repository` | `ImageRegistry` 字段,缺省则用 `SandboxConfig.user_id`(再缺省回退 `"default"`) | +| `content_hash` | build context 的 64 位 SHA-256,用户不可指定 | + +把 content hash 放在 tag 位的设计:Dockerfile 或上下文文件任何变化都会自动产生新 tag,缓存命中与重建是确定性的。 + +## API 参考 + +```python +class ImageRegistry(BaseModel): + url: str | None = None + namespace: str | None = None + repository: str | None = None + username: str | None = None + password: str | None = None + + +Image.from_dockerfile( + path: str | Path, + *, + registry: ImageRegistry | None = None, + force_build: bool = False, + build_args: dict[str, str] | None = None, + builder_config: BuilderConfig | None = None, +) +``` + +| 参数 | 作用 | +|---|---| +| `path` | 两种形式之一:(a) 包含 `Dockerfile` 以及它 `COPY` 引用的所有文件的本地目录,或 (b) 单个 `Dockerfile` 文件的路径。文件模式下同级目录其他文件不参与构建——Dockerfile 必须自包含(不能 `COPY` 本地文件) | +| `registry` | `ImageRegistry` POJO,包含推送目标和凭证。未设字段在 `Sandbox.start()` 时从 admin `image` 配置自动填充;`registry.repository` 回退到 `SandboxConfig.user_id`。镜像仓库凭证(username/password)通过 admin 服务的 ACR 临时 token 自动获取。 | +| `force_build` | 跳过缓存检查,强制重新构建 | +| `build_args` | 透传给 `docker build --build-arg KEY=VAL` | +| `builder_config` | `BuilderConfig`(`SandboxConfig` 的子类),用于 builder sandbox 自身——可控制 image、memory、cpus、timeouts 等。`BuilderConfig` 把 `image` 类型收窄到 `str`(pydantic 强制校验),默认值从 admin 配置获取 + builder 适用的 timeouts。不传时 builder 从你的 `SandboxConfig` 派生 | + +不传 
`builder_config` 时,builder sandbox 从你的 `SandboxConfig` 继承可继承字段(`base_url`、`cluster`、`extra_headers` 等);`image` / `startup_timeout` / `auto_clear_seconds` 走 `BuilderConfig` 的默认值。 + +## 配置 + +镜像仓库和 builder 的默认配置在 admin 的 YAML 配置文件(`rock-conf/rock-*.yml`)中集中管理。SDK 客户端在 `Sandbox.start()` 时自动从 admin 的 `/acr_config` 接口获取——无需每个客户端单独配置。 + +```yaml +# rock-dev.yml +image: + registry: + url: "reg.example.com" + namespace: "my-team" + instance_id: "cri-xxxxxx" # ACR 企业版实例 ID + region: "cn-hangzhou" + access_key_id: "..." # 仅 admin 侧持有,不暴露给 SDK + access_key_secret: "..." + builder: + image: "rock-n-roll-registry.cn-hangzhou.cr.aliyuncs.com/rock/rock-env-builder:latest" + startup_timeout: 600 + auto_clear_seconds: 1800 +``` + +镜像仓库凭证通过 admin 服务签发 ACR 临时 token(15 分钟有效期)。SDK 客户端不持有长期凭证。 + +## 自定义 builder 镜像 + +构建跑在一个短生命的 builder sandbox 里(容器里再起 dockerd,即 DinD)。默认的 builder 镜像已经预配好能在这种环境下工作;只有当你想在 admin 配置中替换它时,才需要看这一节。 + +builder 里的 `docker build` 默认走 BuildKit(Docker 23+)。"容器套容器"布局下 BuildKit 对镜像有两个要求: + +1. **dockerd 的 data 目录所在文件系统不能是 overlay。** BuildKit 在 `<data-root>/buildkit/` 下挂 overlay;若 `data-root` 本身在 sandbox 的 overlay rootfs 上,会触发 overlay-on-overlay → `invalid argument`。 +2. 
**镜像里不能有残留的 dockerd pidfile。** `/var/run/docker.pid` 或 `/run/docker/containerd/containerd.pid` 若被烤进镜像,新 dockerd 启动会被旧 PID 挡住,报 `process with PID N is still running`。 + +默认 builder 镜像通过以下方式同时满足这两条: + +- `/etc/docker/daemon.json` 把 `data-root` 设为 `/data/logs/docker`。ROCK 给每个 sandbox 都会把宿主机一块 XFS 卷 bind-mount 到这里(本意是给日志配额用),所以 dockerd 数据落在 XFS 上,而非 overlay rootfs。 +- `"features": {"containerd-snapshotter": false}` 让 BuildKit 走 dockerd 的旧 graph driver,而非独立的 containerd-overlayfs snapshotter。 +- 镜像 build 时清掉残留 pidfile。 + +如果你自己 build builder 镜像,照搬这套配置即可;或者直接用默认 builder 完全不踩这个雷。 + +## 说明 + +- Builder sandbox 是短生命的:构建时按需起,构建完销毁。缓存在 registry 里,不在 builder 里。 +- 同一 `Image` 第二次构建走 `docker manifest inspect` + content-hash label 校验,秒返回。 +- 预构建镜像(不需要构建)的场景,直接把镜像 tag 字符串传给 `SandboxConfig.image` 即可——`Image` 仅在你需要 SDK 帮你跑 `docker build` + `docker push` 时使用。 diff --git a/docs/versioned_docs/version-1.8.x/User Guides/image-from-dockerfile.md b/docs/versioned_docs/version-1.8.x/User Guides/image-from-dockerfile.md new file mode 100644 index 0000000000..cf7cac9c99 --- /dev/null +++ b/docs/versioned_docs/version-1.8.x/User Guides/image-from-dockerfile.md @@ -0,0 +1,131 @@ +--- +sidebar_position: 5 +--- + +# Start Sandbox from Dockerfile + +ROCK SDK accepts not only a pre-built image tag for `SandboxConfig.image`, but also an `Image` declaration. With `Image.from_dockerfile(path)`, the SDK transparently builds and pushes the image inside a builder sandbox, then starts your sandbox from it — no need to run `docker build` / `docker push` yourself. 
+ +## Quick Start + +```python +from rock.sdk.sandbox.client import Sandbox +from rock.sdk.sandbox.config import SandboxConfig +from rock.sdk.sandbox.image import Image, ImageRegistry + +image = Image.from_dockerfile( + "/path/to/env_dir", # dir containing a Dockerfile, OR a Dockerfile file path + registry=ImageRegistry( + url="reg.example.com", + namespace="my-team", + repository="my-env", + username="...", + password="...", + ), +) + +sandbox = Sandbox(SandboxConfig(image=image, memory="2g", cpus=1.0)) +await sandbox.start() +``` + +On `start()` the SDK will: + +1. Hash the build context (`SHA-256` over all files in `env_dir`). +2. Check the registry for an existing image with the same hash; if found, skip build + push. +3. Otherwise launch a builder sandbox, run `docker build` and `docker push` inside it. +4. Start your sandbox from the resulting image tag. + +Subsequent runs with the same `env_dir` content hit the cache instantly. + +## Image Naming + +The final image tag is composed of four parts: + +``` +{registry.url}/{registry.namespace}/{registry.repository}:{content_hash} +``` + +| Segment | Source | +|---|---| +| `registry.url` | field on `ImageRegistry`, or fetched from admin `image` config | +| `registry.namespace` | field on `ImageRegistry`, or fetched from admin `image` config | +| `registry.repository` | field on `ImageRegistry`, or `SandboxConfig.user_id` (fallback `"default"`) | +| `content_hash` | always a 64-character SHA-256 of the build context; user cannot override | + +Using the content hash as the tag means any change to your Dockerfile or build-context files automatically produces a new tag, so cache hits and rebuilds are deterministic. 
+ +## API Reference + +```python +class ImageRegistry(BaseModel): + url: str | None = None + namespace: str | None = None + repository: str | None = None + username: str | None = None + password: str | None = None + + +Image.from_dockerfile( + path: str | Path, + *, + registry: ImageRegistry | None = None, + force_build: bool = False, + build_args: dict[str, str] | None = None, + builder_config: BuilderConfig | None = None, +) +``` + +| Parameter | Purpose | +|---|---| +| `path` | Either (a) a local directory containing a `Dockerfile` and any files it `COPY`s, or (b) a path to a single `Dockerfile` file. In file mode the surrounding directory is ignored — only the Dockerfile is the build context, so it must be self-contained (no `COPY` from local files). | +| `registry` | `ImageRegistry` POJO with the push target and credentials. Any unset field is populated from the admin `image` config at `Sandbox.start()`; `registry.repository` falls back to `SandboxConfig.user_id`. Registry credentials (username/password) are obtained automatically via temporary ACR tokens from the admin service. | +| `force_build` | Skip the cache check and always rebuild. | +| `build_args` | Passed through to `docker build --build-arg KEY=VAL`. | +| `builder_config` | `BuilderConfig` (a subclass of `SandboxConfig`) for the builder sandbox itself — gives you control over its image, memory, cpus, timeouts, etc. `BuilderConfig` narrows `image` to `str` (enforced by pydantic) and defaults to admin-configured builder image + builder-appropriate timeouts. When omitted, a `BuilderConfig` is derived from your sandbox's `SandboxConfig`. | + +When `builder_config` is omitted, the builder sandbox inherits the inheritable fields (`base_url`, `cluster`, `extra_headers`, etc.) from your `SandboxConfig`; `image` / `startup_timeout` / `auto_clear_seconds` fall back to `BuilderConfig` defaults. 
+ +## Configuration + +Image registry and builder defaults are managed centrally in the admin YAML config (`rock-conf/rock-*.yml`). SDK clients fetch them automatically from the admin `/acr_config` endpoint at `Sandbox.start()` time — no per-client configuration needed. + +```yaml +# rock-dev.yml +image: + registry: + url: "reg.example.com" + namespace: "my-team" + instance_id: "cri-xxxxxx" # ACR enterprise instance ID + region: "cn-hangzhou" + access_key_id: "..." # admin-side only, never exposed to SDK + access_key_secret: "..." + builder: + image: "rock-n-roll-registry.cn-hangzhou.cr.aliyuncs.com/rock/rock-env-builder:latest" + startup_timeout: 600 + auto_clear_seconds: 1800 +``` + +Registry credentials are issued as temporary ACR tokens (15-minute TTL) by the admin service. SDK clients never hold long-lived credentials. + +## Custom Builder Image + +The build runs inside a short-lived builder sandbox (a container running its own dockerd — i.e. DinD). The default builder image is pre-configured to work in this environment; you only need to read this section if you override it in the admin config. + +Inside the builder, `docker build` uses BuildKit by default (Docker 23+). In a container-on-container layout this places two requirements on the builder image: + +1. **dockerd's data directory must live on a non-overlay filesystem.** BuildKit mounts overlay under `<data-root>/buildkit/`; if `data-root` itself sits on the sandbox's overlay rootfs, the mount fails with `invalid argument` (overlay-on-overlay). +2. **No stale dockerd pidfiles in the image.** If `/var/run/docker.pid` or `/run/docker/containerd/containerd.pid` is baked into the image, dockerd refuses to start with `process with PID N is still running`. + +The default builder image satisfies both by: + +- Setting `data-root` to `/data/logs/docker` in `/etc/docker/daemon.json`. ROCK bind-mounts an XFS volume there for every sandbox (originally for log quotas), so dockerd data lands on XFS, not on the overlay rootfs. 
+- Setting `"features": {"containerd-snapshotter": false}` so BuildKit uses dockerd's classic graph driver instead of the independent containerd-overlayfs snapshotter. +- Removing baked-in pidfiles at image build time. + +If you build your own builder image, replicate the same configuration — or use the default and avoid the issue entirely. + +## Notes + +- The builder sandbox is short-lived: created on demand for the build, destroyed afterwards. Cache is in the registry, not in the builder. +- A second build of the same `Image` returns instantly via `docker manifest inspect` + content-hash label check. +- For pre-built images (no build needed), just pass the tag string directly to `SandboxConfig.image` — `Image` is only used when you want SDK-driven `docker build` + `docker push`. diff --git a/rock/env_vars.py b/rock/env_vars.py index 11a9830613..29562ba33d 100644 --- a/rock/env_vars.py +++ b/rock/env_vars.py @@ -74,6 +74,7 @@ ROCK_MODEL_SERVICE_INSTALL_CMD: str + environment_variables: dict[str, Callable[[], Any]] = { "ROCK_LOGGING_PATH": lambda: os.getenv("ROCK_LOGGING_PATH"), "ROCK_LOGGING_FILE_NAME": lambda: os.getenv("ROCK_LOGGING_FILE_NAME", "rocklet.log"), diff --git a/rock/sdk/sandbox/client.py b/rock/sdk/sandbox/client.py index 23290da9cd..a5f19b3419 100644 --- a/rock/sdk/sandbox/client.py +++ b/rock/sdk/sandbox/client.py @@ -48,6 +48,7 @@ from rock.sdk.sandbox.config import SandboxConfig, SandboxGroupConfig from rock.sdk.sandbox.deploy import Deploy from rock.sdk.sandbox.file_system import FileSystem, LinuxFileSystem +from rock.sdk.sandbox.image import Image, ImageBuilder from rock.sdk.sandbox.model_service.base import ModelService from rock.sdk.sandbox.network import Network from rock.sdk.sandbox.oss_client import OssClient @@ -160,7 +161,55 @@ async def _parse_error_message_from_status(self, status: dict): # If no failed stage is found, return None return None + async def _fetch_acr_config(self) -> dict | None: + """Fetch ACR registry config + 
temporary credentials from admin.""" + url = f"{self._url}/acr_config" + headers = self._build_headers() + try: + response = await HttpUtils.get(url, headers) + except Exception: + logger.warning("Failed to fetch ACR config from admin", exc_info=True) + return None + if "Success" != response.get("status"): + logger.warning(f"acr_config returned non-success: {response}") + return None + return response.get("result") + + async def _resolve_image(self) -> None: + """If config.image is an Image declaration, build it via ImageBuilder + and replace with the resulting tag string. Also lifts registry creds + onto config so admin can pull the built image. + """ + if not isinstance(self.config.image, Image): + return + image_obj = self.config.image + + image_cfg = await self._fetch_acr_config() + if image_cfg: + registry = image_obj.registry + if not registry.url: + registry.url = image_cfg.get("Registry") + if not registry.namespace: + registry.namespace = image_cfg.get("Namespace") + registry.username = image_cfg.get("Username") + registry.password = image_cfg.get("Password") + + bc = image_obj.builder_config + if not bc.image: + bc.image = image_cfg.get("BuilderImage") + + if image_obj.registry.repository is None: + image_obj.registry.repository = self.config.user_id or "default" + image_obj.builder_config = image_obj.builder_config.inherit_from(self.config) + builder = ImageBuilder(builder_config=image_obj.builder_config) + self.config.image = await builder.build(image_obj.to_build_spec()) + if image_obj.registry.username and not self.config.registry_username: + self.config.registry_username = image_obj.registry.username + self.config.registry_password = image_obj.registry.password + async def start(self): + await self._resolve_image() + url = f"{self._url}/start_async" headers = self._build_headers() data = { @@ -927,11 +976,15 @@ async def close(self) -> CloseResponse: def __str__(self): """Return user-friendly string representation with key attributes.""" + 
image_display = self.config.image + if isinstance(image_display, Image): + image_display = f"Image(dockerfile={image_display.dockerfile_path})" + return ( f"Sandbox(sandbox_id={self._sandbox_id}, " f"host_name={self._host_name!r}, " f"host_ip={self._host_ip}, " - f"image={self.config.image}, " + f"image={image_display}, " f"cluster={self._cluster})" ) diff --git a/rock/sdk/sandbox/config.py b/rock/sdk/sandbox/config.py index 6ca3fa1c01..bb81ecca56 100644 --- a/rock/sdk/sandbox/config.py +++ b/rock/sdk/sandbox/config.py @@ -1,9 +1,15 @@ +from __future__ import annotations + import warnings +from typing import TYPE_CHECKING from pydantic import BaseModel, Field, field_validator from rock import env_vars +if TYPE_CHECKING: + from rock.sdk.sandbox.image import Image + class BaseConfig(BaseModel): base_url: str = env_vars.ROCK_BASE_URL @@ -28,7 +34,7 @@ def validate_xrl_authorization(cls, v): class SandboxConfig(BaseConfig): - image: str = "python:3.11" + image: str | Image = "python:3.11" image_os: str = "linux" auto_clear_seconds: int = 60 * 5 route_key: str | None = None @@ -48,6 +54,22 @@ class SandboxConfig(BaseConfig): sandbox_id: str | None = None auto_delete_seconds: int | None = None + @field_validator("image", mode="before") + @classmethod + def _coerce_image(cls, v): + # Lazy import to break the circular import between sandbox/config.py + # and the rock.sdk.sandbox.image package. + from rock.sdk.sandbox.image import Image + + if isinstance(v, str | Image): + return v + if isinstance(v, dict): + try: + return Image(**v) + except Exception: + pass + return v + @field_validator("auto_delete_seconds") @classmethod def validate_auto_delete_seconds(cls, v): diff --git a/rock/sdk/sandbox/constants.py b/rock/sdk/sandbox/constants.py index 7d1a9216b3..6509c68c69 100644 --- a/rock/sdk/sandbox/constants.py +++ b/rock/sdk/sandbox/constants.py @@ -53,3 +53,92 @@ # Verify ossutil version """ + + +# Start dockerd inside a builder sandbox (DinD). 
# Idempotent: if dockerd is already running, the script only waits for it to
# become responsive. Prints DOCKERD_OK on success. Every failure path prints a
# DOCKERD_FAIL line and exits non-zero -- including an image that ships no
# dockerd binary, which previously exited 0 without printing any marker.
DOCKERD_SCRIPT = r"""#!/bin/bash
set -e
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"

if ! command -v dockerd &>/dev/null; then
    echo "DOCKERD_FAIL: dockerd not found in PATH"
    exit 1
fi

if ! pgrep -x dockerd &>/dev/null; then
    echo "Starting dockerd..."
    nohup dockerd &>/var/log/dockerd.log &
fi

for i in $(seq 1 60); do
    if docker info &>/dev/null; then
        echo "DOCKERD_OK"
        exit 0
    fi
    sleep 1
done

echo "DOCKERD_FAIL"
tail -n 50 /var/log/dockerd.log 2>/dev/null || true
exit 1
"""


# `docker build` inside the builder sandbox. Format placeholders:
#   image_name, content_hash, registry, registry_username, registry_password,
#   force_build, build_arg_flags, context_path.
#
# Logs in to the registry (so a private-registry manifest probe works), then
# runs a builder-side cache check via `docker manifest inspect`. This is a
# second layer on top of the SDK-side registry preflight in image_builder.py:
# the SDK can't always reach the registry (e.g. user laptop outside the VPC
# where the registry lives), so we re-check from the builder's network. When
# the image already exists we emit CACHE_HIT and skip the actual build.
#
# The password is piped with `printf '%s'` rather than `echo` so it reaches
# `docker login` verbatim, whatever characters it starts with or contains.
BUILD_SCRIPT_TEMPLATE = r"""#!/bin/bash
set -e

IMAGE_NAME={image_name}
CONTENT_HASH={content_hash}
REGISTRY={registry}
REG_USER={registry_username}
REG_PASS={registry_password}
FORCE_BUILD={force_build}

# ── Registry login (so manifest inspect works on private registries) ──
if [ -n "$REG_USER" ] && [ -n "$REG_PASS" ]; then
    printf '%s' "$REG_PASS" | docker login "$REGISTRY" -u "$REG_USER" --password-stdin
fi

# ── Cache check from builder's network ──
if [ "$FORCE_BUILD" != "true" ]; then
    if docker manifest inspect "$IMAGE_NAME" > /dev/null 2>&1; then
        echo "CACHE_HIT"
        echo "BUILD_OK"
        exit 0
    fi
fi

# ── Build ──
echo "Building image $IMAGE_NAME..."
docker build {build_arg_flags} --label rock.content_hash="$CONTENT_HASH" -t "$IMAGE_NAME" {context_path}
echo "BUILD_OK"
"""


# `docker login` + `docker push` inside the builder sandbox. Format
# placeholders: image_name, registry, registry_username, registry_password.
# As in the build script, the password is piped with `printf '%s'` so it is
# passed to `docker login` verbatim.
PUSH_SCRIPT_TEMPLATE = r"""#!/bin/bash
set -e

IMAGE_NAME={image_name}
REGISTRY={registry}
REG_USER={registry_username}
REG_PASS={registry_password}

# ── Registry login ──
if [ -n "$REG_USER" ] && [ -n "$REG_PASS" ]; then
    printf '%s' "$REG_PASS" | docker login "$REGISTRY" -u "$REG_USER" --password-stdin
else
    echo "No registry credentials, skipping login"
fi

# ── Docker push ──
echo "Pushing image $IMAGE_NAME..."
docker push "$IMAGE_NAME"
echo "PUSH_OK"
"""
class ImageRegistry(BaseModel):
    """Registry push target + credentials.

    Fields are populated from the admin ``/acr_config`` endpoint at
    ``Sandbox.start()`` time for any values the caller did not set explicitly.
    ``repository`` falls back to ``SandboxConfig.user_id``.
    """

    url: str | None = None
    namespace: str | None = None
    repository: str | None = None
    username: str | None = None
    # Kept out of repr()/str() so the credential is not printed whenever the
    # model (or an object holding it) is logged; the stored value and its
    # serialization are unchanged.
    password: str | None = Field(default=None, repr=False)
class Image(BaseModel):
    """Image declaration. Construct via `Image.from_dockerfile()`; for a
    pre-built image, pass the tag string directly to `SandboxConfig.image`.
    """

    dockerfile_path: str | None = None
    registry: ImageRegistry = Field(default_factory=ImageRegistry)
    force_build: bool = False
    build_args: dict[str, str] = Field(default_factory=dict)
    # Sandbox.start() fills networking fields here from the user's SandboxConfig
    # when they aren't explicitly set, so the builder hits the same admin/cluster.
    builder_config: BuilderConfig = Field(default_factory=BuilderConfig)

    # Cache for `full_name`; filled on first successful access.
    _full_name: str | None = PrivateAttr(default=None)

    @staticmethod
    def from_dockerfile(
        path: str | Path,
        *,
        registry: ImageRegistry | None = None,
        force_build: bool = False,
        build_args: dict[str, str] | None = None,
        builder_config: BuilderConfig | None = None,
    ) -> Image:
        """Create from a local Dockerfile.

        `path` is either a build-context directory (must contain `Dockerfile`)
        or a path to a single self-contained Dockerfile file (in file mode,
        sibling files are NOT part of the context).

        Resulting image tag: `{registry.url}/{registry.namespace}/{registry.repository}:{sha256}`.
        Unset registry/builder fields are populated from the admin ``/acr_config``
        endpoint at ``Sandbox.start()`` time; ``repository`` falls back to
        ``SandboxConfig.user_id``.
        """
        return Image(
            dockerfile_path=str(Path(path).resolve()),
            registry=registry or ImageRegistry(),
            force_build=force_build,
            build_args=build_args or {},
            builder_config=builder_config or BuilderConfig(),
        )

    @model_validator(mode="after")
    def _validate(self) -> Image:
        """Reject a missing path, a directory without `Dockerfile`, or a path
        that is neither a regular file nor a directory.
        """
        if self.dockerfile_path is None:
            raise ValueError("Image must have 'dockerfile_path'")
        p = Path(self.dockerfile_path)
        if p.is_dir():
            if not (p / "Dockerfile").exists():
                raise ValueError(f"No Dockerfile found in: {self.dockerfile_path}")
        elif not p.is_file():
            raise ValueError(f"dockerfile_path is neither a file nor a directory: {self.dockerfile_path}")
        return self

    def content_hash(self) -> str:
        """SHA-256 (64 hex) of the build context.

        Directory mode: walks all files in sorted order, feeding each file's
        context-relative path and bytes into the digest; anything under a
        `.git` entry is skipped. File mode: hashes only the Dockerfile file
        itself (under the fixed name ``Dockerfile``).
        """
        digest = hashlib.sha256()
        root = Path(self.dockerfile_path)
        if root.is_file():
            digest.update(b"Dockerfile")
            digest.update(root.read_bytes())
            return digest.hexdigest()

        for entry in sorted(root.rglob("*")):
            rel = entry.relative_to(root)
            # Test `.git` against the path *inside* the context only: the
            # absolute location of the context must not decide what is hashed.
            if ".git" in rel.parts or not entry.is_file():
                continue
            # POSIX separators keep the digest (and therefore the image tag)
            # identical no matter which OS computed it.
            digest.update(rel.as_posix().encode())
            digest.update(entry.read_bytes())
        return digest.hexdigest()

    @property
    def full_name(self) -> str:
        """`{registry.url}/{registry.namespace}/{registry.repository}:{tag}`,
        cached on first access. Raises ValueError if any segment is unresolved.
        """
        if self._full_name is None:
            r = self.registry
            missing = [
                label
                for label, value in (
                    ("registry.url", r.url),
                    ("registry.namespace", r.namespace),
                    ("registry.repository", r.repository),
                )
                if not value
            ]
            if missing:
                raise ValueError(f"Cannot resolve image name, missing: {missing}")
            self._full_name = f"{r.url.rstrip('/')}/{r.namespace}/{r.repository}:{self.content_hash()}"
        return self._full_name

    def to_build_spec(self) -> BuildSpec:
        """Project this Image into the BuildSpec consumed by ImageBuilder."""
        image = self.full_name
        # The tag of `full_name` *is* the content hash (a hex string, so it
        # contains no ':'). Reusing it avoids reading the whole context a
        # second time and guarantees the hash label always equals the tag.
        content_hash = image.rsplit(":", 1)[1]
        return BuildSpec(
            image=image,
            content_hash=content_hash,
            dockerfile_path=self.dockerfile_path,
            build_args=self.build_args,
            registry_username=self.registry.username,
            registry_password=self.registry.password,
            force_build=self.force_build,
        )
class ImageBuilder:
    """Drive the DinD build + push for a BuildSpec inside a builder sandbox.

    Pure consumer of `BuildSpec` and `BuilderConfig`; does not depend on Image.
    """

    BUILD_SESSION = "build"

    # Manifest media types offered in the SDK-side preflight HEAD. Offering the
    # manifest-list and OCI types as well as Docker schema2 matters because a
    # registry may answer "not found" for a manifest stored in a format the
    # client did not list, which would turn an existing image into a cache miss.
    _MANIFEST_ACCEPT = ", ".join(
        (
            "application/vnd.docker.distribution.manifest.v2+json",
            "application/vnd.docker.distribution.manifest.list.v2+json",
            "application/vnd.oci.image.manifest.v1+json",
            "application/vnd.oci.image.index.v1+json",
        )
    )

    def __init__(
        self,
        *,
        builder_config: BuilderConfig,
        builder: Sandbox | None = None,
    ):
        """Supplying `builder` skips builder-lifecycle management (caller owns
        start/stop) — used by tests that need to customise the builder
        environment (e.g. iptables NAT injection) before the build runs.
        """
        self._builder_config = builder_config
        self._builder = builder

    def create_builder(self) -> Sandbox:
        """Construct (but do not start) the builder sandbox from `builder_config`."""
        # Lazy import to avoid client → image → client cycle.
        from rock.sdk.sandbox.client import Sandbox

        return Sandbox(self._builder_config)

    async def build(self, spec: BuildSpec) -> str:
        """Build the image described by `spec` and return its full tag.

        Does a registry preflight first; on hit, skips builder creation entirely.
        Without a caller-supplied builder, one is created, started, and always
        stopped afterwards (a failing stop is logged, not raised).
        """
        if await self._image_exists_in_registry(spec):
            logger.info("Image %s already exists in registry, skipping build", spec.image)
            return spec.image

        if self._builder is not None:
            return await self.build_with_builder(spec, self._builder)

        builder = self.create_builder()
        try:
            await builder.start()
            return await self.build_with_builder(spec, builder)
        finally:
            try:
                await builder.stop()
            except Exception:
                logger.warning("Failed to stop builder sandbox: %s", builder.sandbox_id, exc_info=True)

    async def _image_exists_in_registry(self, spec: BuildSpec) -> bool:
        """Fast-path HEAD on the registry manifest. Returns True only on 200.

        Any non-200 / network error returns False so the caller proceeds with a
        full build (safe default). No bearer-token challenge dance — registries
        that require it (e.g. Docker Hub) fall through to the in-builder cache
        check; private registries (ACR / Harbor) using Basic Auth work directly.
        """
        if spec.force_build:
            return False
        try:
            repo_with_registry, tag = spec.image.rsplit(":", 1)
            registry, repo = repo_with_registry.split("/", 1)
        except ValueError:
            # No ':tag' or no '/': not a name we can probe.
            return False

        host = registry.split(":", 1)[0]
        scheme = "http" if host in ("localhost", "127.0.0.1") else "https"
        url = f"{scheme}://{registry}/v2/{repo}/manifests/{tag}"

        auth = None
        if spec.registry_username and spec.registry_password:
            auth = (spec.registry_username, spec.registry_password)

        # NOTE(review): verify=False disables TLS certificate checks while Basic
        # Auth credentials may be attached; kept as-is for self-signed
        # registries — confirm this trade-off is intended.
        status = await HttpUtils.head(
            url,
            headers={"Accept": self._MANIFEST_ACCEPT},
            auth=auth,
            timeout=5.0,
            verify=False,
        )
        return status == 200

    async def build_with_builder(self, spec: BuildSpec, builder: Sandbox) -> str:
        """Run dockerd → build → push against an already-started builder. Internal
        helper called by build(); skips registry preflight (build() did it).
        """
        session = self.BUILD_SESSION
        await builder.create_session(CreateBashSessionRequest(session=session))

        await self._run_script(builder, session, DOCKERD_SCRIPT, "/tmp/rock_dockerd.sh", "DOCKERD_OK", 120)

        # The build script does its own registry preflight from the builder's
        # network — covers the SDK-can't-reach-but-builder-can scenario (VPC
        # registry seen from a user's laptop).
        context_path = await self._upload_context(builder, session, spec)
        build_script = self._gen_build_script(spec, context_path)
        build_output = await self._run_script(builder, session, build_script, "/tmp/rock_build.sh", "BUILD_OK", 1800)
        if "CACHE_HIT" in build_output:
            logger.info("Image %s already exists (builder-side check), skipping push", spec.image)
            return spec.image

        push_script = self._gen_push_script(spec)
        await self._run_script(builder, session, push_script, "/tmp/rock_push.sh", "PUSH_OK", 600)

        logger.info("Successfully built and pushed image %s", spec.image)
        return spec.image

    async def _run_script(
        self, builder, session: str, script: str, remote_path: str, success_marker: str, timeout: int
    ) -> str:
        """Write `script` to `remote_path` on the builder, run it with bash, and
        return its output.

        Raises RuntimeError when the exit code is non-zero or `success_marker`
        is absent from the output.
        """
        await builder.write_file_by_path(script, remote_path)
        obs = await builder.arun(cmd=f"bash {remote_path}", session=session, wait_timeout=timeout, mode="nohup")
        output = obs.output or ""
        if obs.exit_code != 0 or success_marker not in output:
            raise RuntimeError(f"Script {remote_path} failed (exit_code={obs.exit_code}): {output}")
        return output

    def _gen_build_script(self, spec: BuildSpec, context_path: str) -> str:
        """Render BUILD_SCRIPT_TEMPLATE for `spec`; every value is shell-quoted."""
        build_arg_flags = " ".join(f"--build-arg {shlex.quote(f'{k}={v}')}" for k, v in spec.build_args.items())
        registry, _ = ImageUtil.parse_registry_and_others(spec.image)
        return BUILD_SCRIPT_TEMPLATE.format(
            image_name=shlex.quote(spec.image),
            content_hash=shlex.quote(spec.content_hash),
            registry=shlex.quote(registry or "docker.io"),
            registry_username=shlex.quote(spec.registry_username or ""),
            registry_password=shlex.quote(spec.registry_password or ""),
            force_build="true" if spec.force_build else "false",
            build_arg_flags=build_arg_flags,
            context_path=shlex.quote(context_path),
        )

    def _gen_push_script(self, spec: BuildSpec) -> str:
        """Render PUSH_SCRIPT_TEMPLATE for `spec`; every value is shell-quoted."""
        registry, _ = ImageUtil.parse_registry_and_others(spec.image)
        return PUSH_SCRIPT_TEMPLATE.format(
            image_name=shlex.quote(spec.image),
            registry=shlex.quote(registry or "docker.io"),
            registry_username=shlex.quote(spec.registry_username or ""),
            registry_password=shlex.quote(spec.registry_password or ""),
        )

    async def _upload_context(self, builder, session: str, spec: BuildSpec) -> str:
        """Pack the build context, upload it to the builder, unpack it there,
        and return the remote context directory.

        File mode packs the single file as ``Dockerfile``; directory mode packs
        the whole tree except entries named ``.git``.

        Raises RuntimeError if the upload or the remote unpack fails.
        """
        remote_tar = "/tmp/rock_env_dir.tar.gz"
        remote_ctx = "/tmp/rock_env_dir_ctx"

        def _drop_git(ti):
            # With arcname="." member names look like "./.git" or "./sub/.git",
            # so compare the last path component, not the whole name. Returning
            # None for a directory also stops tarfile from descending into it.
            return None if Path(ti.name).name == ".git" else ti

        src = Path(spec.dockerfile_path)
        buf = io.BytesIO()
        with tarfile.open(fileobj=buf, mode="w:gz") as tar:
            if src.is_file():
                # File mode: the single file IS the build context, packed as ./Dockerfile
                tar.add(src, arcname="Dockerfile")
            else:
                tar.add(src, arcname=".", filter=_drop_git)
        tar_bytes = buf.getvalue()

        with tempfile.NamedTemporaryFile(prefix="rock_env_dir_", suffix=".tar.gz", delete=False) as f:
            f.write(tar_bytes)
            local_tar_path = f.name
        try:
            upload_resp = await builder.upload_by_path(file_path=local_tar_path, target_path=remote_tar)
            if not upload_resp.success:
                raise RuntimeError(f"Failed to upload build context: {upload_resp.message}")
        finally:
            try:
                os.remove(local_tar_path)
            except OSError:
                pass

        # Start from an empty directory: a reused builder may still hold files
        # from an earlier context, which would leak into this build.
        commands = (
            f"rm -rf {remote_ctx} && mkdir -p {remote_ctx}",
            f"tar -xzf {remote_tar} -C {remote_ctx}",
        )
        for cmd in commands:
            obs = await builder.arun(cmd=cmd, session=session)
            exit_code = getattr(obs, "exit_code", None)
            # A missing exit code is tolerated; only a reported failure aborts.
            if exit_code not in (0, None):
                raise RuntimeError(
                    f"Failed to prepare build context ({cmd!r}, exit_code={exit_code}): "
                    f"{getattr(obs, 'output', '') or ''}"
                )
        return remote_ctx
+ Returns 0 on network/timeout/transport errors so callers can treat any + non-200 uniformly as "doesn't exist / can't verify". + + verify=False is supported for plain-http or self-signed registries. + """ + verify_arg = HttpUtils._SHARED_SSL_CONTEXT if verify else False + try: + async with httpx.AsyncClient(verify=verify_arg, timeout=timeout) as client: + response: Response = await client.head(url, headers=headers, auth=auth, follow_redirects=True) + return response.status_code + except Exception as e: + logging.debug(f"HEAD {url} failed: {e}") + return 0 + @staticmethod async def post_multipart( url: str, diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index b39a9a385d..fd6ea1ad4c 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -237,7 +237,6 @@ async def local_registry(): """Start a local Docker registry with basic auth""" auth_dir = Path(tempfile.mkdtemp()) htpasswd_file = auth_dir / "htpasswd" - container_name = "test-registry" # 1. Generate htpasswd file result = subprocess.run( @@ -248,11 +247,11 @@ async def local_registry(): ) htpasswd_file.write_text(result.stdout) - # 2. Remove any leftover container from a previous failed run + # 2. Per-port container name so parallel pytest workers don't collide. + port = run_until_complete(find_free_port()) + container_name = f"test-registry-{port}" subprocess.run(["docker", "rm", "-f", container_name], capture_output=True) - # 3. Start registry and mount the htpasswd file into the container - port = run_until_complete(find_free_port()) subprocess.run( [ "docker", @@ -275,8 +274,13 @@ async def local_registry(): check=True, ) - # 4. Wait for registry to be ready - registry_url = f"localhost:{port}" + # 4. Wait for registry to be ready. Use `127.0.0.1` (not `localhost`) so dockerd + # never resolves to the IPv6 loopback `::1` — on dual-stack hosts glibc may prefer + # IPv6 first, but our NAT injection only covers IPv4 (see test_image_build.py). 
@pytest.fixture(scope="module", autouse=True)
def _cleanup_builder_image_on_ci():
    """Module teardown that force-removes the builder image on CI.

    Nothing happens before the tests run. Afterwards, and only when the `CI`
    environment variable is set and a builder image name is available, the
    image is removed with `docker rmi -f`; the outcome is logged either way
    and never raised. Local runs skip this to keep dev iterations fast.
    """
    yield
    # NOTE(review): AcrConfig() is built with its defaults here, and the
    # dataclass default for builder_image is "" — confirm a configured value
    # actually reaches this point, otherwise the removal below never runs.
    image_ref = AcrConfig().builder_image if os.getenv("CI") else ""
    if not image_ref:
        return
    proc = subprocess.run(
        ["docker", "rmi", "-f", image_ref],
        capture_output=True,
        text=True,
        check=False,
    )
    if proc.returncode != 0:
        logger.warning("Failed to remove builder image %s: %s", image_ref, proc.stderr.strip())
        return
    logger.info("Removed builder image %s", image_ref)
+ """ + base_url = f"{admin_remote_server.endpoint}:{admin_remote_server.port}" + kwargs = dict(image=image, memory="2g", cpus=1.0, startup_timeout=600, base_url=base_url) + if registry_info: + kwargs["registry_username"] = registry_info["registry_username"] + kwargs["registry_password"] = registry_info["registry_password"] + return SandboxConfig(**kwargs) + + +@asynccontextmanager +async def _run_sandbox(config): + """Start a sandbox with default session, yield it, always stop on exit.""" + sandbox = Sandbox(config) + try: + await sandbox.start() + await sandbox.create_session(CreateBashSessionRequest(session="default")) + yield sandbox + finally: + try: + await sandbox.stop() + except Exception as e: + logger.warning("Failed to stop sandbox: %s", e) + + +async def _assert_file_content(sandbox, expected): + result = await sandbox.arun(cmd="cat /opt/hello.txt", session="default") + assert result.output is not None + assert result.output.strip() == expected + + +# ── Fixtures / helpers ── + + +async def _inject_loopback_nat(builder, port: int) -> None: + """NAT 127.0.0.1:port → builder.host_ip:port inside the builder. + + The local_registry fixture serves on the host's loopback (`localhost:port`, i.e. + 127.0.0.1:port). That address falls in 127.0.0.0/8 which dockerd trusts as insecure + by default, but from inside the builder (its own netns) 127.0.0.1 is the builder's + own loopback with no listener. Three things make the loopback URL actually reach + the host's docker-proxy: + 1. enable route_localnet (kernel default forbids routing 127.x off lo) + 2. OUTPUT DNAT 127.0.0.1:port → host_ip:port (rewrite outgoing dst) + 3. 
async def _build_with_loopback_nat(image: Image, admin_remote_server) -> str:
    """Drive the build via the user-facing ImageBuilder.build() entry, but
    pre-create the builder ourselves so we can inject test-only NAT into it
    before build runs.

    The builder is stopped on every exit path, including a failure inside
    `builder.start()` itself (same shape as ImageBuilder.build()).

    Returns the resolved image name (string).
    """
    base_url = f"{admin_remote_server.endpoint}:{admin_remote_server.port}"
    builder_config = BuilderConfig(base_url=base_url, cluster="default", image=AcrConfig().builder_image)
    bootstrap = ImageBuilder(builder_config=builder_config)
    builder = bootstrap.create_builder()
    try:
        # start() lives inside the try so a builder that fails part-way through
        # startup is still handed to stop() instead of being leaked.
        await builder.start()
        await builder.create_session(CreateBashSessionRequest(session=ImageBuilder.BUILD_SESSION))
        registry = image.registry.url or ""
        host_part, _, port_part = registry.partition(":")
        # Only a loopback registry with an explicit port needs the NAT rule.
        if (host_part.startswith("127.") or host_part == "localhost") and port_part:
            await _inject_loopback_nat(builder, int(port_part))
        # Bind the prepared builder to a fresh ImageBuilder and run the
        # user-facing build() so we get its registry preflight on cache hit.
        ib = ImageBuilder(builder_config=builder_config, builder=builder)
        return await ib.build(image.to_build_spec())
    finally:
        try:
            await builder.stop()
        except Exception:
            logger.warning("Failed to stop builder sandbox: %s", builder.sandbox_id, exc_info=True)
@pytest.mark.need_admin
@pytest.mark.asyncio
async def test_from_dockerfile_cache_skip(local_registry_info, admin_remote_server):
    """Second build of the same Image should hit cache (CACHE_HIT) and skip push.

    Builds the same Image twice through `_build_with_loopback_nat`, checks both
    runs resolve to the same image name, and requires the second run to take
    less wall-clock time than the first.
    """
    image = _create_image(TEST_DATA_DIR, local_registry_info)

    # First run: timed with a monotonic clock so clock adjustments can't skew it.
    t0 = time.monotonic()
    resolved = await _build_with_loopback_nat(image, admin_remote_server)
    first_duration = time.monotonic() - t0

    # Second run of the identical Image, timed the same way.
    t0 = time.monotonic()
    resolved2 = await _build_with_loopback_nat(image, admin_remote_server)
    second_duration = time.monotonic() - t0

    assert resolved == resolved2
    logger.info("First build: %.1fs, second build: %.1fs", first_duration, second_duration)
    # NOTE(review): elapsed time is the only cache evidence asserted here; no
    # explicit cache marker is checked. Timing comparisons can be noisy on a
    # loaded host — consider asserting on an explicit signal instead.
    assert second_duration < first_duration
a/tests/integration/test_data/image_from_dockerfile/hello.txt b/tests/integration/test_data/image_from_dockerfile/hello.txt new file mode 100644 index 0000000000..dbb76d57e7 --- /dev/null +++ b/tests/integration/test_data/image_from_dockerfile/hello.txt @@ -0,0 +1 @@ +rock-image-from-dockerfile-ok diff --git a/tests/unit/sdk/sandbox/test_image.py b/tests/unit/sdk/sandbox/test_image.py new file mode 100644 index 0000000000..933bcb4054 --- /dev/null +++ b/tests/unit/sdk/sandbox/test_image.py @@ -0,0 +1,195 @@ +"""Unit tests for Image — covers 4-segment image name composition. + +Run: uv run pytest tests/unit/sdk/sandbox/test_image.py -v +""" + +from __future__ import annotations + +import re +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from rock.sdk.sandbox.client import Sandbox +from rock.sdk.sandbox.config import SandboxConfig +from rock.sdk.sandbox.image import Image, ImageRegistry + + +@pytest.fixture +def env_dir(tmp_path: Path) -> Path: + """Minimal valid build context: a Dockerfile + a marker file.""" + d = tmp_path / "env" + d.mkdir() + (d / "Dockerfile").write_text("FROM python:3.11\nCOPY hello.txt /opt/hello.txt\n") + (d / "hello.txt").write_text("hi\n") + return d + + +def test_resolve_full_name_concatenates_four_segments(env_dir: Path) -> None: + """Happy path: explicit segments concatenated; trailing slash on registry stripped.""" + image = Image.from_dockerfile(env_dir, registry=ImageRegistry(url="reg.io/", namespace="myns", repository="myrepo")) + name = image.full_name + tag = image.content_hash() + assert name == f"reg.io/myns/myrepo:{tag}" + + +def test_resolve_full_name_raises_when_segments_missing(env_dir: Path) -> None: + """Missing segments → ValueError listing exactly which ones.""" + image = Image.from_dockerfile(env_dir) # all defaults are None + with pytest.raises(ValueError) as exc: + image.full_name + msg = str(exc.value) + assert "registry.url" in msg and "registry.namespace" in msg and 
"registry.repository" in msg + + +def test_tag_is_64_hex_sha256(env_dir: Path) -> None: + """Tag pinned to full SHA-256 (OCI digest length), no truncation.""" + image = Image.from_dockerfile(env_dir, registry=ImageRegistry(url="reg.io", namespace="ns", repository="repo")) + tag = image.full_name.rsplit(":", 1)[1] + assert re.fullmatch(r"[0-9a-f]{64}", tag) + + +class _CapturedRepository(Exception): + def __init__(self, repository): + super().__init__(repository) + self.repository = repository + + +@pytest.mark.asyncio +async def test_sandbox_start_injects_user_id_as_repository(env_dir, monkeypatch): + def fake_to_build_spec(self): + raise _CapturedRepository(self.registry.repository) + + monkeypatch.setattr(Image, "to_build_spec", fake_to_build_spec) + + image = Image.from_dockerfile(env_dir, registry=ImageRegistry(url="reg.io", namespace="ns")) + config = SandboxConfig(image=image, user_id="alice", base_url="http://x") + sandbox = Sandbox(config) + + with patch.object(sandbox, "_fetch_acr_config", new_callable=AsyncMock, return_value=None): + with pytest.raises(_CapturedRepository) as excinfo: + await sandbox.start() + assert excinfo.value.repository == "alice" + + +def test_from_dockerfile_accepts_file_path(tmp_path: Path) -> None: + """When `path` points to a Dockerfile file, only that file is the build + context — the surrounding directory is not used.""" + dockerfile = tmp_path / "Dockerfile" + dockerfile.write_text("FROM python:3.11\n") + # Sibling file that must NOT influence the build / content hash. 
+ (tmp_path / "noise.txt").write_text("ignore me\n") + + image = Image.from_dockerfile(dockerfile, registry=ImageRegistry(url="reg.io", namespace="ns", repository="repo")) + name = image.full_name + assert name.startswith("reg.io/ns/repo:") + + +def test_from_dockerfile_file_path_hash_excludes_siblings(tmp_path: Path) -> None: + """File-path hash must depend only on the Dockerfile's content, not on + sibling files that happen to share the parent dir.""" + dockerfile = tmp_path / "Dockerfile" + dockerfile.write_text("FROM python:3.11\n") + + image_no_noise = Image.from_dockerfile(dockerfile, registry=ImageRegistry(url="r", namespace="n", repository="p")) + hash_a = image_no_noise.content_hash() + + (tmp_path / "noise.txt").write_text("ignore me\n") + image_with_noise = Image.from_dockerfile(dockerfile, registry=ImageRegistry(url="r", namespace="n", repository="p")) + hash_b = image_with_noise.content_hash() + + assert hash_a == hash_b, "sibling files must not affect file-mode hash" + + +def test_from_dockerfile_rejects_nonexistent_path(tmp_path: Path) -> None: + """Neither a missing file nor a missing directory should validate.""" + with pytest.raises(ValueError): + Image.from_dockerfile(tmp_path / "does-not-exist") + + +@pytest.mark.asyncio +async def test_sandbox_start_falls_back_to_default_repository(env_dir, monkeypatch): + def fake_to_build_spec(self): + raise _CapturedRepository(self.registry.repository) + + monkeypatch.setattr(Image, "to_build_spec", fake_to_build_spec) + + image = Image.from_dockerfile(env_dir, registry=ImageRegistry(url="reg.io", namespace="ns")) + config = SandboxConfig(image=image, base_url="http://x") # no user_id + sandbox = Sandbox(config) + + with patch.object(sandbox, "_fetch_acr_config", new_callable=AsyncMock, return_value=None): + with pytest.raises(_CapturedRepository) as excinfo: + await sandbox.start() + assert excinfo.value.repository == "default" + + +@pytest.mark.asyncio +async def 
test_resolve_image_fills_from_admin_config(env_dir, monkeypatch): + """_resolve_image() fills registry/builder fields from admin /acr_config.""" + + def fake_to_build_spec(self): + raise _CapturedRepository(self.registry.repository) + + monkeypatch.setattr(Image, "to_build_spec", fake_to_build_spec) + + admin_response = { + "Registry": "admin-reg.io", + "Namespace": "admin-ns", + "Username": "tmp-user", + "Password": "tmp-pass", + "BuilderImage": "admin-builder:latest", + } + + image = Image.from_dockerfile(env_dir) + config = SandboxConfig(image=image, user_id="bob", base_url="http://x") + sandbox = Sandbox(config) + + with patch.object(sandbox, "_fetch_acr_config", new_callable=AsyncMock, return_value=admin_response): + with pytest.raises(_CapturedRepository): + await sandbox.start() + + assert image.registry.url == "admin-reg.io" + assert image.registry.namespace == "admin-ns" + assert image.registry.username == "tmp-user" + assert image.registry.password == "tmp-pass" + assert image.builder_config.image == "admin-builder:latest" + + +@pytest.mark.asyncio +async def test_resolve_image_explicit_overrides_admin(env_dir, monkeypatch): + """Explicitly set registry fields are NOT overwritten by admin config.""" + + def fake_to_build_spec(self): + raise _CapturedRepository(self.registry.repository) + + monkeypatch.setattr(Image, "to_build_spec", fake_to_build_spec) + + admin_response = { + "Registry": "admin-reg.io", + "Namespace": "admin-ns", + "Username": "tmp-user", + "Password": "tmp-pass", + "BuilderImage": "admin-builder:latest", + } + + image = Image.from_dockerfile( + env_dir, + registry=ImageRegistry(url="my-reg.io", namespace="my-ns"), + builder_config=__import__("rock.sdk.sandbox.image.config", fromlist=["BuilderConfig"]).BuilderConfig( + image="my-builder:v1" + ), + ) + config = SandboxConfig(image=image, user_id="bob", base_url="http://x") + sandbox = Sandbox(config) + + with patch.object(sandbox, "_fetch_acr_config", new_callable=AsyncMock, 
return_value=admin_response): + with pytest.raises(_CapturedRepository): + await sandbox.start() + + assert image.registry.url == "my-reg.io" + assert image.registry.namespace == "my-ns" + assert image.builder_config.image == "my-builder:v1" + # Credentials always come from admin (temporary ACR token) + assert image.registry.username == "tmp-user"