feat: add E2B API compatibility layer and Templates API support#275
feat: add E2B API compatibility layer and Templates API support#275MahaoAlex wants to merge 5 commits intovolcano-sh:mainfrom
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces an E2B-compatible API layer to AgentCube, enabling seamless migration from E2B services by supporting sandbox lifecycle management and template operations. It includes new API handlers, models, and authentication mechanisms, along with comprehensive E2E tests. My feedback highlights opportunities to simplify error handling logic in the router, decouple informer initialization for better testability, and refactor the wildcard routing logic in the templates handler to improve maintainability.
| // Return 500 for invalid endpoint URL errors, 404 for not found errors | ||
| status := http.StatusNotFound | ||
| if strings.Contains(err.Error(), "invalid endpoint URL") { | ||
| status = http.StatusInternalServerError | ||
| } | ||
| c.JSON(status, gin.H{ |
| func (a *Authenticator) InitializeInformer() error { | ||
| a.mu.Lock() | ||
| defer a.mu.Unlock() | ||
|
|
||
| if a.k8sClient == nil { | ||
| return fmt.Errorf("kubernetes client is nil, cannot initialize informer") | ||
| } | ||
|
|
||
| if a.informer != nil { | ||
| klog.V(2).Info("Informer already initialized, skipping") | ||
| return nil | ||
| } | ||
|
|
||
| // Create informer factory with namespace restriction | ||
| factory := informers.NewSharedInformerFactoryWithOptions( | ||
| a.k8sClient, | ||
| 10*time.Minute, | ||
| informers.WithNamespace(a.config.APIKeySecretNamespace), | ||
| ) | ||
|
|
||
| // Create secret informer filtered by secret name | ||
| secretInformer := factory.Core().V1().Secrets().Informer() | ||
|
|
||
| // Add event handlers for Secret changes | ||
| _, err := secretInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ | ||
| AddFunc: a.onSecretAdd, | ||
| UpdateFunc: a.onSecretUpdate, | ||
| DeleteFunc: a.onSecretDelete, | ||
| }) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to add event handler to informer: %w", err) | ||
| } | ||
|
|
||
| a.informer = secretInformer | ||
| klog.V(2).InfoS("Informer initialized", "namespace", a.config.APIKeySecretNamespace, "secret", a.config.APIKeySecret) | ||
| return nil | ||
| } |
| func (s *Server) handleTemplateWildcard(c *gin.Context) { | ||
| path := c.Param("path") | ||
| path = strings.TrimPrefix(path, "/") | ||
|
|
||
| // Extract template ID and sub-path | ||
| var templateID string | ||
| var subPath string | ||
|
|
||
| // Check for builds sub-path | ||
| if idx := strings.Index(path, "/builds"); idx != -1 { | ||
| templateID = path[:idx] | ||
| subPath = path[idx:] | ||
| } else { | ||
| templateID = path | ||
| } | ||
|
|
||
| // Set the id parameter for downstream handlers | ||
| c.Params = gin.Params{{Key: "id", Value: templateID}} | ||
|
|
||
| // Route based on sub-path and HTTP method | ||
| switch subPath { | ||
| case "/builds": | ||
| if c.Request.Method == "GET" { | ||
| s.handleListTemplateBuilds(c) | ||
| } else if c.Request.Method == "POST" { | ||
| s.handleBuildTemplate(c) | ||
| } | ||
| case "": | ||
| // Regular template operations | ||
| switch c.Request.Method { | ||
| case "GET": | ||
| s.handleGetTemplate(c) | ||
| case "PATCH": | ||
| s.handleUpdateTemplate(c) | ||
| case "DELETE": | ||
| s.handleDeleteTemplate(c) | ||
| default: | ||
| respondWithError(c, ErrInvalidRequest, "method not allowed") | ||
| } | ||
| default: | ||
| // Check for /builds/{buildId} pattern | ||
| if strings.HasPrefix(subPath, "/builds/") { | ||
| buildID := strings.TrimPrefix(subPath, "/builds/") | ||
| c.Params = gin.Params{ | ||
| {Key: "id", Value: templateID}, | ||
| {Key: "buildId", Value: buildID}, | ||
| } | ||
| s.handleGetTemplateBuild(c) | ||
| } else { | ||
| respondWithError(c, ErrNotFound, "path not found") | ||
| } | ||
| } | ||
| } |
There was a problem hiding this comment.
|
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #275 +/- ##
===========================================
+ Coverage 43.37% 56.00% +12.62%
===========================================
Files 30 44 +14
Lines 2610 4907 +2297
===========================================
+ Hits 1132 2748 +1616
- Misses 1355 1861 +506
- Partials 123 298 +175
Flags with carried forward coverage won't be shown. Click here to find out more. ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
There was a problem hiding this comment.
Pull request overview
This PR adds an E2B-compatible API surface to the AgentCube Router (templates + sandboxes) and augments CI/e2e coverage and docs so E2B SDK users can migrate to AgentCube with minimal changes.
Changes:
- Register E2B-compatible
/templatesand/sandboxesroutes in the Router and add E2B models/helpers (mapper, rate limiter, error mapping). - Add/extend E2E + black-box tests (Go + optional Python E2B SDK) and update scripts/workflows to run them.
- Add Templates API and E2B API documentation, examples, and Helm/RBAC adjustments for deployment.
Reviewed changes
Copilot reviewed 51 out of 53 changed files in this pull request and generated 17 comments.
Show a summary per file
| File | Description |
|---|---|
| test/e2e/test_templates.yaml | Adds CodeInterpreter CRs used as templates for E2E coverage. |
| test/e2e/test_e2b_sdk.py | Adds Python E2B SDK-based compatibility E2E tests. |
| test/e2e/run_e2e.sh | Updates e2e runner to configure E2B API keys and run tagged e2e tests + optional SDK tests. |
| test/e2e/e2e_test.go | Adds e2e build tag and minor constant refactor for CI control. |
| test/e2e/README_E2B_SDK.md | Documents how to run the E2B SDK compatibility tests. |
| test/e2b/lifecycle_test.go | Adds black-box lifecycle tests for the E2B sandbox API. |
| test/e2b/TEST_REPORT.md | Adds test report documentation for E2B API black-box tests. |
| test/e2b/TEST_PLAN.md | Adds test plan documentation for E2B API black-box tests. |
| pkg/workloadmanager/client_cache_test.go | Minor formatting change in constants alignment. |
| pkg/workloadmanager/auth_test.go | Minor formatting/indentation cleanup in tests. |
| pkg/router/server.go | Registers E2B route group at root and wires it into the Router. |
| pkg/router/jwt_test.go | Makes RSA private-key equality assertion resilient across Go versions. |
| pkg/router/handlers_test.go | Switches Router tests to use miniredis instead of a real Redis endpoint. |
| pkg/router/handlers.go | Improves endpoint URL parsing and adjusts error status selection. |
| pkg/router/e2b/templates_models_test.go | Adds unit tests for Templates API models serialization/validation. |
| pkg/router/e2b/templates_models.go | Adds Templates API request/response models and validation helpers. |
| pkg/router/e2b/ratelimiter_test.go | Adds tests for a token-bucket rate limiter used by auth/cache logic. |
| pkg/router/e2b/ratelimiter.go | Implements an in-memory token-bucket rate limiter. |
| pkg/router/e2b/models.go | Adds E2B sandbox API request/response models and error codes. |
| pkg/router/e2b/mapper_test.go | Adds tests for mapping between AgentCube SandboxInfo and E2B models. |
| pkg/router/e2b/mapper.go | Implements mapping logic and template-id validation/expiry helpers. |
| pkg/router/e2b/handlers.go | Implements E2B sandbox endpoints: create/list/get/delete/timeout/refresh. |
| pkg/router/e2b/errors_test.go | Adds unit tests for E2B error mapping + Gin error responses. |
| pkg/router/e2b/errors.go | Implements E2B error formatting and error-to-code mapping. |
| pkg/router/e2b/e2b_server.go | Adds an E2B server that registers sandbox + template routes and auth middleware. |
| manifests/charts/base/values.yaml | Adds templates-related Helm values (currently not wired into templates). |
| manifests/charts/base/templates/rbac-router.yaml | Expands Router RBAC rules for runtime API resources. |
| hack/setup-local-env.sh | Adds a local dev helper to stand up Kind + AgentCube + port-forwards. |
| hack/setup-kind-cluster.sh | Adds a Kind cluster setup script with agent-sandbox + Redis provisioning. |
| go.mod | Adds an indirect dependency (objx) pulled by the new test stack. |
| examples/templates/python_sdk_example.py | Adds a Python example for using Templates API via an SDK. |
| examples/templates/list_templates.sh | Adds a curl example for listing templates. |
| examples/templates/create_template.sh | Adds a curl example for creating a template and polling state. |
| docs/tutorials/e2b-api-guide.md | Adds an E2B API usage guide (Chinese) with curl/SDK examples. |
| docs/api/templates-api.md | Adds Templates API reference documentation. |
| docs/api/e2b-openapi.yaml | Adds an OpenAPI document for the E2B-compatible sandbox API. |
| docs/api-comparison.md | Adds/updates a comparison doc positioning AgentCube vs E2B/OpenKruise. |
| README.md | Updates README to mention E2B compatibility + adds feature tables/quick example. |
| .github/workflows/templates-api-tests.yml | Adds a workflow to run Templates-specific unit/integration tests. |
| .github/workflows/e2e.yml | Updates e2e workflow to set E2B API key env/config before running e2e. |
| .github/workflows/e2b-api.yml | Adds a dedicated Kind-based workflow for E2B API + SDK compatibility testing. |
| .github/workflows/code-quality.yml | Adds a comprehensive code-quality workflow (fmt/vet/lint/tests/build/tidy/generate). |
| // List sandboxes using store client | ||
| // We use ListExpiredSandboxes with a future time to get all active sandboxes | ||
| ctx := c.Request.Context() | ||
| futureTime := time.Now().Add(365 * 24 * time.Hour) // Far future to get all sandboxes | ||
|
|
||
| sandboxes, err := s.storeClient.ListExpiredSandboxes(ctx, futureTime, 1000) | ||
| if err != nil { |
There was a problem hiding this comment.
handleListSandboxes is using ListExpiredSandboxes(time.Now()+365d) to approximate “list all sandboxes”. This will include already-expired sessions and will also silently omit sessions with ExpiresAt beyond that arbitrary window. Consider adding a store method to list active sandboxes (e.g., ZRANGEBYSCORE expiryIndexKey from now..+inf) and use that here so the endpoint matches “running sandboxes” semantics.
| // Calculate new expiration time from now | ||
| newExpiresAt := time.Now().Add(time.Duration(req.Timeout) * time.Second) | ||
| sandbox.ExpiresAt = newExpiresAt | ||
|
|
||
| // Update sandbox | ||
| if err := s.storeClient.UpdateSandbox(ctx, sandbox); err != nil { | ||
| klog.Errorf("failed to update sandbox timeout: %v", err) | ||
| respondWithError(c, ErrInternal, "failed to set timeout") | ||
| return | ||
| } |
There was a problem hiding this comment.
handleSetTimeout updates ExpiresAt and persists via UpdateSandbox, but UpdateSandbox doesn’t update the expiry index (ZSet). This means the new timeout won’t be reflected in expiration/GC behavior. Please update the expiry index together with the stored object (likely via a new store API that does both atomically).
| // Parse template_id to namespace/name | ||
| namespace, name := parseTemplateID(req.TemplateID) | ||
|
|
||
| klog.Infof("creating sandbox: template=%s, namespace=%s, clientID=%s, timeout=%d", | ||
| req.TemplateID, namespace, clientID, req.Timeout) |
There was a problem hiding this comment.
handleCreateSandbox parses template_id but never validates its format, even though validateTemplateID exists and other template endpoints use it. Without validation, template_id values with extra slashes or invalid characters can flow into K8s names and fail in confusing ways. Consider validating template_id up-front and returning ErrInvalidRequest on bad input.
| resources: ["secrets"] | ||
| verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] | ||
| - apiGroups: ["runtime.agentcube.volcano.sh"] | ||
| resources: ["codeinterpreterbuilds", "agentruntimebuilds"] |
There was a problem hiding this comment.
The router E2B Templates handlers create/read CodeInterpreter CRDs, but this Role grants access only to "codeinterpreterbuilds"/"agentruntimebuilds". Those resources don’t exist in this repo’s runtime API types, and the router will still lack permissions for "codeinterpreters"/"agentruntimes" (and lists/watches) needed for template CRUD. Please adjust the RBAC resources to the actual CRDs being accessed.
| resources: ["codeinterpreterbuilds", "agentruntimebuilds"] | |
| resources: ["codeinterpreters", "agentruntimes"] |
| // E2B API routes (templates, sandboxes) - registered at root level for E2B compatibility | ||
| // These routes handle their own authentication via API key middleware | ||
| e2bGroup := s.engine.Group("") | ||
| e2bGroup.Use(gin.Logger()) | ||
| e2bGroup.Use(gin.Recovery()) | ||
| // Apply concurrency limiting to E2B routes as well | ||
| e2bGroup.Use(s.concurrencyLimitMiddleware()) |
There was a problem hiding this comment.
concurrencyLimitMiddleware() allocates a new semaphore channel each time it’s called. By applying it to both v1 and E2B groups, the effective global concurrency cap becomes ~2× MaxConcurrentRequests (one pool per group). If the intent is a single global limit, store the semaphore on Server and reuse the same middleware instance across groups.
| // If timeout is provided, extend expiration time | ||
| if req.Timeout > 0 { | ||
| sandbox.ExpiresAt = time.Now().Add(time.Duration(req.Timeout) * time.Second) | ||
| if err := s.storeClient.UpdateSandbox(ctx, sandbox); err != nil { | ||
| klog.Errorf("failed to update sandbox on refresh: %v", err) | ||
| respondWithError(c, ErrInternal, "failed to refresh sandbox") | ||
| return | ||
| } | ||
| klog.Infof("sandbox refreshed with timeout: sandboxID=%s, timeout=%d", sandboxID, req.Timeout) |
There was a problem hiding this comment.
Refresh semantics don’t match the API docs/comments: when a timeout is provided, this sets ExpiresAt = now + timeout rather than extending from the current expiration. For a true “refresh/keep-alive”, it should usually add to max(now, current ExpiresAt) so repeated refreshes monotonically extend the lifetime.
| // Return 500 for invalid endpoint URL errors, 404 for not found errors | ||
| status := http.StatusNotFound | ||
| if strings.Contains(err.Error(), "invalid endpoint URL") { | ||
| status = http.StatusInternalServerError | ||
| } | ||
| c.JSON(status, gin.H{ |
There was a problem hiding this comment.
This logic determines HTTP status by substring-matching err.Error() ("invalid endpoint URL"). That’s brittle and couples behavior to error text. Prefer a sentinel/typed error (e.g., var ErrInvalidEndpointURL) or wrapping with errors.Is, then branch on errors.Is(err, ErrInvalidEndpointURL) to decide between 404 vs 500.
| -d '{ | ||
| "name": "python-data-science", | ||
| "description": "Python template with data science libraries", | ||
| "public": true, | ||
| "aliases": ["datascience", "py-ds"], | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2, | ||
| "dockerfile": "FROM python:3.11-slim | ||
|
|
||
| WORKDIR /app | ||
|
|
||
| # Install data science libraries | ||
| RUN pip install --no-cache-dir \ | ||
| pandas \ | ||
| numpy \ | ||
| matplotlib \ | ||
| scikit-learn \ | ||
| jupyter | ||
|
|
||
| # Set up working directory | ||
| COPY . /app/ | ||
|
|
||
| CMD [\"python\"]", | ||
| "startCommand": "python" | ||
| }') | ||
|
|
||
| echo "Response: $TEMPLATE_RESPONSE" | ||
|
|
||
| # Extract template ID from response | ||
| TEMPLATE_ID=$(echo "$TEMPLATE_RESPONSE" | grep -o '"templateID":"[^"]*"' | cut -d'"' -f4) | ||
|
|
There was a problem hiding this comment.
The JSON field names in this example (memoryMB, cpuCount, startCommand) don’t match the server’s Templates API models, which use snake_case (memory_mb, vcpu_count, start_command). Also the response parsing looks for "templateID" but the server returns "template_id". As written, this script will fail against the current implementation—please align the field names with the API.
| NewSandbox: | ||
| type: object | ||
| description: 创建沙箱的请求 | ||
| properties: | ||
| templateID: | ||
| type: string | ||
| description: 模板 ID(格式:namespace/name 或 name) | ||
| example: default/code-interpreter | ||
| timeout: | ||
| type: integer | ||
| description: 沙箱超时时间(秒) | ||
| default: 900 | ||
| minimum: 60 | ||
| maximum: 86400 | ||
| metadata: | ||
| type: object | ||
| additionalProperties: true | ||
| description: 自定义元数据 | ||
| envVars: | ||
| type: object | ||
| additionalProperties: | ||
| type: string | ||
| description: 环境变量(Phase 1 不支持) | ||
| autoPause: | ||
| type: boolean | ||
| description: 自动暂停(Phase 1 不支持,必须为 false) | ||
| default: false | ||
| allowInternetAccess: | ||
| type: boolean | ||
| description: 允许互联网访问(Phase 1 不支持) | ||
| default: false | ||
| secure: | ||
| type: boolean | ||
| description: 安全模式 | ||
| default: false | ||
| required: | ||
| - templateID | ||
|
|
There was a problem hiding this comment.
The OpenAPI schema and examples here use camelCase (templateID/clientID/sandboxID, envVars, autoPause), but the router’s E2B models use snake_case JSON tags (template_id/client_id/sandbox_id, env_vars, auto_pause). Please align the OpenAPI spec field names (and defaults like timeout) with the actual implementation so the spec is usable.
| # Run template-specific E2E tests with API key | ||
| export ROUTER_URL=http://localhost:8081 | ||
| export E2B_API_KEY=${{ env.E2B_API_KEY }} | ||
| go test -v ./test/e2e/... -run "Template" -timeout 10m |
There was a problem hiding this comment.
The integration test step runs go test ./test/e2e/... without -tags e2e, but test/e2e/e2e_test.go now has //go:build e2e and will be excluded. This can make the workflow pass while running no meaningful E2E tests. Add -tags e2e (consistent with run_e2e.sh) to ensure the intended tests execute.
| go test -v ./test/e2e/... -run "Template" -timeout 10m | |
| go test -tags e2e -v ./test/e2e/... -run "Template" -timeout 10m |
5f85097 to
d276f04
Compare
There was a problem hiding this comment.
This duplicates the check that already exists in the main branch
There was a problem hiding this comment.
Still work in progcess, I’ll check
There was a problem hiding this comment.
I think this is also need to be removed
There was a problem hiding this comment.
seem to be a lot of unnecessary files
ed707e9 to
8a98efb
Compare
| ## Phase 1 限制说明 | ||
|
|
||
| 当前 E2B API 实现为 Phase 1,具有以下限制: | ||
|
|
||
| ### 支持的特性 | ||
|
|
||
| - [OK] 沙箱生命周期管理(创建、获取、列出、删除) | ||
| - [OK] 超时设置和刷新 | ||
| - [OK] API Key 认证 | ||
| - [OK] 基础元数据支持 | ||
|
|
||
| ### 不支持的特性(Phase 2+) | ||
|
|
||
| | 特性 | 状态 | 说明 | | ||
| |------|------|------| | ||
| | `/templates/*` API | [不支持] | 模板管理 API | | ||
| | `/sandboxes/{id}/metrics` | [不支持] | 指标监控 API | | ||
| | `/sandboxes/{id}/logs` | [不支持] | 日志流 API | | ||
| | `/snapshots/*` API | [不支持] | 快照管理 API | | ||
| | `/volumes/*` API | [不支持] | 卷管理 API | | ||
| | Pause/Resume | [不支持] | 沙箱暂停/恢复功能 | | ||
| | Auto Pause | [不支持] | 自动暂停功能(请求时会返回错误) | | ||
| | Network 配置 | [不支持] | 网络访问配置 | | ||
| | Volume Mounts | [不支持] | 卷挂载 | | ||
| | Environment Variables | [不支持] | 环境变量注入 | | ||
|
|
There was a problem hiding this comment.
The “Phase 1 限制说明” section states that /templates/* is not supported, but this PR adds Templates API routes under pkg/router/e2b and registers them in the router. This creates contradictory guidance for users. Please update the Phase 1 limitations list (and any examples) to reflect the current supported endpoints.
| // handleDeleteSandbox handles DELETE /sandboxes/{id} | ||
| func (ts *TestServer) handleDeleteSandbox(c *gin.Context) { | ||
| id := c.Param("id") | ||
|
|
||
| ts.mu.Lock() | ||
| sb, exists := ts.sandboxes[id] | ||
| if !exists { | ||
| ts.mu.Unlock() | ||
| c.JSON(http.StatusNotFound, ErrorResponse{ | ||
| Error: "not_found", | ||
| Code: "SANDBOX_NOT_FOUND", | ||
| Message: "Sandbox not found: " + id, | ||
| }) | ||
| return | ||
| } | ||
|
|
||
| delete(ts.sandboxes, id) | ||
| ts.mu.Unlock() | ||
|
|
||
| // Also delete from store | ||
| if err := ts.Store.DeleteSandboxBySessionID(c.Request.Context(), sb.SessionID); err != nil { | ||
| // Log error but don't fail the request | ||
| // In real implementation, use proper logging | ||
| _ = err | ||
| } | ||
|
|
||
| c.JSON(http.StatusOK, SuccessResponse{ | ||
| Message: "Sandbox deleted successfully", | ||
| }) | ||
| } |
There was a problem hiding this comment.
test/e2b is presented as “black-box” E2B API tests, but NewTestServer defines its own in-memory Gin handlers (not the router’s pkg/router/e2b implementation) and the tests exercise those local handlers. This means the suite can pass even if the real router implementation is broken. If the intent is to validate AgentCube’s E2B compatibility, consider turning these into integration tests that hit the actual router (e.g., start pkg/router with a mock store/session manager, or run against a real router instance via HTTP).
| // Set timeout if specified | ||
| if req.Timeout > 0 { | ||
| sandbox.ExpiresAt = CalculateExpiry(req.Timeout) | ||
| if err := s.storeClient.UpdateSandbox(c.Request.Context(), sandbox); err != nil { | ||
| klog.Errorf("failed to update sandbox timeout: %v", err) | ||
| } | ||
| } | ||
|
|
||
| // Convert to E2B response | ||
| response := s.mapper.ToE2BSandbox(sandbox, clientID) | ||
|
|
||
| klog.Infof("sandbox created successfully: sandboxID=%s", sandbox.SessionID) | ||
| c.JSON(http.StatusCreated, response) | ||
| } | ||
|
|
||
| // handleListSandboxes handles GET /sandboxes - List all sandboxes | ||
| func (s *Server) handleListSandboxes(c *gin.Context) { | ||
| // Get client ID from context | ||
| clientID := c.GetString("client_id") | ||
| if clientID == "" { | ||
| clientID = defaultClientID | ||
| } | ||
|
|
||
| // List sandboxes using store client | ||
| // We use ListExpiredSandboxes with a future time to get all active sandboxes | ||
| ctx := c.Request.Context() | ||
| futureTime := time.Now().Add(365 * 24 * time.Hour) // Far future to get all sandboxes | ||
|
|
||
| sandboxes, err := s.storeClient.ListExpiredSandboxes(ctx, futureTime, 1000) | ||
| if err != nil { | ||
| klog.Errorf("failed to list sandboxes: %v", err) | ||
| respondWithError(c, ErrInternal, "failed to list sandboxes") | ||
| return | ||
| } | ||
|
|
||
| // Convert to E2B response | ||
| response := make([]ListedSandbox, 0, len(sandboxes)) | ||
| for _, sandbox := range sandboxes { | ||
| response = append(response, *s.mapper.ToE2BListedSandbox(sandbox, clientID)) | ||
| } | ||
|
|
||
| klog.V(4).Infof("listed %d sandboxes", len(response)) | ||
| c.JSON(http.StatusOK, response) | ||
| } | ||
|
|
||
| // handleGetSandbox handles GET /sandboxes/{id} - Get sandbox details | ||
| func (s *Server) handleGetSandbox(c *gin.Context) { | ||
| sandboxID := c.Param("id") | ||
| if sandboxID == "" { | ||
| respondWithError(c, ErrInvalidRequest, "sandbox id is required") | ||
| return | ||
| } | ||
|
|
||
| // Get client ID from context | ||
| clientID := c.GetString("client_id") | ||
| if clientID == "" { | ||
| clientID = defaultClientID | ||
| } | ||
|
|
||
| // Get sandbox from store | ||
| sandbox, err := s.storeClient.GetSandboxBySessionID(c.Request.Context(), sandboxID) | ||
| if err != nil { | ||
| handleStoreError(c, err) | ||
| return | ||
| } | ||
|
|
||
| // Convert to E2B response | ||
| response := s.mapper.ToE2BSandboxDetail(sandbox, clientID) | ||
|
|
||
| klog.V(4).Infof("retrieved sandbox: sandboxID=%s", sandboxID) | ||
| c.JSON(http.StatusOK, response) | ||
| } | ||
|
|
||
| // handleDeleteSandbox handles DELETE /sandboxes/{id} - Delete a sandbox | ||
| func (s *Server) handleDeleteSandbox(c *gin.Context) { | ||
| sandboxID := c.Param("id") | ||
| if sandboxID == "" { | ||
| respondWithError(c, ErrInvalidRequest, "sandbox id is required") | ||
| return | ||
| } | ||
|
|
||
| ctx := c.Request.Context() | ||
|
|
||
| // First get the sandbox to find the session ID | ||
| sandbox, err := s.storeClient.GetSandboxBySessionID(ctx, sandboxID) | ||
| if err != nil { | ||
| handleStoreError(c, err) | ||
| return | ||
| } | ||
|
|
||
| // Delete sandbox by session ID | ||
| if err := s.storeClient.DeleteSandboxBySessionID(ctx, sandbox.SessionID); err != nil { | ||
| klog.Errorf("failed to delete sandbox: %v", err) | ||
| respondWithError(c, ErrInternal, "failed to delete sandbox") | ||
| return | ||
| } | ||
|
|
||
| klog.Infof("sandbox deleted successfully: sandboxID=%s, sessionID=%s", sandboxID, sandbox.SessionID) | ||
| c.Status(http.StatusNoContent) | ||
| } | ||
|
|
||
| // handleSetTimeout handles POST /sandboxes/{id}/timeout - Set sandbox timeout | ||
| func (s *Server) handleSetTimeout(c *gin.Context) { | ||
| sandboxID := c.Param("id") | ||
| if sandboxID == "" { | ||
| respondWithError(c, ErrInvalidRequest, "sandbox id is required") | ||
| return | ||
| } | ||
|
|
||
| var req TimeoutRequest | ||
| if err := c.ShouldBindJSON(&req); err != nil { | ||
| klog.Errorf("failed to bind request body: %v", err) | ||
| respondWithError(c, ErrInvalidRequest, "invalid request body") | ||
| return | ||
| } | ||
|
|
||
| // Validate timeout | ||
| if req.Timeout <= 0 { | ||
| respondWithError(c, ErrInvalidRequest, "timeout must be greater than 0") | ||
| return | ||
| } | ||
|
|
||
| ctx := c.Request.Context() | ||
|
|
||
| // Get the sandbox | ||
| sandbox, err := s.storeClient.GetSandboxBySessionID(ctx, sandboxID) | ||
| if err != nil { | ||
| handleStoreError(c, err) | ||
| return | ||
| } | ||
|
|
||
| // Calculate new expiration time from now | ||
| newExpiresAt := time.Now().Add(time.Duration(req.Timeout) * time.Second) | ||
| sandbox.ExpiresAt = newExpiresAt | ||
|
|
||
| // Update sandbox | ||
| if err := s.storeClient.UpdateSandbox(ctx, sandbox); err != nil { | ||
| klog.Errorf("failed to update sandbox timeout: %v", err) | ||
| respondWithError(c, ErrInternal, "failed to set timeout") | ||
| return | ||
| } |
There was a problem hiding this comment.
In handleCreateSandbox / handleSetTimeout / handleRefreshSandbox, the code updates sandbox.ExpiresAt and then calls storeClient.UpdateSandbox(...). However pkg/store’s UpdateSandbox explicitly does not update the expiry ZSET index (session:expiry), so TTL extensions won’t be reflected in GC / expiry-based queries. This will make /sandboxes/{id}/timeout and refresh-with-timeout appear to succeed but not actually extend lifetime. Consider adding a store method that updates both the sandbox object and the expiry index (e.g., UpdateSandboxExpiry / UpdateSandboxAndIndexes) and use that here, or update the store implementation so UpdateSandbox updates the expiry/last-activity indices when those fields change.
| // concurrencyLimitMiddleware limits the number of concurrent requests | ||
| func (s *Server) concurrencyLimitMiddleware() gin.HandlerFunc { | ||
| concurrency := make(chan struct{}, s.config.MaxConcurrentRequests) | ||
| return func(c *gin.Context) { | ||
| // Try to acquire a slot in the semaphore | ||
| select { | ||
| case concurrency <- struct{}{}: | ||
| // Successfully acquired a slot, continue processing | ||
| defer func() { | ||
| // Release the slot when done | ||
| <-concurrency | ||
| }() | ||
| c.Next() | ||
| default: | ||
| // No slots available, return 503 Service Unavailable | ||
| c.JSON(http.StatusTooManyRequests, gin.H{ | ||
| "error": "server overloaded, please try again later", | ||
| "code": "SERVER_OVERLOADED", | ||
| }) | ||
| c.Abort() | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // setupRoutes configures HTTP routes using Gin | ||
| func (s *Server) setupRoutes() { | ||
| s.engine = gin.New() | ||
|
|
||
| // Health check endpoints (no authentication required, no concurrency limit) | ||
| s.engine.GET("/health/live", s.handleHealthLive) | ||
| s.engine.GET("/health/ready", s.handleHealthReady) | ||
|
|
||
| // API v1 routes with concurrency limiting | ||
| v1 := s.engine.Group("/v1") | ||
| // Add middleware | ||
| v1.Use(gin.Logger()) | ||
| v1.Use(gin.Recovery()) | ||
|
|
||
| v1.Use(s.concurrencyLimitMiddleware()) // Apply concurrency limit to API routes | ||
|
|
||
| // Agent invoke requests (support GET/POST, since downstream uses these methods) | ||
| v1.GET("/namespaces/:namespace/agent-runtimes/:name/invocations/*path", s.handleAgentInvoke) | ||
| v1.POST("/namespaces/:namespace/agent-runtimes/:name/invocations/*path", s.handleAgentInvoke) | ||
|
|
||
| // Code interpreter invoke requests (support GET/POST, since downstream uses GET for file download) | ||
| v1.GET("/namespaces/:namespace/code-interpreters/:name/invocations/*path", s.handleCodeInterpreterInvoke) | ||
| v1.POST("/namespaces/:namespace/code-interpreters/:name/invocations/*path", s.handleCodeInterpreterInvoke) | ||
|
|
||
| // E2B API routes (templates, sandboxes) - registered at root level for E2B compatibility | ||
| // These routes handle their own authentication via API key middleware | ||
| e2bGroup := s.engine.Group("") | ||
| e2bGroup.Use(gin.Logger()) | ||
| e2bGroup.Use(gin.Recovery()) | ||
| // Apply concurrency limiting to E2B routes as well | ||
| e2bGroup.Use(s.concurrencyLimitMiddleware()) | ||
|
|
||
| // Initialize E2B server (registers /templates and /sandboxes routes) | ||
| _, err := e2b.NewServer(e2bGroup, s.storeClient, s.sessionManager) | ||
| if err != nil { |
There was a problem hiding this comment.
concurrencyLimitMiddleware() allocates a new semaphore channel each time it’s called. Since it’s applied separately to the /v1 group and the root E2B group, the router effectively allows up to 2 * MaxConcurrentRequests concurrent requests (one limit per group), which defeats the intent of a global cap. Consider creating the semaphore once on Server (e.g., s.concurrencySem) and returning a handler that uses that shared semaphore so the limit is enforced across all routes.
| for _, tt := range tests { | ||
| t.Run(string(rune(tt.timeout)), func(t *testing.T) { | ||
| result := CalculateExpiry(tt.timeout) | ||
| // Check that the result is within a reasonable time window | ||
| expectedTime := time.Now().Add(tt.expected) | ||
| assert.WithinDuration(t, expectedTime, result, time.Second) | ||
| }) |
There was a problem hiding this comment.
TestCalculateExpiry uses t.Run(string(rune(tt.timeout)), ...) as the subtest name. Converting an int timeout to a rune produces non-printable/duplicate names (e.g., 60 becomes '<'), which makes test output confusing and can collide. Use a stable string like fmt.Sprintf("timeout_%d", tt.timeout) instead.
| -d '{ | ||
| "name": "python-data-science", | ||
| "description": "Python template with data science libraries", | ||
| "public": true, | ||
| "aliases": ["datascience", "py-ds"], | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2, | ||
| "dockerfile": "FROM python:3.11-slim | ||
|
|
||
| WORKDIR /app | ||
|
|
||
| # Install data science libraries | ||
| RUN pip install --no-cache-dir \ | ||
| pandas \ | ||
| numpy \ | ||
| matplotlib \ | ||
| scikit-learn \ | ||
| jupyter | ||
|
|
||
| # Set up working directory | ||
| COPY . /app/ | ||
|
|
||
| CMD [\"python\"]", | ||
| "startCommand": "python" | ||
| }') | ||
|
|
||
| echo "Response: $TEMPLATE_RESPONSE" | ||
|
|
||
| # Extract template ID from response | ||
| TEMPLATE_ID=$(echo "$TEMPLATE_RESPONSE" | grep -o '"templateID":"[^"]*"' | cut -d'"' -f4) | ||
|
|
There was a problem hiding this comment.
This script uses camelCase field names (memoryMB, cpuCount, startCommand) and parses templateID from the response, but the Templates API models in pkg/router/e2b/templates_models.go use snake_case JSON (memory_mb, vcpu_count, start_command, template_id). As written, the request won’t bind as intended and TEMPLATE_ID extraction will fail. Update the payload keys and the response parsing to match the actual API field names (or adjust the API to accept both formats if compatibility requires it).
| // List sandboxes using store client | ||
| // We use ListExpiredSandboxes with a future time to get all active sandboxes | ||
| ctx := c.Request.Context() | ||
| futureTime := time.Now().Add(365 * 24 * time.Hour) // Far future to get all sandboxes | ||
|
|
||
| sandboxes, err := s.storeClient.ListExpiredSandboxes(ctx, futureTime, 1000) | ||
| if err != nil { | ||
| klog.Errorf("failed to list sandboxes: %v", err) | ||
| respondWithError(c, ErrInternal, "failed to list sandboxes") | ||
| return |
There was a problem hiding this comment.
handleListSandboxes uses ListExpiredSandboxes with a timestamp 1 year in the future to approximate “list all active sandboxes”. ListExpiredSandboxes returns sandboxes whose ExpiresAt is before the provided time, so this will include already-expired sandboxes as well. It also relies on the expiry index being accurate (which UpdateSandbox doesn’t maintain). To match E2B semantics (list running/active sandboxes), consider adding a store query for non-expired sessions (score >= now), or filtering the returned items by ExpiresAt.After(time.Now()) after fetching IDs from a proper index.
| ```json | ||
| [ | ||
| { | ||
| "templateID": "default/my-template", | ||
| "name": "my-template", | ||
| "description": "My code interpreter template", | ||
| "aliases": ["my-alias"], | ||
| "createdAt": "2024-01-15T10:30:00Z", | ||
| "updatedAt": "2024-01-15T10:30:00Z", | ||
| "public": true, | ||
| "state": "ready", | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2 | ||
| } | ||
| ] | ||
| ``` | ||
|
|
||
| **States:** | ||
|
|
||
| - `pending` - Template is being created | ||
| - `building` - Template build is in progress | ||
| - `ready` - Template is ready to use | ||
| - `failed` - Template build failed | ||
| - `deprecated` - Template is deprecated | ||
|
|
||
| ### Get Template | ||
|
|
||
| ``` | ||
| GET /templates/{id} | ||
| ``` | ||
|
|
||
| Path parameters: | ||
|
|
||
| | Parameter | Type | Description | | ||
| |-----------|------|-------------| | ||
| | `id` | string | Template ID (format: `namespace/name`) | | ||
|
|
||
| Response: | ||
|
|
||
| ```json | ||
| { | ||
| "templateID": "default/my-template", | ||
| "name": "my-template", | ||
| "description": "My code interpreter template", | ||
| "aliases": ["my-alias"], | ||
| "createdAt": "2024-01-15T10:30:00Z", | ||
| "updatedAt": "2024-01-15T10:30:00Z", | ||
| "public": true, | ||
| "state": "ready", | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2, | ||
| "startCommand": "python app.py" | ||
| } | ||
| ``` | ||
|
|
||
| ### Create Template | ||
|
|
||
| ``` | ||
| POST /templates | ||
| ``` | ||
|
|
||
| Request body: | ||
|
|
||
| ```json | ||
| { | ||
| "name": "my-template", | ||
| "description": "My template description", | ||
| "dockerfile": "FROM python:3.9-slim\nRUN pip install pandas numpy", | ||
| "startCommand": "python app.py", | ||
| "aliases": ["alias1", "alias2"], | ||
| "public": true, | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2 | ||
| } | ||
| ``` | ||
|
|
||
| Request fields: | ||
|
|
||
| | Field | Type | Required | Description | | ||
| |-------|------|----------|-------------| | ||
| | `name` | string | Yes | Template name (unique within namespace) | | ||
| | `description` | string | No | Template description | | ||
| | `dockerfile` | string | No | Dockerfile content for custom builds | | ||
| | `startCommand` | string | No | Command to run when sandbox starts | | ||
| | `aliases` | array[string] | No | Alternative names for the template | | ||
| | `public` | bool | No | Whether template is publicly accessible | | ||
| | `memoryMB` | int | No | Memory allocation in MB (default: 4096) | | ||
| | `cpuCount` | int | No | CPU cores allocation (default: 2) | | ||
|
|
||
| Response: | ||
|
|
||
| ```json | ||
| { | ||
| "templateID": "default/my-template", | ||
| "name": "my-template", | ||
| "description": "My template description", | ||
| "aliases": ["alias1", "alias2"], | ||
| "createdAt": "2024-01-15T10:30:00Z", | ||
| "updatedAt": "2024-01-15T10:30:00Z", | ||
| "public": true, | ||
| "state": "pending", | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2 | ||
| } | ||
| ``` | ||
|
|
||
| ### Update Template | ||
|
|
||
| ``` | ||
| PATCH /templates/{id} | ||
| ``` | ||
|
|
||
| Path parameters: | ||
|
|
||
| | Parameter | Type | Description | | ||
| |-----------|------|-------------| | ||
| | `id` | string | Template ID (format: `namespace/name`) | | ||
|
|
||
| Request body (all fields optional): | ||
|
|
||
| ```json | ||
| { | ||
| "description": "Updated description", | ||
| "aliases": ["new-alias"], | ||
| "public": false, | ||
| "memoryMB": 8192, | ||
| "cpuCount": 4 | ||
| } | ||
| ``` | ||
|
|
||
| Response: Updated template object (same format as Get Template) | ||
|
|
||
| ### Delete Template | ||
|
|
||
| ``` | ||
| DELETE /templates/{id} | ||
| ``` | ||
|
|
||
| Path parameters: | ||
|
|
||
| | Parameter | Type | Description | | ||
| |-----------|------|-------------| | ||
| | `id` | string | Template ID (format: `namespace/name`) | | ||
|
|
||
| Response: `204 No Content` | ||
|
|
||
| ### List Template Builds | ||
|
|
||
| ``` | ||
| GET /templates/{id}/builds | ||
| ``` | ||
|
|
||
| Path parameters: | ||
|
|
||
| | Parameter | Type | Description | | ||
| |-----------|------|-------------| | ||
| | `id` | string | Template ID (format: `namespace/name`) | | ||
|
|
||
| Query parameters: | ||
|
|
||
| | Parameter | Type | Description | | ||
| |-----------|------|-------------| | ||
| | `limit` | int | Maximum number of builds to return (default: 100) | | ||
| | `offset` | int | Offset for pagination (default: 0) | | ||
|
|
||
| Response: | ||
|
|
||
| ```json | ||
| [ | ||
| { | ||
| "buildID": "build-12345", | ||
| "templateID": "default/my-template", | ||
| "state": "completed", | ||
| "createdAt": "2024-01-15T10:30:00Z", | ||
| "completedAt": "2024-01-15T10:35:00Z", | ||
| "logs": "Building image...\nInstalling dependencies..." | ||
| } | ||
| ] | ||
| ``` | ||
|
|
||
| **Build States:** | ||
|
|
||
| - `pending` - Build queued | ||
| - `building` - Build in progress | ||
| - `completed` - Build succeeded | ||
| - `failed` - Build failed | ||
| - `cancelled` - Build was cancelled | ||
|
|
||
| ### Create Template Build | ||
|
|
||
| ``` | ||
| POST /templates/{id}/builds | ||
| ``` | ||
|
|
||
| Path parameters: | ||
|
|
||
| | Parameter | Type | Description | | ||
| |-----------|------|-------------| | ||
| | `id` | string | Template ID (format: `namespace/name`) | | ||
|
|
||
| Request body: | ||
|
|
||
| ```json | ||
| { | ||
| "dockerfile": "FROM python:3.11-slim\nRUN pip install pandas numpy matplotlib", | ||
| "startCommand": "python app.py" | ||
| } | ||
| ``` | ||
|
|
||
| Request fields: | ||
|
|
||
| | Field | Type | Required | Description | | ||
| |-------|------|----------|-------------| | ||
| | `dockerfile` | string | No | Updated Dockerfile content | | ||
| | `startCommand` | string | No | Updated start command | | ||
|
|
||
| Response: | ||
|
|
||
| ```json | ||
| { | ||
| "buildID": "build-12345", | ||
| "templateID": "default/my-template", | ||
| "state": "pending", | ||
| "createdAt": "2024-01-15T10:30:00Z" | ||
| } | ||
| ``` | ||
|
|
||
| ### Get Template Build | ||
|
|
||
| ``` | ||
| GET /templates/{id}/builds/{buildId} | ||
| ``` | ||
|
|
||
| Path parameters: | ||
|
|
||
| | Parameter | Type | Description | | ||
| |-----------|------|-------------| | ||
| | `id` | string | Template ID (format: `namespace/name`) | | ||
| | `buildId` | string | Build ID | | ||
|
|
||
| Response: | ||
|
|
||
| ```json | ||
| { | ||
| "buildID": "build-12345", | ||
| "templateID": "default/my-template", | ||
| "state": "completed", | ||
| "createdAt": "2024-01-15T10:30:00Z", | ||
| "completedAt": "2024-01-15T10:35:00Z", | ||
| "logs": "Building image...\nStep 1/3 : FROM python:3.11-slim\n..." | ||
| } | ||
| ``` | ||
|
|
||
| ## Error Codes | ||
|
|
||
| | Code | HTTP Status | Description | | ||
| |------|-------------|-------------| | ||
| | `invalid_request` | 400 | Invalid request parameters | | ||
| | `unauthorized` | 401 | Missing or invalid API key | | ||
| | `template_not_found` | 404 | Template does not exist | | ||
| | `build_not_found` | 404 | Build does not exist | | ||
| | `template_already_exists` | 409 | Template with this name already exists | | ||
| | `build_in_progress` | 409 | Another build is already in progress for this template | | ||
| | `invalid_dockerfile` | 422 | Dockerfile syntax error | | ||
| | `internal_error` | 500 | Internal server error | | ||
|
|
||
| ## Mapping to Kubernetes | ||
|
|
||
| Templates map to CodeInterpreter and AgentRuntime CRDs: | ||
|
|
||
| | Templates API | Kubernetes CRD | Notes | | ||
| |---------------|----------------|-------| | ||
| | Template ID | CRD name | Format: `{namespace}/{name}` | | ||
| | Aliases | Annotation | Stored in `e2b.agentcube.io/aliases` | | ||
| | Public flag | Label | Stored in `e2b.agentcube.io/public` | | ||
| | Dockerfile | Build source | Used to build container image | | ||
| | Start command | Container args | Command to run on sandbox start | | ||
| | Memory/CPU | Resource limits | Maps to container resources | | ||
|
|
||
| ### Example CRD Mapping | ||
|
|
||
| A template with ID `default/python-ds` maps to: | ||
|
|
||
| ```yaml | ||
| apiVersion: runtime.agentcube.io/v1alpha1 | ||
| kind: CodeInterpreter | ||
| metadata: | ||
| name: python-ds | ||
| namespace: default | ||
| labels: | ||
| e2b.agentcube.io/public: "true" | ||
| annotations: | ||
| e2b.agentcube.io/aliases: '["datascience", "py-ds"]' | ||
| spec: | ||
| image: agentcube/python-ds:latest | ||
| resources: | ||
| memoryMB: 4096 | ||
| cpuCount: 2 | ||
| ``` |
There was a problem hiding this comment.
The Templates API documentation examples use camelCase JSON fields (templateID, createdAt, memoryMB, cpuCount, etc.), but the actual API structs serialize/deserialize snake_case (template_id, created_at, memory_mb, vcpu_count, ...). This mismatch will cause clients following the doc to send the wrong payload and misinterpret responses. Please align the docs with the implemented JSON field names (or explicitly document/implement dual-format support). Also, the CRD example uses apiVersion: runtime.agentcube.io/v1alpha1, but the actual group in this repo is runtime.agentcube.volcano.sh/v1alpha1.
ca550d7 to
7310723
Compare
7310723 to
6c29ee7
Compare
| // Parse template_id to namespace/name | ||
| namespace, name := parseTemplateID(req.TemplateID) | ||
|
|
||
| klog.Infof("creating sandbox: template=%s, namespace=%s, clientID=%s, timeout=%d", | ||
| req.TemplateID, namespace, clientID, req.Timeout) | ||
|
|
There was a problem hiding this comment.
handleCreateSandbox parses template_id via parseTemplateID but does not validate it. This allows IDs with multiple slashes or invalid characters (e.g., a/b/c becomes name b/c), which will likely fail later when mapping to Kubernetes resource names and may surface as 5xx errors. You already have validateTemplateID; please call it early and return a 400 on validation failure.
| paths: | ||
| /sandboxes: | ||
| post: | ||
| summary: 创建沙箱 | ||
| description: | | ||
| 从模板创建一个新的沙箱实例。 | ||
|
|
||
| **Phase 1 限制:** | ||
| - `autoPause` 必须为 `false`(默认) | ||
| - `envVars` 不支持 | ||
| - `network` 配置不支持 | ||
| - `volumeMounts` 不支持 | ||
| operationId: createSandbox | ||
| tags: | ||
| - Sandboxes | ||
| requestBody: | ||
| required: true | ||
| content: | ||
| application/json: | ||
| schema: | ||
| $ref: '#/components/schemas/NewSandbox' | ||
| examples: | ||
| minimal: | ||
| summary: 最小请求 | ||
| value: | ||
| templateID: default/code-interpreter | ||
| with-timeout: | ||
| summary: 指定超时 | ||
| value: | ||
| templateID: default/code-interpreter | ||
| timeout: 900 | ||
| metadata: | ||
| project: my-project | ||
| responses: | ||
| '201': | ||
| description: 沙箱创建成功 | ||
| content: | ||
| application/json: | ||
| schema: | ||
| $ref: '#/components/schemas/Sandbox' | ||
| example: | ||
| clientID: client-abc123 | ||
| envdVersion: "1.0.0" | ||
| sandboxID: sb-xyz789 | ||
| templateID: default/code-interpreter | ||
| domain: sb-xyz789.agentcube.local | ||
| '400': | ||
| description: 请求参数错误(如设置了不支持的 autoPause) | ||
| content: | ||
| application/json: | ||
| schema: | ||
| $ref: '#/components/schemas/E2BError' | ||
| example: | ||
| code: 400 | ||
| message: auto_pause not supported in Phase 1 |
There was a problem hiding this comment.
This OpenAPI spec documents camelCase fields like templateID, clientID, envdVersion, etc., but the router implementation structs use snake_case JSON tags (e.g., template_id, client_id, envd_version). This mismatch will break generated clients and mislead SDK users. Please align the OpenAPI schema and examples with the actual wire format (or update the server to accept/emit the documented casing consistently).
| ```json | ||
| [ | ||
| { | ||
| "templateID": "default/my-template", | ||
| "name": "my-template", | ||
| "description": "My code interpreter template", | ||
| "aliases": ["my-alias"], | ||
| "createdAt": "2024-01-15T10:30:00Z", | ||
| "updatedAt": "2024-01-15T10:30:00Z", | ||
| "public": true, | ||
| "state": "ready", | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2 | ||
| } | ||
| ] | ||
| ``` | ||
|
|
||
| **States:** | ||
|
|
||
| - `pending` - Template is being created | ||
| - `building` - Template build is in progress | ||
| - `ready` - Template is ready to use | ||
| - `failed` - Template build failed | ||
| - `deprecated` - Template is deprecated | ||
|
|
||
| ### Get Template | ||
|
|
||
| ``` | ||
| GET /templates/{id} | ||
| ``` | ||
|
|
||
| Path parameters: | ||
|
|
||
| | Parameter | Type | Description | | ||
| |-----------|------|-------------| | ||
| | `id` | string | Template ID (format: `namespace/name`) | | ||
|
|
||
| Response: | ||
|
|
||
| ```json | ||
| { | ||
| "templateID": "default/my-template", | ||
| "name": "my-template", | ||
| "description": "My code interpreter template", | ||
| "aliases": ["my-alias"], | ||
| "createdAt": "2024-01-15T10:30:00Z", | ||
| "updatedAt": "2024-01-15T10:30:00Z", | ||
| "public": true, | ||
| "state": "ready", | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2, | ||
| "startCommand": "python app.py" | ||
| } | ||
| ``` | ||
|
|
||
| ### Create Template | ||
|
|
||
| ``` | ||
| POST /templates | ||
| ``` | ||
|
|
||
| Request body: | ||
|
|
||
| ```json | ||
| { | ||
| "name": "my-template", | ||
| "description": "My template description", | ||
| "dockerfile": "FROM python:3.9-slim\nRUN pip install pandas numpy", | ||
| "startCommand": "python app.py", | ||
| "aliases": ["alias1", "alias2"], | ||
| "public": true, | ||
| "memoryMB": 4096, | ||
| "cpuCount": 2 | ||
| } | ||
| ``` | ||
|
|
||
| Request fields: | ||
|
|
||
| | Field | Type | Required | Description | | ||
| |-------|------|----------|-------------| | ||
| | `name` | string | Yes | Template name (unique within namespace) | | ||
| | `description` | string | No | Template description | | ||
| | `dockerfile` | string | No | Dockerfile content for custom builds | | ||
| | `startCommand` | string | No | Command to run when sandbox starts | | ||
| | `aliases` | array[string] | No | Alternative names for the template | | ||
| | `public` | bool | No | Whether template is publicly accessible | | ||
| | `memoryMB` | int | No | Memory allocation in MB (default: 4096) | | ||
| | `cpuCount` | int | No | CPU cores allocation (default: 2) | | ||
|
|
There was a problem hiding this comment.
The Templates API docs use camelCase fields (e.g., templateID, createdAt, memoryMB, cpuCount, startCommand) but the implementation and tests for templates use snake_case JSON (e.g., template_id, created_at, memory_mb, vcpu_count, start_command). Please update the documentation examples and field lists to match the actual API responses/requests to avoid confusing users and breaking copy/paste snippets.
| klog.Errorf("Failed to get sandbox access address %s: %v", sandbox.SandboxID, err) | ||
| c.JSON(http.StatusNotFound, gin.H{ | ||
| // Return 500 for invalid endpoint URL errors, 404 for not found errors | ||
| status := http.StatusNotFound | ||
| if strings.Contains(err.Error(), "invalid endpoint URL") { | ||
| status = http.StatusInternalServerError | ||
| } | ||
| c.JSON(status, gin.H{ | ||
| "error": err.Error(), | ||
| }) |
There was a problem hiding this comment.
forwardToSandbox determines whether to return 500 vs 404 by checking strings.Contains(err.Error(), "invalid endpoint URL"). This is brittle (changes to error wording or wrapping will change behavior). Since buildURL is the source of this condition, consider returning a typed/sentinel error (or wrapping with a known value) and using errors.Is/As here to map to the correct status code reliably.
| // testRedis is the global miniredis instance for testing | ||
| var testRedis *miniredis.Miniredis | ||
| var testRedisOnce sync.Once | ||
|
|
||
| // getTestRedis returns the global miniredis instance, creating it if necessary | ||
| func getTestRedis() *miniredis.Miniredis { | ||
| testRedisOnce.Do(func() { | ||
| var err error | ||
| testRedis, err = miniredis.Run() | ||
| if err != nil { | ||
| panic(fmt.Sprintf("failed to start miniredis: %v", err)) | ||
| } | ||
| }) | ||
| return testRedis | ||
| } | ||
|
|
||
| // Mock SessionManager for testing | ||
| type mockSessionManager struct { | ||
| sandbox *types.SandboxInfo | ||
| err error | ||
| } | ||
|
|
||
| func (m *mockSessionManager) GetSandboxBySession(_ context.Context, _ string, _ string, _ string, _ string) (*types.SandboxInfo, error) { | ||
| return m.sandbox, m.err | ||
| } | ||
|
|
||
| func setupEnv() { | ||
| os.Setenv("REDIS_ADDR", "localhost:6379") | ||
| os.Setenv("REDIS_PASSWORD", "test-password") | ||
| mr := getTestRedis() | ||
| os.Setenv("REDIS_ADDR", mr.Addr()) | ||
| os.Setenv("REDIS_PASSWORD", "") | ||
| os.Setenv("REDIS_PASSWORD_REQUIRED", "false") | ||
| os.Setenv("WORKLOAD_MANAGER_URL", "http://localhost:8080") | ||
| } |
There was a problem hiding this comment.
Router tests rely on store.Storage() (a singleton guarded by sync.Once) inside NewServer(). Because the store singleton is initialized only once per test process, whichever test runs first will lock in REDIS_ADDR for all subsequent tests. With handlers_test.go now using miniredis and server_test.go still setting REDIS_ADDR=localhost:6379, the suite becomes order-dependent and can fail when the singleton is initialized with a non-running Redis. A robust fix is to ensure all tests initialize the store with the same miniredis address before any NewServer() call (e.g., via TestMain), and/or add a test-only reset hook in pkg/store to reinitialize the singleton between tests.
6c29ee7 to
df81ae5
Compare
|
[APPROVALNOTIFIER] This PR is NOT APPROVED This pull-request has been approved by: The full list of commands accepted by this bot can be found here. DetailsNeeds approval from an approver in each of these files:Approvers can indicate their approval by writing |
| // Validate unsupported features for Phase 1 | ||
| if req.AutoPause { | ||
| respondWithError(c, ErrInvalidRequest, "auto_pause not supported") | ||
| return | ||
| } | ||
|
|
||
| // Get namespace and api key hash from auth context | ||
| namespace := c.GetString("namespace") | ||
| if namespace == "" { | ||
| namespace = s.config.E2BDefaultNamespace | ||
| } | ||
| apiKeyHash := c.GetString("api_key_hash") | ||
|
|
||
| // Resolve template to name and kind | ||
| _, name, kind, err := ResolveTemplate(req.TemplateID, req.Metadata) | ||
| if err != nil { | ||
| respondWithError(c, ErrInvalidRequest, err.Error()) | ||
| return | ||
| } | ||
|
|
||
| klog.Infof("creating sandbox: template=%s, namespace=%s, kind=%s, timeout=%d", | ||
| req.TemplateID, namespace, kind, req.Timeout) | ||
|
|
||
| ctx := c.Request.Context() | ||
|
|
||
| // Call session manager to create/get sandbox | ||
| sandbox, err := s.sessionManager.GetSandboxBySession(ctx, "", namespace, name, kind) |
| // E2B API routes (templates, sandboxes) - registered at root level for E2B compatibility | ||
| // These routes handle their own authentication via API key middleware | ||
| e2bGroup := s.engine.Group("") | ||
| e2bGroup.Use(gin.Logger()) | ||
| e2bGroup.Use(gin.Recovery()) | ||
| // Apply concurrency limiting to E2B routes as well | ||
| e2bGroup.Use(s.concurrencyLimitMiddleware()) | ||
|
|
||
| // Initialize E2B server (registers /templates and /sandboxes routes) | ||
| _, err := e2b.NewServer(e2bGroup, s.storeClient, s.sessionManager) |
| // Initialize E2B server (registers /templates and /sandboxes routes) | ||
| _, err := e2b.NewServer(e2bGroup, s.storeClient, s.sessionManager) |
| # Job 5: Unit Tests (Go 1.23) - Multi-version testing | ||
| unit-tests-previous: | ||
| name: Unit Tests (Go 1.23) | ||
| runs-on: ubuntu-24.04 | ||
| steps: | ||
| - name: Checkout code | ||
| uses: actions/checkout@v4 | ||
|
|
||
| - name: Set up Go | ||
| uses: actions/setup-go@v5 | ||
| with: | ||
| go-version: ${{ env.GO_VERSION_PREVIOUS }} |
| # Run template-specific E2E tests with API key | ||
| export ROUTER_URL=http://localhost:8081 | ||
| export E2B_API_KEY=${{ env.E2B_API_KEY }} | ||
| go test -v ./test/e2e/... -run "Template" -timeout 10m |
| // Persist to store - try UpdateSandbox first, fall back to StoreSandbox | ||
| if err := s.storeClient.UpdateSandbox(ctx, sandbox); err != nil { | ||
| if err := s.storeClient.StoreSandbox(ctx, sandbox); err != nil { |
| // Delete sandbox by session ID | ||
| if err := s.storeClient.DeleteSandboxBySessionID(ctx, sandbox.SessionID); err != nil { | ||
| klog.Errorf("failed to delete sandbox: %v", err) |
- docs/design/e2b-api-architecture.md - docs/devguide/e2b-api-guide.md - README.md E2B API compatibility mention
…support Platform API implementation including: - Router E2B server, handlers, auth, rate limiter, templates, id generation - Store support for E2B sandbox lifecycle (redis/valkey) - Workloadmanager integration for sandbox creation with env vars propagation - Router session manager and JWT middleware updates - Helm chart RBAC and router deployment manifests - Code generation script fixes for lister-gen GroupResource
- Envd filesystem, process, environment, and models - Process registry for sandbox process management - PicoD server integration for Envd API endpoints - Dockerfile.picod updates
- apikey create with namespace resolution, validation, and SHA-256 hashing - apikey list with Secret/ConfigMap join - apikey revoke with prefix matching and idempotency - KubernetesProvider with bootstrap, patch, and namespace helpers - ApiKey models, metadata codec, and runtime utilities - Unit, CLI, and integration tests - Makefile targets for CLI testing
- test/e2b/ Go tests for Platform API (lifecycle, concurrent, api) - test/e2e/ Go and Python tests for Envd API and SDK - test/e2e/run_e2e.sh test runner with Kind cluster setup - example/e2b/ Python usage examples - .github/workflows/ for CI (e2b-api, templates-api-tests, code-quality) - hack/setup-kind-cluster.sh and hack/setup-local-env.sh
hzxuzhonghu
left a comment
There was a problem hiding this comment.
Thanks for this large feature! The overall structure (separate E2B port, Informer-based auth, envd compatibility in PicoD) is solid. A few issues worth addressing before merging — most critical is the store atomicity bug.
| if err = cmd.Err(); err != nil { | ||
| return fmt.Errorf("StoreSandbox: EXEC pipeline failed: %w, cmder index: %v", err, i) | ||
| } | ||
| if e2bIDCmd != nil && !e2bIDCmd.Val() { |
There was a problem hiding this comment.
Bug: partial write on E2B ID conflict. By this point the pipeline has already written: the session JSON (SetNX at line ~186), the expiry ZAdd, the last-activity ZAdd, and the API-key SAdd. Returning ErrIDConflict here leaves those entries orphaned — the session JSON stays in Redis forever (no TTL on it), and the ZSet/Set entries are stale.
Fix: if the e2b ID SetNX fails, clean up the session key and remove from both ZSets before returning. Alternatively, wrap the whole thing in a Lua script or use MULTI/EXEC+WATCH for true atomicity.
| sandbox.ExpiresAt = time.Now().Add(time.Duration(s.config.E2BDefaultTTL) * time.Second) | ||
| } | ||
|
|
||
| // Persist to store - try UpdateSandbox first, fall back to StoreSandbox |
There was a problem hiding this comment.
The "try UpdateSandbox first, fall back to StoreSandbox" pattern is confusing: UpdateSandbox on a non-existent key will always fail, so this is really always StoreSandbox on first call. Consider checking existence explicitly or just always calling StoreSandbox for new sandbox creation and UpdateSandbox only for updates. The three-level retry with ErrIDConflict adds cognitive overhead.
| apiKeys: make(map[string]*APIKeyCacheEntry), | ||
| k8sClient: k8sClient, | ||
| stopCh: make(chan struct{}), | ||
| rateLimiter: NewRateLimiter(1.0, 1), // 1 request per second, burst of 1 |
There was a problem hiding this comment.
Rate limiter is 1.0 rps, burst 1 — extremely tight. A single failed auth check consumes the entire per-second token, so any burst (e.g. parallel sandbox creates from one client) will see 429s. Consider at least 10 rps / burst 10 or make this configurable.
| a.mu.Lock() | ||
| defer a.mu.Unlock() | ||
|
|
||
| envKeys := os.Getenv("E2B_API_KEYS") |
There was a problem hiding this comment.
Security concern: E2B_API_KEYS stores raw (plaintext) API keys in the env var, but the K8s Secret path stores SHA-256 hashes. Env var path bypasses the hash protection — if the env is leaked the raw keys are exposed. Either hash them on load here too, or drop the env var fallback for production and keep it dev-only with a clear warning comment.
| E2BAPIKeyConfigMap: getEnvOrDefault("E2B_API_KEY_CONFIGMAP", "e2b-api-key-config"), | ||
| E2BDefaultTTL: defaultTTL(), | ||
| E2BDefaultNamespace: getEnvOrDefault("E2B_DEFAULT_NAMESPACE", "e2b-default"), | ||
| E2BSandboxDomain: getEnvOrDefault("E2B_SANDBOX_DOMAIN", "sb.e2b.app"), |
There was a problem hiding this comment.
Default E2BSandboxDomain is sb.e2b.app — that's the commercial E2B SaaS domain. This will silently generate sandbox access URLs pointing at an external third-party service for any user who doesn't override it. Change the default to something like "" (empty, require explicit config) or a generic placeholder.
| limitations under the License. | ||
| */ | ||
|
|
||
| package e2b |
There was a problem hiding this comment.
Missing build tag. test/e2e/*.go use //go:build e2e so they're excluded from go test ./..., but these test/e2b/*.go files have no tag and will be included. Add //go:build e2b (+ the old-style // +build e2b for Go 1.16 compat) if these need live infra, or confirm they compile/pass in CI without any setup.
/kind feature
What this PR does / why we need it:
This PR introduces full E2B API compatibility to AgentCube, enabling seamless migration from E2B services. It consists of two major parts:
sandbox lifecycle management, including creating, listing, getting details, deleting,
setting timeout, and refreshing sandboxes. Implements Kubernetes Informer-based API key
authentication with real-time cache updates, rate limiting, and local cache fallback.
for templates, template builds lifecycle management, aliases support, and public/private
visibility controls. Maps templates to AgentCube CRDs (CodeInterpreter/AgentRuntime).
Which issue(s) this PR fixes:
Fixes #257
Special notes for your reviewer:
fallback.
test/e2e/templates_test.go.
Does this PR introduce a user-facing change?:
Add E2B API compatibility layer and Templates API support to AgentCube Router, enabling E2B
SDK users to migrate seamlessly.