Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
4c186aa
feat: add active ballooning reclaim controller
sjmiller609 Mar 19, 2026
472e966
test: stabilize ci boot timing and config defaults
sjmiller609 Mar 19, 2026
0e242a3
test: relax exec-agent wait under ci
sjmiller609 Mar 19, 2026
25a7fae
Merge origin/main into codex/active-ballooning
sjmiller609 Mar 19, 2026
4c06747
auth: scope manual memory reclaim endpoint
sjmiller609 Mar 20, 2026
699f18c
test: make initrd latest updates race-safe
sjmiller609 Mar 20, 2026
7289e56
obs: instrument active ballooning
sjmiller609 Mar 20, 2026
4cfa79d
sdk: expose memory reclaim in stainless
sjmiller609 Mar 20, 2026
763669b
sdk: flatten reclaim method in stainless
sjmiller609 Mar 20, 2026
ac2f967
fix: address ballooning review feedback
sjmiller609 Mar 20, 2026
032e00c
fix: harden linux guest memory validation
sjmiller609 Mar 20, 2026
a25a3e0
fix: address remaining balloon review issues
sjmiller609 Mar 20, 2026
1525c46
fix: tighten active ballooning threshold handling
sjmiller609 Mar 20, 2026
a3996fb
fix: align balloon defaults and qemu qmp helpers
sjmiller609 Mar 20, 2026
08b5e11
test: relax concurrent exec timing in ci
sjmiller609 Mar 20, 2026
f4e5728
fix: address bugbot balloon findings
sjmiller609 Mar 20, 2026
9662fe2
fix: remove balloon plan zero sentinel
sjmiller609 Mar 20, 2026
400c9c7
Merge origin/main into codex/active-ballooning
sjmiller609 Mar 20, 2026
ce84f1b
fix: use MiB in example configs and remove dead TrimSuffix
sjmiller609 Mar 21, 2026
89b0320
fix: use raw byte values in example configs to match Go defaults
sjmiller609 Mar 21, 2026
010cec5
Address review feedback from masnwilliams
sjmiller609 Mar 23, 2026
29cac92
Fix gofmt: remove trailing blank lines
sjmiller609 Mar 23, 2026
6651859
Add resource:write scope for POST /resources/memory/reclaim
sjmiller609 Mar 23, 2026
5f806d8
fix: address active ballooning review feedback
sjmiller609 Mar 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,9 @@ lib/system/init/init: lib/system/init/*.go
@echo "Building init binary for Linux..."
cd lib/system/init && CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o init .

build-embedded: lib/system/guest_agent/guest-agent lib/system/init/init
build-embedded:
@$(MAKE) -B lib/system/guest_agent/guest-agent
@$(MAKE) -B lib/system/init/init

# Build the binary
build:
Expand Down Expand Up @@ -305,15 +307,18 @@ test-guestmemory-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-ca
@TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$$PATH"; \
GUESTMEM_TIMEOUT="$${GUESTMEMORY_TEST_TIMEOUT:-15m}"; \
echo "Running manual guest memory integration tests (CloudHypervisor, QEMU, Firecracker)"; \
sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \
go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicy(CloudHypervisor|QEMU|Firecracker)$$' -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances
for TEST_NAME in TestGuestMemoryPolicyCloudHypervisor TestGuestMemoryPolicyQEMU TestGuestMemoryPolicyFirecracker; do \
echo "Running $$TEST_NAME"; \
sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \
go test -count=1 -tags containers_image_openpgp -run="^$$TEST_NAME$$" -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances || exit $$?; \
done

# Manual-only guest memory policy integration test (macOS VZ).
test-guestmemory-vz: build-embedded sign-vz-shim
@echo "Running manual guest memory integration test (VZ)"; \
PATH="/opt/homebrew/opt/e2fsprogs/sbin:$(PATH)" \
HYPEMAN_RUN_GUESTMEMORY_TESTS=1 \
go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances
go test -count=1 -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances

# Generate JWT token for testing
# Usage: make gen-jwt [USER_ID=test-user]
Expand Down
44 changes: 24 additions & 20 deletions cmd/api/api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"github.com/kernel/hypeman/cmd/api/config"
"github.com/kernel/hypeman/lib/builds"
"github.com/kernel/hypeman/lib/devices"
"github.com/kernel/hypeman/lib/guestmemory"
"github.com/kernel/hypeman/lib/images"
"github.com/kernel/hypeman/lib/ingress"
"github.com/kernel/hypeman/lib/instances"
Expand All @@ -16,16 +17,17 @@ import (

// ApiService implements the oapi.StrictServerInterface
type ApiService struct {
Config *config.Config
ImageManager images.Manager
InstanceManager instances.Manager
VolumeManager volumes.Manager
NetworkManager network.Manager
DeviceManager devices.Manager
IngressManager ingress.Manager
BuildManager builds.Manager
ResourceManager *resources.Manager
VMMetricsManager *vm_metrics.Manager
Config *config.Config
ImageManager images.Manager
InstanceManager instances.Manager
VolumeManager volumes.Manager
NetworkManager network.Manager
DeviceManager devices.Manager
IngressManager ingress.Manager
BuildManager builds.Manager
ResourceManager *resources.Manager
GuestMemoryController guestmemory.Controller
VMMetricsManager *vm_metrics.Manager
}

var _ oapi.StrictServerInterface = (*ApiService)(nil)
Expand All @@ -41,18 +43,20 @@ func New(
ingressManager ingress.Manager,
buildManager builds.Manager,
resourceManager *resources.Manager,
guestMemoryController guestmemory.Controller,
vmMetricsManager *vm_metrics.Manager,
) *ApiService {
return &ApiService{
Config: config,
ImageManager: imageManager,
InstanceManager: instanceManager,
VolumeManager: volumeManager,
NetworkManager: networkManager,
DeviceManager: deviceManager,
IngressManager: ingressManager,
BuildManager: buildManager,
ResourceManager: resourceManager,
VMMetricsManager: vmMetricsManager,
Config: config,
ImageManager: imageManager,
InstanceManager: instanceManager,
VolumeManager: volumeManager,
NetworkManager: networkManager,
DeviceManager: deviceManager,
IngressManager: ingressManager,
BuildManager: buildManager,
ResourceManager: resourceManager,
GuestMemoryController: guestMemoryController,
VMMetricsManager: vmMetricsManager,
}
}
7 changes: 7 additions & 0 deletions cmd/api/api/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,13 @@ func ctx() context.Context {
return context.Background()
}

// integrationTestTimeout returns the timeout an integration test should wait.
//
// When the CI environment variable is exactly "true", timeouts shorter than
// 45 seconds are raised to 45 seconds; in every other case the requested
// timeout is returned unchanged.
func integrationTestTimeout(timeout time.Duration) time.Duration {
	const ciFloor = 45 * time.Second

	if os.Getenv("CI") != "true" {
		return timeout
	}
	if timeout >= ciFloor {
		return timeout
	}
	return ciFloor
}

// ctxWithInstance creates a context with a resolved instance (simulates ResolveResource middleware)
func ctxWithInstance(svc *ApiService, idOrName string) context.Context {
inst, err := svc.InstanceManager.GetInstance(ctx(), idOrName)
Expand Down
4 changes: 2 additions & 2 deletions cmd/api/api/cp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func TestCpToAndFromInstance(t *testing.T) {
// Wait for guest-agent to be ready
t.Log("Waiting for guest-agent to start...")
agentReady := false
agentTimeout := time.After(15 * time.Second)
agentTimeout := time.After(integrationTestTimeout(15 * time.Second))
agentTicker := time.NewTicker(500 * time.Millisecond)
defer agentTicker.Stop()

Expand Down Expand Up @@ -207,7 +207,7 @@ func TestCpDirectoryToInstance(t *testing.T) {
// Wait for guest-agent
t.Log("Waiting for guest-agent...")
agentReady := false
agentTimeout := time.After(15 * time.Second)
agentTimeout := time.After(integrationTestTimeout(15 * time.Second))
agentTicker := time.NewTicker(500 * time.Millisecond)
defer agentTicker.Stop()

Expand Down
4 changes: 2 additions & 2 deletions cmd/api/api/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func TestExecInstanceNonTTY(t *testing.T) {
// Wait for nginx to be fully started (poll console logs)
t.Log("Waiting for nginx to start...")
nginxReady := false
nginxTimeout := time.After(15 * time.Second)
nginxTimeout := time.After(integrationTestTimeout(15 * time.Second))
nginxTicker := time.NewTicker(500 * time.Millisecond)
defer nginxTicker.Stop()

Expand Down Expand Up @@ -227,7 +227,7 @@ func TestExecWithDebianMinimal(t *testing.T) {
// This is the key difference: we wait for guest-agent, not the app (which exits immediately)
t.Log("Waiting for guest-agent to start...")
execAgentReady := false
agentTimeout := time.After(15 * time.Second)
agentTimeout := time.After(integrationTestTimeout(15 * time.Second))
agentTicker := time.NewTicker(500 * time.Millisecond)
defer agentTicker.Stop()

Expand Down
180 changes: 180 additions & 0 deletions cmd/api/api/resources_reclaim.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
package api

import (
"context"
"errors"
"time"

"github.com/kernel/hypeman/lib/guestmemory"
"github.com/kernel/hypeman/lib/logger"
"github.com/kernel/hypeman/lib/oapi"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)

// Bounds applied to the hold_for field of a manual memory reclaim request.
const (
	// maxMemoryReclaimHold is the longest hold_for value a request may specify.
	maxMemoryReclaimHold = time.Hour

	// defaultMemoryReclaimHold is the hold applied when a request omits hold_for
	// but asks to reclaim a positive number of bytes.
	defaultMemoryReclaimHold = 5 * time.Minute
)

// ReclaimMemory handles a manual guest memory reclaim request by delegating to
// the configured guest memory controller.
//
// It answers 400 when the request body is missing, when hold_for fails
// validation, or when the controller reports that guest memory or active
// ballooning is disabled. It answers 500 when no controller is configured or
// when the controller fails for any other reason. The Go error result is always
// nil: every failure is expressed through a typed response object.
//
// Once the request has been validated, the call is wrapped in a
// "guestmemory.manual_reclaim" span and logged both on request and on outcome.
func (s *ApiService) ReclaimMemory(ctx context.Context, request oapi.ReclaimMemoryRequestObject) (oapi.ReclaimMemoryResponseObject, error) {
	reqLog := logger.FromContext(ctx)

	body := request.Body
	if body == nil {
		return oapi.ReclaimMemory400JSONResponse{
			Code:    "bad_request",
			Message: "request body is required",
		}, nil
	}

	controller := s.GuestMemoryController
	if controller == nil {
		return oapi.ReclaimMemory500JSONResponse{
			Code:    "internal_error",
			Message: "guest memory controller not initialized",
		}, nil
	}

	holdFor, holdErr := parseMemoryReclaimHold(body)
	if holdErr != nil {
		return oapi.ReclaimMemory400JSONResponse{
			Code:    "bad_request",
			Message: holdErr.Error(),
		}, nil
	}

	// Only the presence of a reason is traced and logged, never its text.
	dryRun := body.DryRun != nil && *body.DryRun
	reasonPresent := body.Reason != nil

	ctx, span := otel.Tracer("hypeman/guestmemory").Start(
		ctx,
		"guestmemory.manual_reclaim",
		traceAttrsForManualReclaim(body.ReclaimBytes, holdFor, dryRun, reasonPresent),
	)
	defer span.End()

	reqLog.InfoContext(ctx,
		"manual guest memory reclaim requested",
		"operation", "manual_reclaim",
		"requested_reclaim_bytes", body.ReclaimBytes,
		"hold_for_seconds", holdFor.Seconds(),
		"dry_run", dryRun,
		"reason_present", reasonPresent,
	)

	result, reclaimErr := controller.TriggerReclaim(ctx, guestmemory.ManualReclaimRequest{
		ReclaimBytes: body.ReclaimBytes,
		HoldFor:      holdFor,
		DryRun:       dryRun,
		Reason:       derefString(body.Reason),
	})
	if reclaimErr != nil {
		// A disabled feature is a caller-visible precondition failure (400);
		// anything else is treated as an internal failure (500).
		featureDisabled := errors.Is(reclaimErr, guestmemory.ErrGuestMemoryDisabled) ||
			errors.Is(reclaimErr, guestmemory.ErrActiveBallooningDisabled)

		span.RecordError(reclaimErr)
		span.SetStatus(codes.Error, reclaimErr.Error())

		if featureDisabled {
			reqLog.WarnContext(ctx, "manual guest memory reclaim rejected", "operation", "manual_reclaim", "error", reclaimErr)
			return oapi.ReclaimMemory400JSONResponse{
				Code:    "feature_disabled",
				Message: reclaimErr.Error(),
			}, nil
		}

		reqLog.ErrorContext(ctx, "manual guest memory reclaim failed", "operation", "manual_reclaim", "error", reclaimErr)
		return oapi.ReclaimMemory500JSONResponse{
			Code:    "internal_error",
			Message: reclaimErr.Error(),
		}, nil
	}

	actionCount := len(result.Actions)
	span.SetAttributes(
		attribute.Int64("planned_reclaim_bytes", result.PlannedReclaimBytes),
		attribute.Int64("applied_reclaim_bytes", result.AppliedReclaimBytes),
		attribute.Int64("host_available_bytes", result.HostAvailableBytes),
		attribute.String("host_pressure_state", string(result.HostPressureState)),
		attribute.Int("action_count", actionCount),
	)
	span.SetStatus(codes.Ok, "")
	reqLog.InfoContext(ctx,
		"manual guest memory reclaim completed",
		"operation", "manual_reclaim",
		"planned_reclaim_bytes", result.PlannedReclaimBytes,
		"applied_reclaim_bytes", result.AppliedReclaimBytes,
		"host_available_bytes", result.HostAvailableBytes,
		"host_pressure_state", result.HostPressureState,
		"action_count", actionCount,
	)

	return oapi.ReclaimMemory200JSONResponse(memoryReclaimResponseToOAPI(result)), nil
}

// traceAttrsForManualReclaim packages the parameters of a manual reclaim request
// as a span start option so the span carries them from creation.
//
// The hold duration is recorded in seconds, and the reason is recorded only as
// a presence flag.
func traceAttrsForManualReclaim(reclaimBytes int64, holdFor time.Duration, dryRun bool, reasonPresent bool) trace.SpanStartOption {
	attrs := []attribute.KeyValue{
		attribute.Int64("requested_reclaim_bytes", reclaimBytes),
		attribute.Float64("hold_for_seconds", holdFor.Seconds()),
		attribute.Bool("dry_run", dryRun),
		attribute.Bool("reason_present", reasonPresent),
	}
	return trace.WithAttributes(attrs...)
}

// parseMemoryReclaimHold resolves how long a manual reclaim should be held.
//
// A nil request yields a zero hold. When hold_for is omitted, the hold is
// defaultMemoryReclaimHold if ReclaimBytes is positive and zero otherwise.
// When hold_for is present it must parse with time.ParseDuration and lie in
// the range [0, maxMemoryReclaimHold]; otherwise an error describing the
// violated rule is returned.
func parseMemoryReclaimHold(req *oapi.MemoryReclaimRequest) (time.Duration, error) {
	switch {
	case req == nil:
		return 0, nil
	case req.HoldFor == nil && req.ReclaimBytes > 0:
		return defaultMemoryReclaimHold, nil
	case req.HoldFor == nil:
		return 0, nil
	}

	parsed, parseErr := time.ParseDuration(*req.HoldFor)
	switch {
	case parseErr != nil:
		return 0, errors.New("hold_for must be a valid duration")
	case parsed < 0:
		return 0, errors.New("hold_for must be non-negative")
	case parsed > maxMemoryReclaimHold:
		return 0, errors.New("hold_for must be less than or equal to 1h")
	}
	return parsed, nil
}

// memoryReclaimResponseToOAPI converts a controller reclaim result into its
// OpenAPI response shape, copying the summary fields and emitting one entry
// per action in the same order.
//
// Actions is always a non-nil slice (empty when there are no actions). An
// action's Error pointer is set only when the action carries a non-empty error
// message, and each pointer refers to its own copy of that message.
func memoryReclaimResponseToOAPI(resp guestmemory.ManualReclaimResponse) oapi.MemoryReclaimResponse {
	out := oapi.MemoryReclaimResponse{
		RequestedReclaimBytes: resp.RequestedReclaimBytes,
		PlannedReclaimBytes:   resp.PlannedReclaimBytes,
		AppliedReclaimBytes:   resp.AppliedReclaimBytes,
		HoldUntil:             resp.HoldUntil,
		HostAvailableBytes:    resp.HostAvailableBytes,
		HostPressureState:     oapi.MemoryReclaimResponseHostPressureState(resp.HostPressureState),
		Actions:               make([]oapi.MemoryReclaimAction, 0, len(resp.Actions)),
	}

	for _, action := range resp.Actions {
		item := oapi.MemoryReclaimAction{
			InstanceId:                     action.InstanceID,
			InstanceName:                   action.InstanceName,
			Hypervisor:                     oapi.MemoryReclaimActionHypervisor(action.Hypervisor),
			AssignedMemoryBytes:            action.AssignedMemoryBytes,
			ProtectedFloorBytes:            action.ProtectedFloorBytes,
			PreviousTargetGuestMemoryBytes: action.PreviousTargetGuestMemoryBytes,
			PlannedTargetGuestMemoryBytes:  action.PlannedTargetGuestMemoryBytes,
			TargetGuestMemoryBytes:         action.TargetGuestMemoryBytes,
			AppliedReclaimBytes:            action.AppliedReclaimBytes,
			Status:                         action.Status,
		}
		if action.Error != "" {
			// Point at a per-iteration copy rather than at a field of the range
			// variable. Before Go 1.22 the range variable is reused across
			// iterations, so every entry would otherwise end up sharing the
			// last action's message.
			errMsg := action.Error
			item.Error = &errMsg
		}
		out.Actions = append(out.Actions, item)
	}

	return out
}

// derefString returns the string v points to, or the empty string when v is nil.
func derefString(v *string) string {
	if v != nil {
		return *v
	}
	return ""
}
Loading
Loading