Skip to content

Commit fa7d99b

Browse files
committed
Merge origin/main into codex/snapshot-compression-defaults
2 parents 3b9264e + 75c3289 commit fa7d99b

60 files changed

Lines changed: 4477 additions & 412 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Makefile

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,9 @@ lib/system/init/init: lib/system/init/*.go
217217
@echo "Building init binary for Linux..."
218218
cd lib/system/init && CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o init .
219219

220-
build-embedded: lib/system/guest_agent/guest-agent lib/system/init/init
220+
build-embedded:
221+
@$(MAKE) -B lib/system/guest_agent/guest-agent
222+
@$(MAKE) -B lib/system/init/init
221223

222224
# Build the binary
223225
build:
@@ -305,15 +307,18 @@ test-guestmemory-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-ca
305307
@TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$$PATH"; \
306308
GUESTMEM_TIMEOUT="$${GUESTMEMORY_TEST_TIMEOUT:-15m}"; \
307309
echo "Running manual guest memory integration tests (CloudHypervisor, QEMU, Firecracker)"; \
308-
sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \
309-
go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicy(CloudHypervisor|QEMU|Firecracker)$$' -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances
310+
for TEST_NAME in TestGuestMemoryPolicyCloudHypervisor TestGuestMemoryPolicyQEMU TestGuestMemoryPolicyFirecracker; do \
311+
echo "Running $$TEST_NAME"; \
312+
sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \
313+
go test -count=1 -tags containers_image_openpgp -run="^$$TEST_NAME$$" -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances || exit $$?; \
314+
done
310315

311316
# Manual-only guest memory policy integration test (macOS VZ).
312317
test-guestmemory-vz: build-embedded sign-vz-shim
313318
@echo "Running manual guest memory integration test (VZ)"; \
314319
PATH="/opt/homebrew/opt/e2fsprogs/sbin:$(PATH)" \
315320
HYPEMAN_RUN_GUESTMEMORY_TESTS=1 \
316-
go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances
321+
go test -count=1 -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances
317322

318323
# Generate JWT token for testing
319324
# Usage: make gen-jwt [USER_ID=test-user]

cmd/api/api/api.go

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"github.com/kernel/hypeman/cmd/api/config"
55
"github.com/kernel/hypeman/lib/builds"
66
"github.com/kernel/hypeman/lib/devices"
7+
"github.com/kernel/hypeman/lib/guestmemory"
78
"github.com/kernel/hypeman/lib/images"
89
"github.com/kernel/hypeman/lib/ingress"
910
"github.com/kernel/hypeman/lib/instances"
@@ -16,16 +17,17 @@ import (
1617

1718
// ApiService implements the oapi.StrictServerInterface
1819
type ApiService struct {
19-
Config *config.Config
20-
ImageManager images.Manager
21-
InstanceManager instances.Manager
22-
VolumeManager volumes.Manager
23-
NetworkManager network.Manager
24-
DeviceManager devices.Manager
25-
IngressManager ingress.Manager
26-
BuildManager builds.Manager
27-
ResourceManager *resources.Manager
28-
VMMetricsManager *vm_metrics.Manager
20+
Config *config.Config
21+
ImageManager images.Manager
22+
InstanceManager instances.Manager
23+
VolumeManager volumes.Manager
24+
NetworkManager network.Manager
25+
DeviceManager devices.Manager
26+
IngressManager ingress.Manager
27+
BuildManager builds.Manager
28+
ResourceManager *resources.Manager
29+
GuestMemoryController guestmemory.Controller
30+
VMMetricsManager *vm_metrics.Manager
2931
}
3032

3133
var _ oapi.StrictServerInterface = (*ApiService)(nil)
@@ -41,18 +43,20 @@ func New(
4143
ingressManager ingress.Manager,
4244
buildManager builds.Manager,
4345
resourceManager *resources.Manager,
46+
guestMemoryController guestmemory.Controller,
4447
vmMetricsManager *vm_metrics.Manager,
4548
) *ApiService {
4649
return &ApiService{
47-
Config: config,
48-
ImageManager: imageManager,
49-
InstanceManager: instanceManager,
50-
VolumeManager: volumeManager,
51-
NetworkManager: networkManager,
52-
DeviceManager: deviceManager,
53-
IngressManager: ingressManager,
54-
BuildManager: buildManager,
55-
ResourceManager: resourceManager,
56-
VMMetricsManager: vmMetricsManager,
50+
Config: config,
51+
ImageManager: imageManager,
52+
InstanceManager: instanceManager,
53+
VolumeManager: volumeManager,
54+
NetworkManager: networkManager,
55+
DeviceManager: deviceManager,
56+
IngressManager: ingressManager,
57+
BuildManager: buildManager,
58+
ResourceManager: resourceManager,
59+
GuestMemoryController: guestMemoryController,
60+
VMMetricsManager: vmMetricsManager,
5761
}
5862
}

cmd/api/api/api_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,13 @@ func ctx() context.Context {
112112
return context.Background()
113113
}
114114

115+
// integrationTestTimeout returns the wait budget an integration test should
// use. When the CI environment variable is exactly "true", any timeout shorter
// than 45 seconds is raised to 45 seconds; in every other case the timeout is
// returned unchanged.
func integrationTestTimeout(timeout time.Duration) time.Duration {
	const ciFloor = 45 * time.Second

	runningInCI := os.Getenv("CI") == "true"
	if !runningInCI || timeout >= ciFloor {
		return timeout
	}
	return ciFloor
}
121+
115122
// ctxWithInstance creates a context with a resolved instance (simulates ResolveResource middleware)
116123
func ctxWithInstance(svc *ApiService, idOrName string) context.Context {
117124
inst, err := svc.InstanceManager.GetInstance(ctx(), idOrName)

cmd/api/api/cp_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func TestCpToAndFromInstance(t *testing.T) {
6767
// Wait for guest-agent to be ready
6868
t.Log("Waiting for guest-agent to start...")
6969
agentReady := false
70-
agentTimeout := time.After(15 * time.Second)
70+
agentTimeout := time.After(integrationTestTimeout(15 * time.Second))
7171
agentTicker := time.NewTicker(500 * time.Millisecond)
7272
defer agentTicker.Stop()
7373

@@ -207,7 +207,7 @@ func TestCpDirectoryToInstance(t *testing.T) {
207207
// Wait for guest-agent
208208
t.Log("Waiting for guest-agent...")
209209
agentReady := false
210-
agentTimeout := time.After(15 * time.Second)
210+
agentTimeout := time.After(integrationTestTimeout(15 * time.Second))
211211
agentTicker := time.NewTicker(500 * time.Millisecond)
212212
defer agentTicker.Stop()
213213

cmd/api/api/exec_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ func TestExecInstanceNonTTY(t *testing.T) {
6868
// Wait for nginx to be fully started (poll console logs)
6969
t.Log("Waiting for nginx to start...")
7070
nginxReady := false
71-
nginxTimeout := time.After(15 * time.Second)
71+
nginxTimeout := time.After(integrationTestTimeout(15 * time.Second))
7272
nginxTicker := time.NewTicker(500 * time.Millisecond)
7373
defer nginxTicker.Stop()
7474

@@ -227,7 +227,7 @@ func TestExecWithDebianMinimal(t *testing.T) {
227227
// This is the key difference: we wait for guest-agent, not the app (which exits immediately)
228228
t.Log("Waiting for guest-agent to start...")
229229
execAgentReady := false
230-
agentTimeout := time.After(15 * time.Second)
230+
agentTimeout := time.After(integrationTestTimeout(15 * time.Second))
231231
agentTicker := time.NewTicker(500 * time.Millisecond)
232232
defer agentTicker.Stop()
233233

cmd/api/api/resources_reclaim.go

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
package api
2+
3+
import (
4+
"context"
5+
"errors"
6+
"time"
7+
8+
"github.com/kernel/hypeman/lib/guestmemory"
9+
"github.com/kernel/hypeman/lib/logger"
10+
"github.com/kernel/hypeman/lib/oapi"
11+
"go.opentelemetry.io/otel"
12+
"go.opentelemetry.io/otel/attribute"
13+
"go.opentelemetry.io/otel/codes"
14+
"go.opentelemetry.io/otel/trace"
15+
)
16+
17+
const (
18+
defaultMemoryReclaimHold = 5 * time.Minute
19+
maxMemoryReclaimHold = 1 * time.Hour
20+
)
21+
22+
// ReclaimMemory triggers proactive guest memory reclaim via runtime ballooning.
23+
func (s *ApiService) ReclaimMemory(ctx context.Context, request oapi.ReclaimMemoryRequestObject) (oapi.ReclaimMemoryResponseObject, error) {
24+
log := logger.FromContext(ctx)
25+
if request.Body == nil {
26+
return oapi.ReclaimMemory400JSONResponse{
27+
Code: "bad_request",
28+
Message: "request body is required",
29+
}, nil
30+
}
31+
if s.GuestMemoryController == nil {
32+
return oapi.ReclaimMemory500JSONResponse{
33+
Code: "internal_error",
34+
Message: "guest memory controller not initialized",
35+
}, nil
36+
}
37+
38+
holdFor, err := parseMemoryReclaimHold(request.Body)
39+
if err != nil {
40+
return oapi.ReclaimMemory400JSONResponse{
41+
Code: "bad_request",
42+
Message: err.Error(),
43+
}, nil
44+
}
45+
46+
tracer := otel.Tracer("hypeman/guestmemory")
47+
ctx, span := tracer.Start(ctx, "guestmemory.manual_reclaim",
48+
traceAttrsForManualReclaim(request.Body.ReclaimBytes, holdFor, request.Body.DryRun != nil && *request.Body.DryRun, request.Body.Reason != nil))
49+
defer span.End()
50+
51+
log.InfoContext(ctx,
52+
"manual guest memory reclaim requested",
53+
"operation", "manual_reclaim",
54+
"requested_reclaim_bytes", request.Body.ReclaimBytes,
55+
"hold_for_seconds", holdFor.Seconds(),
56+
"dry_run", request.Body.DryRun != nil && *request.Body.DryRun,
57+
"reason_present", request.Body.Reason != nil,
58+
)
59+
60+
resp, err := s.GuestMemoryController.TriggerReclaim(ctx, guestmemory.ManualReclaimRequest{
61+
ReclaimBytes: request.Body.ReclaimBytes,
62+
HoldFor: holdFor,
63+
DryRun: request.Body.DryRun != nil && *request.Body.DryRun,
64+
Reason: derefString(request.Body.Reason),
65+
})
66+
if err != nil {
67+
switch {
68+
case errors.Is(err, guestmemory.ErrGuestMemoryDisabled), errors.Is(err, guestmemory.ErrActiveBallooningDisabled):
69+
span.RecordError(err)
70+
span.SetStatus(codes.Error, err.Error())
71+
log.WarnContext(ctx, "manual guest memory reclaim rejected", "operation", "manual_reclaim", "error", err)
72+
return oapi.ReclaimMemory400JSONResponse{
73+
Code: "feature_disabled",
74+
Message: err.Error(),
75+
}, nil
76+
default:
77+
span.RecordError(err)
78+
span.SetStatus(codes.Error, err.Error())
79+
log.ErrorContext(ctx, "manual guest memory reclaim failed", "operation", "manual_reclaim", "error", err)
80+
return oapi.ReclaimMemory500JSONResponse{
81+
Code: "internal_error",
82+
Message: err.Error(),
83+
}, nil
84+
}
85+
}
86+
87+
span.SetAttributes(
88+
attribute.Int64("planned_reclaim_bytes", resp.PlannedReclaimBytes),
89+
attribute.Int64("applied_reclaim_bytes", resp.AppliedReclaimBytes),
90+
attribute.Int64("host_available_bytes", resp.HostAvailableBytes),
91+
attribute.String("host_pressure_state", string(resp.HostPressureState)),
92+
attribute.Int("action_count", len(resp.Actions)),
93+
)
94+
span.SetStatus(codes.Ok, "")
95+
log.InfoContext(ctx,
96+
"manual guest memory reclaim completed",
97+
"operation", "manual_reclaim",
98+
"planned_reclaim_bytes", resp.PlannedReclaimBytes,
99+
"applied_reclaim_bytes", resp.AppliedReclaimBytes,
100+
"host_available_bytes", resp.HostAvailableBytes,
101+
"host_pressure_state", resp.HostPressureState,
102+
"action_count", len(resp.Actions),
103+
)
104+
105+
return oapi.ReclaimMemory200JSONResponse(memoryReclaimResponseToOAPI(resp)), nil
106+
}
107+
108+
func traceAttrsForManualReclaim(reclaimBytes int64, holdFor time.Duration, dryRun bool, reasonPresent bool) trace.SpanStartOption {
109+
return trace.WithAttributes(
110+
attribute.Int64("requested_reclaim_bytes", reclaimBytes),
111+
attribute.Float64("hold_for_seconds", holdFor.Seconds()),
112+
attribute.Bool("dry_run", dryRun),
113+
attribute.Bool("reason_present", reasonPresent),
114+
)
115+
}
116+
117+
func parseMemoryReclaimHold(req *oapi.MemoryReclaimRequest) (time.Duration, error) {
118+
if req == nil {
119+
return 0, nil
120+
}
121+
122+
if req.HoldFor == nil {
123+
if req.ReclaimBytes > 0 {
124+
return defaultMemoryReclaimHold, nil
125+
}
126+
return 0, nil
127+
}
128+
129+
holdFor, err := time.ParseDuration(*req.HoldFor)
130+
if err != nil {
131+
return 0, errors.New("hold_for must be a valid duration")
132+
}
133+
if holdFor < 0 {
134+
return 0, errors.New("hold_for must be non-negative")
135+
}
136+
if holdFor > maxMemoryReclaimHold {
137+
return 0, errors.New("hold_for must be less than or equal to 1h")
138+
}
139+
return holdFor, nil
140+
}
141+
142+
func memoryReclaimResponseToOAPI(resp guestmemory.ManualReclaimResponse) oapi.MemoryReclaimResponse {
143+
out := oapi.MemoryReclaimResponse{
144+
RequestedReclaimBytes: resp.RequestedReclaimBytes,
145+
PlannedReclaimBytes: resp.PlannedReclaimBytes,
146+
AppliedReclaimBytes: resp.AppliedReclaimBytes,
147+
HoldUntil: resp.HoldUntil,
148+
HostAvailableBytes: resp.HostAvailableBytes,
149+
HostPressureState: oapi.MemoryReclaimResponseHostPressureState(resp.HostPressureState),
150+
Actions: make([]oapi.MemoryReclaimAction, 0, len(resp.Actions)),
151+
}
152+
153+
for _, action := range resp.Actions {
154+
item := oapi.MemoryReclaimAction{
155+
InstanceId: action.InstanceID,
156+
InstanceName: action.InstanceName,
157+
Hypervisor: oapi.MemoryReclaimActionHypervisor(action.Hypervisor),
158+
AssignedMemoryBytes: action.AssignedMemoryBytes,
159+
ProtectedFloorBytes: action.ProtectedFloorBytes,
160+
PreviousTargetGuestMemoryBytes: action.PreviousTargetGuestMemoryBytes,
161+
PlannedTargetGuestMemoryBytes: action.PlannedTargetGuestMemoryBytes,
162+
TargetGuestMemoryBytes: action.TargetGuestMemoryBytes,
163+
AppliedReclaimBytes: action.AppliedReclaimBytes,
164+
Status: action.Status,
165+
}
166+
if action.Error != "" {
167+
item.Error = &action.Error
168+
}
169+
out.Actions = append(out.Actions, item)
170+
}
171+
172+
return out
173+
}
174+
175+
// derefString returns the string v points to, or the empty string when v is
// nil.
func derefString(v *string) string {
	var out string
	if v != nil {
		out = *v
	}
	return out
}

0 commit comments

Comments
 (0)