From 4c186aab37456671651912d8a679407b843db405 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 19 Mar 2026 17:12:27 -0400 Subject: [PATCH 01/22] feat: add active ballooning reclaim controller --- cmd/api/api/api.go | 44 +- cmd/api/api/resources_reclaim.go | 127 +++ cmd/api/api/resources_reclaim_test.go | 117 +++ cmd/api/config/config.go | 77 +- cmd/api/main.go | 8 + cmd/api/wire.go | 33 +- cmd/api/wire_gen.go | 69 +- cmd/vz-shim/server.go | 55 ++ config.example.darwin.yaml | 10 + config.example.yaml | 10 + lib/guestmemory/README.md | 214 +++-- lib/guestmemory/active_ballooning.go | 216 +++++ lib/guestmemory/controller.go | 237 ++++++ lib/guestmemory/controller_test.go | 146 ++++ lib/guestmemory/helpers.go | 8 + lib/guestmemory/planner.go | 124 +++ lib/guestmemory/pressure_darwin.go | 118 +++ lib/guestmemory/pressure_linux.go | 99 +++ .../cloudhypervisor/cloudhypervisor.go | 64 +- lib/hypervisor/firecracker/firecracker.go | 60 ++ lib/hypervisor/hypervisor.go | 13 + lib/hypervisor/qemu/qemu.go | 33 + lib/hypervisor/qemu/qmp.go | 28 + lib/hypervisor/vz/client.go | 26 + ...ory_active_ballooning_test_helpers_test.go | 131 +++ lib/instances/guestmemory_darwin_test.go | 28 + lib/instances/guestmemory_linux_test.go | 30 + lib/oapi/oapi.go | 753 +++++++++++++----- lib/providers/providers.go | 79 ++ openapi.yaml | 147 ++++ 30 files changed, 2762 insertions(+), 342 deletions(-) create mode 100644 cmd/api/api/resources_reclaim.go create mode 100644 cmd/api/api/resources_reclaim_test.go create mode 100644 lib/guestmemory/active_ballooning.go create mode 100644 lib/guestmemory/controller.go create mode 100644 lib/guestmemory/controller_test.go create mode 100644 lib/guestmemory/helpers.go create mode 100644 lib/guestmemory/planner.go create mode 100644 lib/guestmemory/pressure_darwin.go create mode 100644 lib/guestmemory/pressure_linux.go create mode 100644 lib/instances/guestmemory_active_ballooning_test_helpers_test.go diff --git a/cmd/api/api/api.go 
b/cmd/api/api/api.go index efb65eb1..47f828ed 100644 --- a/cmd/api/api/api.go +++ b/cmd/api/api/api.go @@ -4,6 +4,7 @@ import ( "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/builds" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/ingress" "github.com/kernel/hypeman/lib/instances" @@ -16,16 +17,17 @@ import ( // ApiService implements the oapi.StrictServerInterface type ApiService struct { - Config *config.Config - ImageManager images.Manager - InstanceManager instances.Manager - VolumeManager volumes.Manager - NetworkManager network.Manager - DeviceManager devices.Manager - IngressManager ingress.Manager - BuildManager builds.Manager - ResourceManager *resources.Manager - VMMetricsManager *vm_metrics.Manager + Config *config.Config + ImageManager images.Manager + InstanceManager instances.Manager + VolumeManager volumes.Manager + NetworkManager network.Manager + DeviceManager devices.Manager + IngressManager ingress.Manager + BuildManager builds.Manager + ResourceManager *resources.Manager + GuestMemoryController guestmemory.Controller + VMMetricsManager *vm_metrics.Manager } var _ oapi.StrictServerInterface = (*ApiService)(nil) @@ -41,18 +43,20 @@ func New( ingressManager ingress.Manager, buildManager builds.Manager, resourceManager *resources.Manager, + guestMemoryController guestmemory.Controller, vmMetricsManager *vm_metrics.Manager, ) *ApiService { return &ApiService{ - Config: config, - ImageManager: imageManager, - InstanceManager: instanceManager, - VolumeManager: volumeManager, - NetworkManager: networkManager, - DeviceManager: deviceManager, - IngressManager: ingressManager, - BuildManager: buildManager, - ResourceManager: resourceManager, - VMMetricsManager: vmMetricsManager, + Config: config, + ImageManager: imageManager, + InstanceManager: instanceManager, + VolumeManager: volumeManager, + NetworkManager: 
networkManager, + DeviceManager: deviceManager, + IngressManager: ingressManager, + BuildManager: buildManager, + ResourceManager: resourceManager, + GuestMemoryController: guestMemoryController, + VMMetricsManager: vmMetricsManager, } } diff --git a/cmd/api/api/resources_reclaim.go b/cmd/api/api/resources_reclaim.go new file mode 100644 index 00000000..12675e93 --- /dev/null +++ b/cmd/api/api/resources_reclaim.go @@ -0,0 +1,127 @@ +package api + +import ( + "context" + "errors" + "time" + + "github.com/kernel/hypeman/lib/guestmemory" + "github.com/kernel/hypeman/lib/oapi" +) + +const ( + defaultMemoryReclaimHold = 5 * time.Minute + maxMemoryReclaimHold = 1 * time.Hour +) + +// ReclaimMemory triggers proactive guest memory reclaim via runtime ballooning. +func (s *ApiService) ReclaimMemory(ctx context.Context, request oapi.ReclaimMemoryRequestObject) (oapi.ReclaimMemoryResponseObject, error) { + if request.Body == nil { + return oapi.ReclaimMemory400JSONResponse{ + Code: "bad_request", + Message: "request body is required", + }, nil + } + if s.GuestMemoryController == nil { + return oapi.ReclaimMemory500JSONResponse{ + Code: "internal_error", + Message: "guest memory controller not initialized", + }, nil + } + + holdFor, err := parseMemoryReclaimHold(request.Body) + if err != nil { + return oapi.ReclaimMemory400JSONResponse{ + Code: "bad_request", + Message: err.Error(), + }, nil + } + + resp, err := s.GuestMemoryController.TriggerReclaim(ctx, guestmemory.ManualReclaimRequest{ + ReclaimBytes: request.Body.ReclaimBytes, + HoldFor: holdFor, + DryRun: request.Body.DryRun != nil && *request.Body.DryRun, + Reason: derefString(request.Body.Reason), + }) + if err != nil { + switch { + case errors.Is(err, guestmemory.ErrGuestMemoryDisabled), errors.Is(err, guestmemory.ErrActiveBallooningDisabled): + return oapi.ReclaimMemory400JSONResponse{ + Code: "feature_disabled", + Message: err.Error(), + }, nil + default: + return oapi.ReclaimMemory500JSONResponse{ + Code: 
"internal_error", + Message: err.Error(), + }, nil + } + } + + return oapi.ReclaimMemory200JSONResponse(memoryReclaimResponseToOAPI(resp)), nil +} + +func parseMemoryReclaimHold(req *oapi.MemoryReclaimRequest) (time.Duration, error) { + if req == nil { + return 0, nil + } + + if req.HoldFor == nil { + if req.ReclaimBytes > 0 { + return defaultMemoryReclaimHold, nil + } + return 0, nil + } + + holdFor, err := time.ParseDuration(*req.HoldFor) + if err != nil { + return 0, errors.New("hold_for must be a valid duration") + } + if holdFor < 0 { + return 0, errors.New("hold_for must be non-negative") + } + if holdFor > maxMemoryReclaimHold { + return 0, errors.New("hold_for must be less than or equal to 1h") + } + return holdFor, nil +} + +func memoryReclaimResponseToOAPI(resp guestmemory.ManualReclaimResponse) oapi.MemoryReclaimResponse { + out := oapi.MemoryReclaimResponse{ + RequestedReclaimBytes: resp.RequestedReclaimBytes, + PlannedReclaimBytes: resp.PlannedReclaimBytes, + AppliedReclaimBytes: resp.AppliedReclaimBytes, + HoldUntil: resp.HoldUntil, + HostAvailableBytes: resp.HostAvailableBytes, + HostPressureState: oapi.MemoryReclaimResponseHostPressureState(resp.HostPressureState), + Actions: make([]oapi.MemoryReclaimAction, 0, len(resp.Actions)), + } + + for _, action := range resp.Actions { + item := oapi.MemoryReclaimAction{ + InstanceId: action.InstanceID, + InstanceName: action.InstanceName, + Hypervisor: oapi.MemoryReclaimActionHypervisor(action.Hypervisor), + AssignedMemoryBytes: action.AssignedMemoryBytes, + ProtectedFloorBytes: action.ProtectedFloorBytes, + PreviousTargetGuestMemoryBytes: action.PreviousTargetGuestMemoryBytes, + PlannedTargetGuestMemoryBytes: action.PlannedTargetGuestMemoryBytes, + TargetGuestMemoryBytes: action.TargetGuestMemoryBytes, + AppliedReclaimBytes: action.AppliedReclaimBytes, + Status: action.Status, + } + if action.Error != "" { + item.Error = &action.Error + } + out.Actions = append(out.Actions, item) + } + + return out +} + 
+func derefString(v *string) string { + if v == nil { + return "" + } + return *v +} diff --git a/cmd/api/api/resources_reclaim_test.go b/cmd/api/api/resources_reclaim_test.go new file mode 100644 index 00000000..f37ff091 --- /dev/null +++ b/cmd/api/api/resources_reclaim_test.go @@ -0,0 +1,117 @@ +package api + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/kernel/hypeman/lib/guestmemory" + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/oapi" + "github.com/stretchr/testify/require" +) + +type stubGuestMemoryController struct { + response guestmemory.ManualReclaimResponse + err error + requests []guestmemory.ManualReclaimRequest +} + +func (s *stubGuestMemoryController) Start(ctx context.Context) error { + <-ctx.Done() + return nil +} + +func (s *stubGuestMemoryController) TriggerReclaim(ctx context.Context, req guestmemory.ManualReclaimRequest) (guestmemory.ManualReclaimResponse, error) { + s.requests = append(s.requests, req) + return s.response, s.err +} + +func TestReclaimMemory_DefaultHoldAndResponse(t *testing.T) { + controller := &stubGuestMemoryController{ + response: guestmemory.ManualReclaimResponse{ + RequestedReclaimBytes: 512 * 1024 * 1024, + PlannedReclaimBytes: 512 * 1024 * 1024, + AppliedReclaimBytes: 256 * 1024 * 1024, + HostAvailableBytes: 2 * 1024 * 1024 * 1024, + HostPressureState: guestmemory.HostPressureStateHealthy, + Actions: []guestmemory.ManualReclaimAction{ + { + InstanceID: "inst-123", + InstanceName: "guestmem-test", + Hypervisor: hypervisor.TypeQEMU, + AssignedMemoryBytes: 4 * 1024 * 1024 * 1024, + ProtectedFloorBytes: 2 * 1024 * 1024 * 1024, + PreviousTargetGuestMemoryBytes: 4 * 1024 * 1024 * 1024, + PlannedTargetGuestMemoryBytes: 3 * 1024 * 1024 * 1024, + TargetGuestMemoryBytes: 3758096384, + AppliedReclaimBytes: 268435456, + Status: "applied", + }, + }, + }, + } + + svc := &ApiService{GuestMemoryController: controller} + resp, err := svc.ReclaimMemory(context.Background(), 
oapi.ReclaimMemoryRequestObject{ + Body: &oapi.MemoryReclaimRequest{ + ReclaimBytes: 512 * 1024 * 1024, + Reason: ptr("pack host before launch"), + }, + }) + require.NoError(t, err) + + okResp, ok := resp.(oapi.ReclaimMemory200JSONResponse) + require.True(t, ok) + require.Len(t, controller.requests, 1) + require.Equal(t, 5*time.Minute, controller.requests[0].HoldFor) + require.Equal(t, "pack host before launch", controller.requests[0].Reason) + require.Equal(t, int64(512*1024*1024), okResp.RequestedReclaimBytes) + require.Equal(t, oapi.MemoryReclaimResponseHostPressureState(guestmemory.HostPressureStateHealthy), okResp.HostPressureState) + require.Len(t, okResp.Actions, 1) + require.Equal(t, oapi.MemoryReclaimActionHypervisor(hypervisor.TypeQEMU), okResp.Actions[0].Hypervisor) +} + +func TestReclaimMemory_ValidationAndFeatureDisabled(t *testing.T) { + svc := &ApiService{GuestMemoryController: &stubGuestMemoryController{err: guestmemory.ErrActiveBallooningDisabled}} + resp, err := svc.ReclaimMemory(context.Background(), oapi.ReclaimMemoryRequestObject{ + Body: &oapi.MemoryReclaimRequest{ + ReclaimBytes: 256 * 1024 * 1024, + HoldFor: ptr("2h"), + }, + }) + require.NoError(t, err) + + badReq, ok := resp.(oapi.ReclaimMemory400JSONResponse) + require.True(t, ok) + require.Equal(t, "bad_request", badReq.Code) + + resp, err = svc.ReclaimMemory(context.Background(), oapi.ReclaimMemoryRequestObject{ + Body: &oapi.MemoryReclaimRequest{ + ReclaimBytes: 256 * 1024 * 1024, + HoldFor: ptr("10m"), + }, + }) + require.NoError(t, err) + + featureDisabled, ok := resp.(oapi.ReclaimMemory400JSONResponse) + require.True(t, ok) + require.Equal(t, "feature_disabled", featureDisabled.Code) +} + +func TestReclaimMemory_InternalError(t *testing.T) { + svc := &ApiService{GuestMemoryController: &stubGuestMemoryController{err: errors.New("boom")}} + resp, err := svc.ReclaimMemory(context.Background(), oapi.ReclaimMemoryRequestObject{ + Body: &oapi.MemoryReclaimRequest{ReclaimBytes: 128 * 1024 
* 1024}, + }) + require.NoError(t, err) + + internalErr, ok := resp.(oapi.ReclaimMemory500JSONResponse) + require.True(t, ok) + require.Equal(t, "internal_error", internalErr.Code) +} + +func ptr(v string) *string { + return &v +} diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 086df055..2b90a9b8 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/c2h5oh/datasize" "github.com/knadh/koanf/parsers/yaml" "github.com/knadh/koanf/providers/env" "github.com/knadh/koanf/providers/file" @@ -170,10 +171,24 @@ type HypervisorConfig struct { // HypervisorMemoryConfig holds guest memory management settings. type HypervisorMemoryConfig struct { - Enabled bool `koanf:"enabled"` - KernelPageInitMode string `koanf:"kernel_page_init_mode"` - ReclaimEnabled bool `koanf:"reclaim_enabled"` - VZBalloonRequired bool `koanf:"vz_balloon_required"` + Enabled bool `koanf:"enabled"` + KernelPageInitMode string `koanf:"kernel_page_init_mode"` + ReclaimEnabled bool `koanf:"reclaim_enabled"` + VZBalloonRequired bool `koanf:"vz_balloon_required"` + ActiveBallooning HypervisorActiveBallooningConfig `koanf:"active_ballooning"` +} + +// HypervisorActiveBallooningConfig holds runtime host-driven reclaim settings. +type HypervisorActiveBallooningConfig struct { + Enabled bool `koanf:"enabled"` + PollInterval string `koanf:"poll_interval"` + PressureHighWatermarkAvailablePercent int `koanf:"pressure_high_watermark_available_percent"` + PressureLowWatermarkAvailablePercent int `koanf:"pressure_low_watermark_available_percent"` + ProtectedFloorPercent int `koanf:"protected_floor_percent"` + ProtectedFloorMinBytes string `koanf:"protected_floor_min_bytes"` + MinAdjustmentBytes string `koanf:"min_adjustment_bytes"` + PerVmMaxStepBytes string `koanf:"per_vm_max_step_bytes"` + PerVmCooldown string `koanf:"per_vm_cooldown"` } // GPUConfig holds GPU-related settings. 
@@ -331,6 +346,17 @@ func defaultConfig() *Config { KernelPageInitMode: "hardened", ReclaimEnabled: true, VZBalloonRequired: true, + ActiveBallooning: HypervisorActiveBallooningConfig{ + Enabled: false, + PollInterval: "2s", + PressureHighWatermarkAvailablePercent: 10, + PressureLowWatermarkAvailablePercent: 15, + ProtectedFloorPercent: 50, + ProtectedFloorMinBytes: "512MiB", + MinAdjustmentBytes: "64MiB", + PerVmMaxStepBytes: "256MiB", + PerVmCooldown: "5s", + }, }, }, @@ -449,5 +475,48 @@ func (c *Config) Validate() error { if c.Hypervisor.Memory.KernelPageInitMode != "performance" && c.Hypervisor.Memory.KernelPageInitMode != "hardened" { return fmt.Errorf("hypervisor.memory.kernel_page_init_mode must be one of {performance,hardened}, got %q", c.Hypervisor.Memory.KernelPageInitMode) } + if c.Hypervisor.Memory.ActiveBallooning.PollInterval != "" { + if _, err := time.ParseDuration(c.Hypervisor.Memory.ActiveBallooning.PollInterval); err != nil { + return fmt.Errorf("hypervisor.memory.active_ballooning.poll_interval must be a valid duration, got %q: %w", c.Hypervisor.Memory.ActiveBallooning.PollInterval, err) + } + } + if c.Hypervisor.Memory.ActiveBallooning.PerVmCooldown != "" { + if _, err := time.ParseDuration(c.Hypervisor.Memory.ActiveBallooning.PerVmCooldown); err != nil { + return fmt.Errorf("hypervisor.memory.active_ballooning.per_vm_cooldown must be a valid duration, got %q: %w", c.Hypervisor.Memory.ActiveBallooning.PerVmCooldown, err) + } + } + if err := validateByteSize("hypervisor.memory.active_ballooning.protected_floor_min_bytes", c.Hypervisor.Memory.ActiveBallooning.ProtectedFloorMinBytes); err != nil { + return err + } + if err := validateByteSize("hypervisor.memory.active_ballooning.min_adjustment_bytes", c.Hypervisor.Memory.ActiveBallooning.MinAdjustmentBytes); err != nil { + return err + } + if err := validateByteSize("hypervisor.memory.active_ballooning.per_vm_max_step_bytes", c.Hypervisor.Memory.ActiveBallooning.PerVmMaxStepBytes); err != nil { + 
return err + } + ab := c.Hypervisor.Memory.ActiveBallooning + if ab.PressureHighWatermarkAvailablePercent <= 0 || ab.PressureHighWatermarkAvailablePercent >= 100 { + return fmt.Errorf("hypervisor.memory.active_ballooning.pressure_high_watermark_available_percent must be between 1 and 99, got %d", ab.PressureHighWatermarkAvailablePercent) + } + if ab.PressureLowWatermarkAvailablePercent <= 0 || ab.PressureLowWatermarkAvailablePercent >= 100 { + return fmt.Errorf("hypervisor.memory.active_ballooning.pressure_low_watermark_available_percent must be between 1 and 99, got %d", ab.PressureLowWatermarkAvailablePercent) + } + if ab.PressureLowWatermarkAvailablePercent <= ab.PressureHighWatermarkAvailablePercent { + return fmt.Errorf("hypervisor.memory.active_ballooning.pressure_low_watermark_available_percent must be greater than pressure_high_watermark_available_percent") + } + if ab.ProtectedFloorPercent <= 0 || ab.ProtectedFloorPercent >= 100 { + return fmt.Errorf("hypervisor.memory.active_ballooning.protected_floor_percent must be between 1 and 99, got %d", ab.ProtectedFloorPercent) + } + return nil +} + +func validateByteSize(field string, value string) error { + if strings.TrimSpace(value) == "" { + return fmt.Errorf("%s must not be empty", field) + } + var size datasize.ByteSize + if err := size.UnmarshalText([]byte(value)); err != nil { + return fmt.Errorf("%s must be a valid byte size, got %q: %w", field, value, err) + } return nil } diff --git a/cmd/api/main.go b/cmd/api/main.go index 5786c50a..3c527720 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -420,6 +420,14 @@ func run() error { return err } + grp.Go(func() error { + if app.GuestMemoryController == nil { + return nil + } + logger.Info("starting guest memory controller") + return app.GuestMemoryController.Start(gctx) + }) + // Run the server grp.Go(func() error { logger.Info("starting hypeman API", "port", app.Config.Port) diff --git a/cmd/api/wire.go b/cmd/api/wire.go index 93b307de..b50c27e2 100644 
--- a/cmd/api/wire.go +++ b/cmd/api/wire.go @@ -11,6 +11,7 @@ import ( "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/builds" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/ingress" "github.com/kernel/hypeman/lib/instances" @@ -25,21 +26,22 @@ import ( // application struct to hold initialized components type application struct { - Ctx context.Context - Logger *slog.Logger - Config *config.Config - ImageManager images.Manager - SystemManager system.Manager - NetworkManager network.Manager - DeviceManager devices.Manager - InstanceManager instances.Manager - VolumeManager volumes.Manager - IngressManager ingress.Manager - BuildManager builds.Manager - ResourceManager *resources.Manager - VMMetricsManager *vm_metrics.Manager - Registry *registry.Registry - ApiService *api.ApiService + Ctx context.Context + Logger *slog.Logger + Config *config.Config + ImageManager images.Manager + SystemManager system.Manager + NetworkManager network.Manager + DeviceManager devices.Manager + InstanceManager instances.Manager + VolumeManager volumes.Manager + IngressManager ingress.Manager + BuildManager builds.Manager + ResourceManager *resources.Manager + GuestMemoryController guestmemory.Controller + VMMetricsManager *vm_metrics.Manager + Registry *registry.Registry + ApiService *api.ApiService } // initializeApp is the injector function @@ -58,6 +60,7 @@ func initializeApp() (*application, func(), error) { providers.ProvideIngressManager, providers.ProvideBuildManager, providers.ProvideResourceManager, + providers.ProvideGuestMemoryController, providers.ProvideVMMetricsManager, providers.ProvideRegistry, api.New, diff --git a/cmd/api/wire_gen.go b/cmd/api/wire_gen.go index 6eb15ce6..a5007a7e 100644 --- a/cmd/api/wire_gen.go +++ b/cmd/api/wire_gen.go @@ -12,6 +12,7 @@ import ( "github.com/kernel/hypeman/cmd/api/config" 
"github.com/kernel/hypeman/lib/builds" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/ingress" "github.com/kernel/hypeman/lib/instances" @@ -64,6 +65,10 @@ func initializeApp() (*application, func(), error) { if err != nil { return nil, nil, err } + controller, err := providers.ProvideGuestMemoryController(instancesManager, config, logger) + if err != nil { + return nil, nil, err + } vm_metricsManager, err := providers.ProvideVMMetricsManager(instancesManager, config, logger) if err != nil { return nil, nil, err @@ -72,23 +77,24 @@ func initializeApp() (*application, func(), error) { if err != nil { return nil, nil, err } - apiService := api.New(config, manager, instancesManager, volumesManager, networkManager, devicesManager, ingressManager, buildsManager, resourcesManager, vm_metricsManager) + apiService := api.New(config, manager, instancesManager, volumesManager, networkManager, devicesManager, ingressManager, buildsManager, resourcesManager, controller, vm_metricsManager) mainApplication := &application{ - Ctx: context, - Logger: logger, - Config: config, - ImageManager: manager, - SystemManager: systemManager, - NetworkManager: networkManager, - DeviceManager: devicesManager, - InstanceManager: instancesManager, - VolumeManager: volumesManager, - IngressManager: ingressManager, - BuildManager: buildsManager, - ResourceManager: resourcesManager, - VMMetricsManager: vm_metricsManager, - Registry: registry, - ApiService: apiService, + Ctx: context, + Logger: logger, + Config: config, + ImageManager: manager, + SystemManager: systemManager, + NetworkManager: networkManager, + DeviceManager: devicesManager, + InstanceManager: instancesManager, + VolumeManager: volumesManager, + IngressManager: ingressManager, + BuildManager: buildsManager, + ResourceManager: resourcesManager, + GuestMemoryController: controller, + VMMetricsManager: vm_metricsManager, + 
Registry: registry, + ApiService: apiService, } return mainApplication, func() { }, nil @@ -98,19 +104,20 @@ func initializeApp() (*application, func(), error) { // application struct to hold initialized components type application struct { - Ctx context.Context - Logger *slog.Logger - Config *config.Config - ImageManager images.Manager - SystemManager system.Manager - NetworkManager network.Manager - DeviceManager devices.Manager - InstanceManager instances.Manager - VolumeManager volumes.Manager - IngressManager ingress.Manager - BuildManager builds.Manager - ResourceManager *resources.Manager - VMMetricsManager *vm_metrics.Manager - Registry *registry.Registry - ApiService *api.ApiService + Ctx context.Context + Logger *slog.Logger + Config *config.Config + ImageManager images.Manager + SystemManager system.Manager + NetworkManager network.Manager + DeviceManager devices.Manager + InstanceManager instances.Manager + VolumeManager volumes.Manager + IngressManager ingress.Manager + BuildManager builds.Manager + ResourceManager *resources.Manager + GuestMemoryController guestmemory.Controller + VMMetricsManager *vm_metrics.Manager + Registry *registry.Registry + ApiService *api.ApiService } diff --git a/cmd/vz-shim/server.go b/cmd/vz-shim/server.go index 9acf0095..74640b8e 100644 --- a/cmd/vz-shim/server.go +++ b/cmd/vz-shim/server.go @@ -45,6 +45,10 @@ type snapshotRequest struct { DestinationPath string `json:"destination_path"` } +type balloonRequest struct { + TargetGuestMemoryBytes int64 `json:"target_guest_memory_bytes"` +} + // Handler returns the HTTP handler for the control API. 
func (s *ShimServer) Handler() http.Handler { mux := http.NewServeMux() @@ -55,6 +59,8 @@ func (s *ShimServer) Handler() http.Handler { mux.HandleFunc("PUT /api/v1/vm.resume", s.handleResume) mux.HandleFunc("PUT /api/v1/vm.shutdown", s.handleShutdown) mux.HandleFunc("PUT /api/v1/vm.snapshot", s.handleSnapshot) + mux.HandleFunc("GET /api/v1/vm.balloon", s.handleGetBalloon) + mux.HandleFunc("PUT /api/v1/vm.balloon", s.handleSetBalloon) mux.HandleFunc("PUT /api/v1/vm.power-button", s.handlePowerButton) mux.HandleFunc("GET /api/v1/vmm.ping", s.handlePing) mux.HandleFunc("PUT /api/v1/vmm.shutdown", s.handleVMMShutdown) @@ -203,6 +209,46 @@ func (s *ShimServer) handleSnapshot(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNoContent) } +func (s *ShimServer) handleGetBalloon(w http.ResponseWriter, r *http.Request) { + s.mu.RLock() + defer s.mu.RUnlock() + + device, err := s.getTraditionalBalloonDevice() + if err != nil { + http.Error(w, err.Error(), http.StatusNotFound) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(balloonRequest{ + TargetGuestMemoryBytes: int64(device.GetTargetVirtualMachineMemorySize()), + }) +} + +func (s *ShimServer) handleSetBalloon(w http.ResponseWriter, r *http.Request) { + s.mu.Lock() + defer s.mu.Unlock() + + device, err := s.getTraditionalBalloonDevice() + if err != nil { + http.Error(w, err.Error(), http.StatusNotFound) + return + } + + var req balloonRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, fmt.Sprintf("invalid balloon request: %v", err), http.StatusBadRequest) + return + } + if req.TargetGuestMemoryBytes < 0 { + http.Error(w, "target_guest_memory_bytes must be non-negative", http.StatusBadRequest) + return + } + + device.SetTargetVirtualMachineMemorySize(uint64(req.TargetGuestMemoryBytes)) + w.WriteHeader(http.StatusNoContent) +} + func (s *ShimServer) handlePowerButton(w http.ResponseWriter, r *http.Request) { s.mu.Lock() defer 
s.mu.Unlock() @@ -219,6 +265,15 @@ func (s *ShimServer) handlePowerButton(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNoContent) } +func (s *ShimServer) getTraditionalBalloonDevice() (*vz.VirtioTraditionalMemoryBalloonDevice, error) { + for _, device := range s.vm.MemoryBalloonDevices() { + if traditional := vz.AsVirtioTraditionalMemoryBalloonDevice(device); traditional != nil { + return traditional, nil + } + } + return nil, fmt.Errorf("no memory balloon device configured") +} + func (s *ShimServer) handlePing(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) w.Write([]byte("OK")) diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index a67e5ab2..35d1e8e1 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -39,6 +39,16 @@ hypervisor: kernel_page_init_mode: hardened reclaim_enabled: true vz_balloon_required: true + active_ballooning: + enabled: false + poll_interval: 2s + pressure_high_watermark_available_percent: 10 + pressure_low_watermark_available_percent: 15 + protected_floor_percent: 50 + protected_floor_min_bytes: 512MiB + min_adjustment_bytes: 64MiB + per_vm_max_step_bytes: 256MiB + per_vm_cooldown: 5s # ============================================================================= # Network Configuration (DIFFERENT ON MACOS) diff --git a/config.example.yaml b/config.example.yaml index 2dff61c5..7af5fd5a 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -33,6 +33,16 @@ data_dir: /var/lib/hypeman # kernel_page_init_mode: hardened # reclaim_enabled: true # vz_balloon_required: true +# active_ballooning: +# enabled: false +# poll_interval: 2s +# pressure_high_watermark_available_percent: 10 +# pressure_low_watermark_available_percent: 15 +# protected_floor_percent: 50 +# protected_floor_min_bytes: 512MiB +# min_adjustment_bytes: 64MiB +# per_vm_max_step_bytes: 256MiB +# per_vm_cooldown: 5s # 
============================================================================= # Network Configuration diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md index e60e7d81..1f548470 100644 --- a/lib/guestmemory/README.md +++ b/lib/guestmemory/README.md @@ -1,102 +1,174 @@ -# Guest Memory Reclaim +# Guest Memory -This feature reduces host RAM waste from guest VMs by combining three behaviors: +Hypeman's guest-memory feature combines passive reclaim and active reclaim. -1. Lazy host allocation preservation: -The VM is configured with requested memory capacity, but host pages should only back guest pages as they are touched. +- Passive reclaim gives pages back to the host when the guest has already freed them. +- Active reclaim asks the guest to give memory back by inflating its virtio balloon target. +- Linux page-init tuning controls whether the guest eagerly scrubs pages on allocation/free. -2. Guest-to-host reclaim: -When the guest frees memory, virtio balloon/reporting/hinting features let the VMM return those pages to the host. +The important distinction is that active ballooning is not `drop_caches`. Balloon inflation makes the guest kernel feel memory pressure, so the guest reclaims memory through its normal LRU and reclaim paths. That lets the guest keep hot working-set cache and evict colder pages first. -3. Guest boot page-touch reduction: -The guest kernel page-init mode controls whether Linux eagerly touches pages: -- `performance` mode sets `init_on_alloc=0 init_on_free=0` for better density and lower memory churn. -- `hardened` mode sets `init_on_alloc=1 init_on_free=1` for stronger memory hygiene at some density/perf cost. 
+## What Happens At Runtime -## Configuration +When `hypervisor.memory.enabled=true`, Hypeman enables the guest-memory features each hypervisor supports: -This feature is controlled by `hypervisor.memory` in server config and is default-off: +- Cloud Hypervisor configures a balloon device with free-page reporting and deflate-on-oom. +- QEMU adds a virtio balloon device and enables free-page reporting when available. +- Firecracker configures ballooning with hinting/reporting and deflate-on-oom. +- VZ attaches a traditional memory balloon device through `vz-shim`. -```yaml -hypervisor: - memory: - enabled: false - kernel_page_init_mode: hardened - reclaim_enabled: true - vz_balloon_required: true -``` +When `kernel_page_init_mode=performance`, Hypeman also adds `init_on_alloc=0 init_on_free=0` to the guest kernel command line. That reduces unnecessary guest page touching during boot and steady-state reclaim. `hardened` keeps both flags enabled. + +## Automatic Active Ballooning -To enable reclaim behavior and density-oriented kernel args, set: +Automatic ballooning is controlled by `hypervisor.memory.active_ballooning`. ```yaml hypervisor: memory: enabled: true + reclaim_enabled: true kernel_page_init_mode: performance + active_ballooning: + enabled: true + poll_interval: 2s + pressure_high_watermark_available_percent: 10 + pressure_low_watermark_available_percent: 15 + protected_floor_percent: 50 + protected_floor_min_bytes: 512MiB + min_adjustment_bytes: 64MiB + per_vm_max_step_bytes: 256MiB + per_vm_cooldown: 5s ``` -## Runtime Flow +The automatic loop is pressure-driven by default: + +1. Hypeman samples host memory pressure. +2. If the host is under pressure, it computes a global reclaim target. +3. Eligible VMs are asked to give back memory proportionally to their reclaimable headroom. +4. Each hypervisor gets a new runtime balloon target. +5. When the host is healthy again, Hypeman gradually deflates balloons back toward full guest memory. 
+ +The controller uses hysteresis so it does not flap when available memory hovers near the threshold: + +- `pressure_high_watermark_available_percent` enters pressure mode when host available memory falls below it. +- `pressure_low_watermark_available_percent` exits pressure mode once host available memory rises back above it. + +### Host Pressure Signals + +Linux uses: + +- `/proc/meminfo` `MemAvailable` as the primary available-memory signal +- `/proc/pressure/memory` PSI as a secondary stress signal + +macOS uses: + +- `vm_stat` free/speculative pages to estimate available memory +- `memory_pressure -Q` as a secondary stress signal + +## Protected Floors And Allocation Rules + +Active reclaim never shrinks a guest below its protected floor: + +- `protected_floor_percent` reserves a percentage of assigned guest RAM +- `protected_floor_min_bytes` reserves an absolute minimum +- the larger of the two becomes the guest's floor + +Example: + +- a 4 GiB guest with `protected_floor_percent=50` has a 2 GiB floor +- if `protected_floor_min_bytes=512MiB`, the effective floor is still 2 GiB +- Hypeman can reclaim at most 2 GiB from that guest + +Reclaim is also rate-limited: + +- `min_adjustment_bytes` skips tiny target changes +- `per_vm_max_step_bytes` caps how much one reconcile can change a guest +- `per_vm_cooldown` prevents frequent small oscillations
+## Manual Reclaim API -## Backend Behavior Matrix +Hypeman also exposes a proactive reclaim endpoint: -| Hypervisor | Lazy allocation | Balloon | Free page reporting/hinting | Deflate on OOM | -|---|---|---|---|---| -| Cloud Hypervisor | Yes | Yes | Reporting | Yes | -| QEMU | Yes | Yes | Reporting (+ hinting when enabled) | Yes | -| Firecracker | Yes | Yes | Hinting + reporting | Yes | -| VZ | macOS-managed | Yes | Host-managed + guest cooperation | Host-managed | +- `POST /resources/memory/reclaim` + +Request fields: + +- `reclaim_bytes`: required total reclaim target across eligible guests +- `hold_for`: optional duration, default `5m`, max `1h` +- `dry_run`: optional, computes the plan without applying it +- `reason`: optional operator note for logs/traces + +Manual reclaim uses the same planner and protected floors as automatic reclaim. When `hold_for` is set, Hypeman keeps at least that much reclaim in place until the hold expires, even if host pressure clears sooner. Sending `reclaim_bytes=0` or `hold_for=0s` clears the hold and allows full deflation immediately. + +By design, Hypeman does not reclaim memory without a reason. Automatic reclaim only happens under real host pressure. Proactive reclaim without host pressure is only done when an operator explicitly asks for it through the API. + +## Passive Reclaim vs Active Ballooning + +Passive reclaim and active reclaim are complementary: + +- free-page reporting/hinting handles "the guest freed this already" +- active ballooning handles "the host needs memory back now" + +Both are useful. Passive reporting improves density opportunistically. Active ballooning gives Hypeman a control loop for pressure events and explicit operator requests. 
+ +## Hypervisor Expectations + +Cloud Hypervisor: + +- boot-time ballooning plus free-page reporting +- runtime target changes through `/vm.resize` + +QEMU: + +- virtio balloon device on the VM command line +- runtime target changes through QMP `balloon` + +Firecracker: + +- balloon config at boot with hinting/reporting +- runtime target changes through the balloon API +- if a custom or older binary lacks the runtime balloon endpoint, Hypeman skips active reclaim for that VM + +VZ: + +- traditional memory balloon device attached through `vz-shim` +- runtime target changes through `vz-shim` balloon endpoints ## Failure Behavior -- If policy is disabled, memory features are not applied. -- If reclaim is disabled, balloon/reporting/hinting are not applied. -- For VZ, balloon attachment is attempted when enabled. - - If `vz_balloon_required=true`, startup fails if balloon cannot be configured. - - If `vz_balloon_required=false`, startup continues without balloon and logs a warning. +- If `hypervisor.memory.enabled=false`, none of the guest-memory features are configured. +- If `reclaim_enabled=false`, passive reclaim and active ballooning are both disabled. +- If `active_ballooning.enabled=false`, the background pressure loop stays off and the manual reclaim endpoint returns a feature-disabled error. +- If a specific VM or hypervisor backend does not support runtime balloon control, Hypeman skips that VM and continues with the rest. +- `deflate_on_oom` stays enabled where supported so guests can recover memory quickly during real guest-side pressure. 
-## Quick CLI Experiment +## Manual Integration Tests -Use this A/B check to compare host memory footprint with policy enabled vs disabled: +The guest-memory integration tests are manual by default and cover one test per hypervisor: + +- Linux: `TestGuestMemoryPolicyCloudHypervisor` +- Linux: `TestGuestMemoryPolicyQEMU` +- Linux: `TestGuestMemoryPolicyFirecracker` +- macOS: `TestGuestMemoryPolicyVZ` + +All of them live in the existing `lib/instances` guest-memory test files and are gated by: ```bash -# 1) Start API with config A (hypervisor.memory.enabled=true), then run: -ID=$(hypeman run --hypervisor qemu --network=false --memory 1GB \ - --entrypoint /bin/sh --entrypoint -c \ - --cmd 'sleep 5; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=256; sleep 5; rm -f /dev/shm/hype-mem; sleep 90' \ - docker.io/library/alpine:latest | tail -n1) -PID=$(jq -r '.HypervisorPID' "/guests/$ID/metadata.json") -awk '/^Pss:/ {print $2 " kB"}' "/proc/$PID/smaps_rollup" # Linux (preferred) -awk '/^VmRSS:/ {print $2 " kB"}' "/proc/$PID/status" # Linux fallback -ps -o rss= -p "$PID" # macOS -hypeman rm --force "$ID" - -# 2) Restart API with config B (hypervisor.memory.enabled=false) and run the same command. -# 3) Compare final/steady host memory between A and B. +HYPEMAN_RUN_GUESTMEMORY_TESTS=1 ``` -In one startup-focused sample run, absolute host footprint stayed far below guest memory size (for example, ~4GB guest with low host PSS on Cloud Hypervisor/Firecracker), while QEMU showed a larger fixed process overhead. 
- -Sample probe results (4GB idle guest, rounded MB): +Run them with: -| Hypervisor | Host RSS (MB) | Host PSS (MB) | Notes | -|---|---:|---:|---| -| Cloud Hypervisor (Linux) | ~345 | ~29 | Low actual host pressure when idle | -| Firecracker (Linux) | ~295 | ~27 | Low actual host pressure when idle | -| QEMU (Linux) | ~400 | ~116 | Higher fixed process overhead | -| VZ (macOS) | ~23 | N/A | RSS sampled with `ps` | +```bash +make test-guestmemory-linux +make test-guestmemory-vz +``` -## Out of Scope +The tests verify: -- No API surface changes. -- No scheduler/admission logic changes. -- No automatic background tuning loops outside hypervisor-supported reclaim mechanisms. +- boot-time guest-memory configuration is present +- runtime balloon target starts at full assigned memory +- manual reclaim changes the target in the expected direction +- protected floors prevent over-reclaim +- clearing the manual hold deflates back to full guest memory diff --git a/lib/guestmemory/active_ballooning.go b/lib/guestmemory/active_ballooning.go new file mode 100644 index 00000000..d63c8ec6 --- /dev/null +++ b/lib/guestmemory/active_ballooning.go @@ -0,0 +1,216 @@ +package guestmemory + +import ( + "context" + "errors" + "log/slog" + "time" + + "github.com/kernel/hypeman/lib/hypervisor" +) + +var ( + ErrActiveBallooningDisabled = errors.New("active ballooning is disabled") + ErrGuestMemoryDisabled = errors.New("guest memory reclaim is disabled") +) + +// ActiveBallooningConfig controls host-driven balloon reclaim behavior. +type ActiveBallooningConfig struct { + Enabled bool + + PollInterval time.Duration + + PressureHighWatermarkAvailablePercent int + PressureLowWatermarkAvailablePercent int + + ProtectedFloorPercent int + ProtectedFloorMinBytes int64 + + MinAdjustmentBytes int64 + PerVMMaxStepBytes int64 + PerVMCooldown time.Duration +} + +// DefaultActiveBallooningConfig returns conservative defaults for active reclaim. 
+func DefaultActiveBallooningConfig() ActiveBallooningConfig { + return ActiveBallooningConfig{ + Enabled: false, + PollInterval: 2 * time.Second, + PressureHighWatermarkAvailablePercent: 10, + PressureLowWatermarkAvailablePercent: 15, + ProtectedFloorPercent: 50, + ProtectedFloorMinBytes: 512 * 1024 * 1024, + MinAdjustmentBytes: 64 * 1024 * 1024, + PerVMMaxStepBytes: 256 * 1024 * 1024, + PerVMCooldown: 5 * time.Second, + } +} + +// Normalize applies defaults and clamps invalid values. +func (c ActiveBallooningConfig) Normalize() ActiveBallooningConfig { + d := DefaultActiveBallooningConfig() + + if c.PollInterval <= 0 { + c.PollInterval = d.PollInterval + } + if c.PressureHighWatermarkAvailablePercent <= 0 || c.PressureHighWatermarkAvailablePercent >= 100 { + c.PressureHighWatermarkAvailablePercent = d.PressureHighWatermarkAvailablePercent + } + if c.PressureLowWatermarkAvailablePercent <= 0 || c.PressureLowWatermarkAvailablePercent >= 100 { + c.PressureLowWatermarkAvailablePercent = d.PressureLowWatermarkAvailablePercent + } + if c.PressureLowWatermarkAvailablePercent <= c.PressureHighWatermarkAvailablePercent { + c.PressureLowWatermarkAvailablePercent = d.PressureLowWatermarkAvailablePercent + if c.PressureLowWatermarkAvailablePercent <= c.PressureHighWatermarkAvailablePercent { + c.PressureHighWatermarkAvailablePercent = d.PressureHighWatermarkAvailablePercent + } + } + if c.ProtectedFloorPercent <= 0 || c.ProtectedFloorPercent >= 100 { + c.ProtectedFloorPercent = d.ProtectedFloorPercent + } + if c.ProtectedFloorMinBytes <= 0 { + c.ProtectedFloorMinBytes = d.ProtectedFloorMinBytes + } + if c.MinAdjustmentBytes <= 0 { + c.MinAdjustmentBytes = d.MinAdjustmentBytes + } + if c.PerVMMaxStepBytes <= 0 { + c.PerVMMaxStepBytes = d.PerVMMaxStepBytes + } + if c.PerVMCooldown <= 0 { + c.PerVMCooldown = d.PerVMCooldown + } + + return c +} + +// BalloonVM describes a running VM that may participate in reclaim. 
+type BalloonVM struct { + ID string + Name string + HypervisorType hypervisor.Type + SocketPath string + AssignedMemoryBytes int64 +} + +// Source lists reclaim-eligible VMs. +type Source interface { + ListBalloonVMs(ctx context.Context) ([]BalloonVM, error) +} + +// Controller coordinates automatic and manual reclaim. +type Controller interface { + Start(ctx context.Context) error + TriggerReclaim(ctx context.Context, req ManualReclaimRequest) (ManualReclaimResponse, error) +} + +// ManualReclaimRequest triggers a proactive reclaim cycle. +type ManualReclaimRequest struct { + ReclaimBytes int64 + HoldFor time.Duration + DryRun bool + Reason string +} + +// HostPressureState summarizes host memory pressure. +type HostPressureState string + +const ( + HostPressureStateHealthy HostPressureState = "healthy" + HostPressureStatePressure HostPressureState = "pressure" +) + +// ManualReclaimAction captures one VM's reclaim plan/result. +type ManualReclaimAction struct { + InstanceID string + InstanceName string + Hypervisor hypervisor.Type + AssignedMemoryBytes int64 + ProtectedFloorBytes int64 + PreviousTargetGuestMemoryBytes int64 + PlannedTargetGuestMemoryBytes int64 + TargetGuestMemoryBytes int64 + AppliedReclaimBytes int64 + Status string + Error string +} + +// ManualReclaimResponse summarizes the last reconcile result. +type ManualReclaimResponse struct { + RequestedReclaimBytes int64 + PlannedReclaimBytes int64 + AppliedReclaimBytes int64 + HoldUntil *time.Time + HostAvailableBytes int64 + HostPressureState HostPressureState + Actions []ManualReclaimAction +} + +// HostPressureSample captures the host memory snapshot used for reclaim decisions. +type HostPressureSample struct { + TotalBytes int64 + AvailableBytes int64 + AvailablePercent float64 + Stressed bool +} + +// PressureSampler provides host memory pressure samples. 
+type PressureSampler interface { + Sample(ctx context.Context) (HostPressureSample, error) +} + +type controller struct { + policy Policy + config ActiveBallooningConfig + source Source + sampler PressureSampler + log *slog.Logger + + reconcileMu syncState +} + +type syncState struct { + mu chan struct{} + pressureState HostPressureState + manualHold *manualHold + lastApplied map[string]time.Time + newClient func(hypervisor.Type, string) (hypervisor.Hypervisor, error) +} + +type manualHold struct { + reclaimBytes int64 + until time.Time +} + +// NewController creates the active ballooning controller. +func NewController(policy Policy, cfg ActiveBallooningConfig, source Source, log *slog.Logger) Controller { + return NewControllerWithSampler(policy, cfg, source, newHostPressureSampler(), log) +} + +// NewControllerWithSampler creates the active ballooning controller with an injected +// host pressure sampler. This is primarily useful for tests that need deterministic +// reclaim behavior. 
+func NewControllerWithSampler(policy Policy, cfg ActiveBallooningConfig, source Source, sampler PressureSampler, log *slog.Logger) Controller { + if log == nil { + log = slog.Default() + } + if sampler == nil { + sampler = newHostPressureSampler() + } + + c := &controller{ + policy: policy.Normalize(), + config: cfg.Normalize(), + source: source, + sampler: sampler, + log: log, + reconcileMu: syncState{ + mu: make(chan struct{}, 1), + pressureState: HostPressureStateHealthy, + lastApplied: make(map[string]time.Time), + newClient: hypervisor.NewClient, + }, + } + c.reconcileMu.mu <- struct{}{} + return c +} diff --git a/lib/guestmemory/controller.go b/lib/guestmemory/controller.go new file mode 100644 index 00000000..6dcdd31f --- /dev/null +++ b/lib/guestmemory/controller.go @@ -0,0 +1,237 @@ +package guestmemory + +import ( + "context" + "fmt" + "time" +) + +func (c *controller) Start(ctx context.Context) error { + if !c.policy.Enabled || !c.policy.ReclaimEnabled { + <-ctx.Done() + return nil + } + if !c.config.Enabled { + <-ctx.Done() + return nil + } + + ticker := time.NewTicker(c.config.PollInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + if _, err := c.reconcile(ctx, reconcileRequest{}); err != nil { + c.log.WarnContext(ctx, "active ballooning reconcile failed", "error", err) + } + } + } +} + +func (c *controller) TriggerReclaim(ctx context.Context, req ManualReclaimRequest) (ManualReclaimResponse, error) { + if !c.policy.Enabled || !c.policy.ReclaimEnabled { + return ManualReclaimResponse{}, ErrGuestMemoryDisabled + } + if !c.config.Enabled { + return ManualReclaimResponse{}, ErrActiveBallooningDisabled + } + if req.ReclaimBytes < 0 { + return ManualReclaimResponse{}, fmt.Errorf("reclaim_bytes must be non-negative") + } + return c.reconcile(ctx, reconcileRequest{ + force: true, + dryRun: req.DryRun, + requestedReclaim: req.ReclaimBytes, + holdFor: req.HoldFor, + reason: req.Reason, + }) +} + +type 
reconcileRequest struct { + force bool + dryRun bool + requestedReclaim int64 + holdFor time.Duration + reason string +} + +func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (ManualReclaimResponse, error) { + state := &c.reconcileMu + <-state.mu + defer func() { state.mu <- struct{}{} }() + + now := time.Now() + sample, err := c.sampler.Sample(ctx) + if err != nil { + return ManualReclaimResponse{}, err + } + + if state.manualHold != nil && !state.manualHold.until.IsZero() && now.After(state.manualHold.until) { + state.manualHold = nil + } + + if req.force && !req.dryRun { + switch { + case req.requestedReclaim <= 0 || req.holdFor <= 0: + state.manualHold = nil + default: + state.manualHold = &manualHold{ + reclaimBytes: req.requestedReclaim, + until: now.Add(req.holdFor), + } + } + } + + vms, err := c.source.ListBalloonVMs(ctx) + if err != nil { + return ManualReclaimResponse{}, err + } + + candidates := make([]candidateState, 0, len(vms)) + actions := make([]ManualReclaimAction, 0, len(vms)) + var currentTotalReclaim int64 + for _, vm := range vms { + hv, err := state.newClient(vm.HypervisorType, vm.SocketPath) + if err != nil { + actions = append(actions, skippedAction(vm, "error", fmt.Sprintf("create hypervisor client: %v", err))) + continue + } + if !hv.Capabilities().SupportsBalloonControl { + actions = append(actions, skippedAction(vm, "unsupported", "runtime balloon control is not supported")) + continue + } + + currentTarget, err := hv.GetTargetGuestMemoryBytes(ctx) + if err != nil { + actions = append(actions, skippedAction(vm, "error", fmt.Sprintf("read balloon target: %v", err))) + continue + } + + currentTarget = clampInt64(currentTarget, 0, vm.AssignedMemoryBytes) + protectedFloor := protectedFloorBytes(c.config, vm.AssignedMemoryBytes) + if protectedFloor > vm.AssignedMemoryBytes { + protectedFloor = vm.AssignedMemoryBytes + } + + currentReclaim := vm.AssignedMemoryBytes - currentTarget + if currentReclaim < 0 { + 
currentReclaim = 0 + } + currentTotalReclaim += currentReclaim + + candidates = append(candidates, candidateState{ + vm: vm, + hv: hv, + currentTargetGuestBytes: currentTarget, + protectedFloorBytes: protectedFloor, + maxReclaimBytes: maxInt64(0, vm.AssignedMemoryBytes-protectedFloor), + }) + } + + state.pressureState = nextPressureState(state.pressureState, c.config, sample) + autoTarget := automaticTargetBytes(state.pressureState, c.config, sample, currentTotalReclaim) + + manualTarget := int64(0) + if req.dryRun { + manualTarget = req.requestedReclaim + } else if state.manualHold != nil { + manualTarget = state.manualHold.reclaimBytes + } + totalTarget := maxInt64(autoTarget, manualTarget) + + plannedTargets := planGuestTargets(c.config, candidates, totalTarget) + + resp := ManualReclaimResponse{ + RequestedReclaimBytes: req.requestedReclaim, + HoldUntil: holdUntil(state.manualHold), + HostAvailableBytes: sample.AvailableBytes, + HostPressureState: state.pressureState, + Actions: make([]ManualReclaimAction, 0, len(actions)+len(candidates)), + } + resp.Actions = append(resp.Actions, actions...) 
+ + for _, candidate := range candidates { + plannedTarget := plannedTargets[candidate.vm.ID] + if plannedTarget == 0 { + plannedTarget = candidate.vm.AssignedMemoryBytes + } + + appliedTarget := plannedTarget + delta := plannedTarget - candidate.currentTargetGuestBytes + if absInt64(delta) < c.config.MinAdjustmentBytes { + appliedTarget = candidate.currentTargetGuestBytes + } + if !req.force { + if lastAppliedAt, ok := state.lastApplied[candidate.vm.ID]; ok && now.Sub(lastAppliedAt) < c.config.PerVMCooldown { + appliedTarget = candidate.currentTargetGuestBytes + } + } + if appliedTarget != candidate.currentTargetGuestBytes { + if delta > 0 { + appliedTarget = candidate.currentTargetGuestBytes + minInt64(delta, c.config.PerVMMaxStepBytes) + } else { + appliedTarget = candidate.currentTargetGuestBytes - minInt64(-delta, c.config.PerVMMaxStepBytes) + } + } + + appliedTarget = clampInt64(appliedTarget, candidate.protectedFloorBytes, candidate.vm.AssignedMemoryBytes) + plannedTarget = clampInt64(plannedTarget, candidate.protectedFloorBytes, candidate.vm.AssignedMemoryBytes) + + action := ManualReclaimAction{ + InstanceID: candidate.vm.ID, + InstanceName: candidate.vm.Name, + Hypervisor: candidate.vm.HypervisorType, + AssignedMemoryBytes: candidate.vm.AssignedMemoryBytes, + ProtectedFloorBytes: candidate.protectedFloorBytes, + PreviousTargetGuestMemoryBytes: candidate.currentTargetGuestBytes, + PlannedTargetGuestMemoryBytes: plannedTarget, + TargetGuestMemoryBytes: candidate.currentTargetGuestBytes, + Status: "unchanged", + } + + resp.PlannedReclaimBytes += candidate.vm.AssignedMemoryBytes - plannedTarget + + if !req.dryRun && appliedTarget != candidate.currentTargetGuestBytes { + if err := candidate.hv.SetTargetGuestMemoryBytes(ctx, appliedTarget); err != nil { + action.Status = "error" + action.Error = err.Error() + resp.Actions = append(resp.Actions, action) + continue + } + state.lastApplied[candidate.vm.ID] = now + action.Status = "applied" + 
action.TargetGuestMemoryBytes = appliedTarget + } + if req.dryRun && appliedTarget != candidate.currentTargetGuestBytes { + action.Status = "planned" + action.TargetGuestMemoryBytes = appliedTarget + } + action.AppliedReclaimBytes = candidate.vm.AssignedMemoryBytes - action.TargetGuestMemoryBytes + resp.AppliedReclaimBytes += action.AppliedReclaimBytes + resp.Actions = append(resp.Actions, action) + } + + return resp, nil +} + +func holdUntil(hold *manualHold) *time.Time { + if hold == nil || hold.until.IsZero() { + return nil + } + until := hold.until + return &until +} + +func skippedAction(vm BalloonVM, status, err string) ManualReclaimAction { + return ManualReclaimAction{ + InstanceID: vm.ID, + InstanceName: vm.Name, + Hypervisor: vm.HypervisorType, + AssignedMemoryBytes: vm.AssignedMemoryBytes, + Status: status, + Error: err, + } +} diff --git a/lib/guestmemory/controller_test.go b/lib/guestmemory/controller_test.go new file mode 100644 index 00000000..4aaa2659 --- /dev/null +++ b/lib/guestmemory/controller_test.go @@ -0,0 +1,146 @@ +package guestmemory + +import ( + "context" + "errors" + "io" + "log/slog" + "testing" + "time" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type stubSource struct { + vms []BalloonVM + err error +} + +func (s *stubSource) ListBalloonVMs(ctx context.Context) ([]BalloonVM, error) { + _ = ctx + if s.err != nil { + return nil, s.err + } + return s.vms, nil +} + +type stubSampler struct { + sample HostPressureSample + err error +} + +func (s *stubSampler) Sample(ctx context.Context) (HostPressureSample, error) { + _ = ctx + return s.sample, s.err +} + +type stubHypervisor struct { + target int64 + capabilities hypervisor.Capabilities + setErr error +} + +func (s *stubHypervisor) DeleteVM(ctx context.Context) error { return nil } +func (s *stubHypervisor) Shutdown(ctx context.Context) error { return nil } +func (s *stubHypervisor) GetVMInfo(ctx 
context.Context) (*hypervisor.VMInfo, error) { + return &hypervisor.VMInfo{State: hypervisor.StateRunning}, nil +} +func (s *stubHypervisor) Pause(ctx context.Context) error { return nil } +func (s *stubHypervisor) Resume(ctx context.Context) error { return nil } +func (s *stubHypervisor) Snapshot(ctx context.Context, destPath string) error { return nil } +func (s *stubHypervisor) ResizeMemory(ctx context.Context, bytes int64) error { return nil } +func (s *stubHypervisor) ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error { + return nil +} +func (s *stubHypervisor) Capabilities() hypervisor.Capabilities { return s.capabilities } +func (s *stubHypervisor) SetTargetGuestMemoryBytes(ctx context.Context, bytes int64) error { + _ = ctx + if s.setErr != nil { + return s.setErr + } + s.target = bytes + return nil +} +func (s *stubHypervisor) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { + _ = ctx + return s.target, nil +} + +func TestTriggerReclaimDistributesProportionally(t *testing.T) { + const mib = int64(1024 * 1024) + src := &stubSource{ + vms: []BalloonVM{ + {ID: "a", Name: "a", HypervisorType: hypervisor.TypeCloudHypervisor, SocketPath: "a", AssignedMemoryBytes: 1024 * mib}, + {ID: "b", Name: "b", HypervisorType: hypervisor.TypeCloudHypervisor, SocketPath: "b", AssignedMemoryBytes: 2048 * mib}, + }, + } + hvA := &stubHypervisor{target: 1024 * mib, capabilities: hypervisor.Capabilities{SupportsBalloonControl: true}} + hvB := &stubHypervisor{target: 2048 * mib, capabilities: hypervisor.Capabilities{SupportsBalloonControl: true}} + + c := NewController(Policy{Enabled: true, ReclaimEnabled: true}, ActiveBallooningConfig{ + Enabled: true, + ProtectedFloorPercent: 50, + ProtectedFloorMinBytes: 0, + MinAdjustmentBytes: 1, + PerVMMaxStepBytes: 4096 * mib, + PerVMCooldown: time.Second, + }, src, slog.New(slog.NewTextHandler(io.Discard, nil))).(*controller) + c.sampler = &stubSampler{sample: HostPressureSample{TotalBytes: 
4096 * mib, AvailableBytes: 4096 * mib, AvailablePercent: 100}} + c.reconcileMu.newClient = func(t hypervisor.Type, socket string) (hypervisor.Hypervisor, error) { + switch socket { + case "a": + return hvA, nil + case "b": + return hvB, nil + default: + return nil, errors.New("unknown") + } + } + + resp, err := c.TriggerReclaim(context.Background(), ManualReclaimRequest{ReclaimBytes: 768 * mib, HoldFor: time.Minute}) + require.NoError(t, err) + assert.Equal(t, int64(768*mib), resp.PlannedReclaimBytes) + assert.Equal(t, int64(768*mib), resp.AppliedReclaimBytes) + assert.Equal(t, int64(768*mib), 1024*mib-hvA.target+2048*mib-hvB.target) + assert.Equal(t, int64(768*mib), resp.Actions[0].AppliedReclaimBytes+resp.Actions[1].AppliedReclaimBytes) +} + +func TestPressureStateUsesHysteresis(t *testing.T) { + cfg := DefaultActiveBallooningConfig() + cfg.PressureHighWatermarkAvailablePercent = 10 + cfg.PressureLowWatermarkAvailablePercent = 15 + + assert.Equal(t, HostPressureStatePressure, nextPressureState(HostPressureStateHealthy, cfg, HostPressureSample{AvailablePercent: 9})) + assert.Equal(t, HostPressureStatePressure, nextPressureState(HostPressureStatePressure, cfg, HostPressureSample{AvailablePercent: 12})) + assert.Equal(t, HostPressureStateHealthy, nextPressureState(HostPressureStatePressure, cfg, HostPressureSample{AvailablePercent: 16})) +} + +func TestTriggerReclaimRespectsProtectedFloor(t *testing.T) { + const mib = int64(1024 * 1024) + src := &stubSource{ + vms: []BalloonVM{ + {ID: "a", Name: "a", HypervisorType: hypervisor.TypeCloudHypervisor, SocketPath: "a", AssignedMemoryBytes: 1024 * mib}, + }, + } + hv := &stubHypervisor{target: 1024 * mib, capabilities: hypervisor.Capabilities{SupportsBalloonControl: true}} + c := NewController(Policy{Enabled: true, ReclaimEnabled: true}, ActiveBallooningConfig{ + Enabled: true, + ProtectedFloorPercent: 75, + ProtectedFloorMinBytes: 0, + MinAdjustmentBytes: 1, + PerVMMaxStepBytes: 4096 * mib, + PerVMCooldown: time.Second, 
+ }, src, slog.New(slog.NewTextHandler(io.Discard, nil))).(*controller) + c.sampler = &stubSampler{sample: HostPressureSample{TotalBytes: 1024 * mib, AvailableBytes: 1024 * mib, AvailablePercent: 100}} + c.reconcileMu.newClient = func(t hypervisor.Type, socket string) (hypervisor.Hypervisor, error) { + return hv, nil + } + + resp, err := c.TriggerReclaim(context.Background(), ManualReclaimRequest{ReclaimBytes: 1024 * mib, HoldFor: time.Minute}) + require.NoError(t, err) + require.Len(t, resp.Actions, 1) + assert.Equal(t, int64(768*mib), resp.Actions[0].TargetGuestMemoryBytes) + assert.Equal(t, int64(256*mib), resp.AppliedReclaimBytes) +} diff --git a/lib/guestmemory/helpers.go b/lib/guestmemory/helpers.go new file mode 100644 index 00000000..a83811e5 --- /dev/null +++ b/lib/guestmemory/helpers.go @@ -0,0 +1,8 @@ +package guestmemory + +func percentage(part, total int64) float64 { + if total <= 0 { + return 0 + } + return (float64(part) / float64(total)) * 100 +} diff --git a/lib/guestmemory/planner.go b/lib/guestmemory/planner.go new file mode 100644 index 00000000..2a351536 --- /dev/null +++ b/lib/guestmemory/planner.go @@ -0,0 +1,124 @@ +package guestmemory + +import "github.com/kernel/hypeman/lib/hypervisor" + +type candidateState struct { + vm BalloonVM + hv hypervisor.Hypervisor + currentTargetGuestBytes int64 + protectedFloorBytes int64 + maxReclaimBytes int64 +} + +func planGuestTargets(cfg ActiveBallooningConfig, candidates []candidateState, totalReclaim int64) map[string]int64 { + targets := make(map[string]int64, len(candidates)) + if len(candidates) == 0 { + return targets + } + + var totalHeadroom int64 + for _, candidate := range candidates { + totalHeadroom += candidate.maxReclaimBytes + targets[candidate.vm.ID] = candidate.vm.AssignedMemoryBytes + } + if totalHeadroom <= 0 { + return targets + } + + totalReclaim = clampInt64(totalReclaim, 0, totalHeadroom) + if totalReclaim == 0 { + return targets + } + + remainder := totalReclaim + for _, candidate 
:= range candidates { + reclaim := (totalReclaim * candidate.maxReclaimBytes) / totalHeadroom + if reclaim > candidate.maxReclaimBytes { + reclaim = candidate.maxReclaimBytes + } + targets[candidate.vm.ID] = candidate.vm.AssignedMemoryBytes - reclaim + remainder -= reclaim + } + + for _, candidate := range candidates { + if remainder <= 0 { + break + } + currentReclaim := candidate.vm.AssignedMemoryBytes - targets[candidate.vm.ID] + headroomLeft := candidate.maxReclaimBytes - currentReclaim + if headroomLeft <= 0 { + continue + } + extra := minInt64(headroomLeft, remainder) + targets[candidate.vm.ID] -= extra + remainder -= extra + } + + return targets +} + +func protectedFloorBytes(cfg ActiveBallooningConfig, assigned int64) int64 { + percentFloor := (assigned * int64(cfg.ProtectedFloorPercent)) / 100 + return maxInt64(cfg.ProtectedFloorMinBytes, percentFloor) +} + +func nextPressureState(current HostPressureState, cfg ActiveBallooningConfig, sample HostPressureSample) HostPressureState { + switch current { + case HostPressureStatePressure: + if int(sample.AvailablePercent) >= cfg.PressureLowWatermarkAvailablePercent && !sample.Stressed { + return HostPressureStateHealthy + } + return HostPressureStatePressure + default: + if int(sample.AvailablePercent) <= cfg.PressureHighWatermarkAvailablePercent || sample.Stressed { + return HostPressureStatePressure + } + return HostPressureStateHealthy + } +} + +func automaticTargetBytes(state HostPressureState, cfg ActiveBallooningConfig, sample HostPressureSample, currentTotalReclaim int64) int64 { + if state != HostPressureStatePressure || sample.TotalBytes <= 0 { + return 0 + } + lowWatermarkBytes := (sample.TotalBytes * int64(cfg.PressureLowWatermarkAvailablePercent)) / 100 + needed := lowWatermarkBytes - sample.AvailableBytes + if needed > 0 { + return needed + } + if sample.Stressed { + return currentTotalReclaim + } + return 0 +} + +func absInt64(v int64) int64 { + if v < 0 { + return -v + } + return v +} + +func 
clampInt64(v, minV, maxV int64) int64 { + if v < minV { + return minV + } + if v > maxV { + return maxV + } + return v +} + +func minInt64(a, b int64) int64 { + if a < b { + return a + } + return b +} + +func maxInt64(a, b int64) int64 { + if a > b { + return a + } + return b +} diff --git a/lib/guestmemory/pressure_darwin.go b/lib/guestmemory/pressure_darwin.go new file mode 100644 index 00000000..3e74dad8 --- /dev/null +++ b/lib/guestmemory/pressure_darwin.go @@ -0,0 +1,118 @@ +//go:build darwin + +package guestmemory + +import ( + "context" + "fmt" + "os/exec" + "strconv" + "strings" +) + +type darwinPressureSampler struct{} + +func newHostPressureSampler() PressureSampler { + return &darwinPressureSampler{} +} + +func (s *darwinPressureSampler) Sample(ctx context.Context) (HostPressureSample, error) { + total, available, err := readDarwinVMStat(ctx) + if err != nil { + return HostPressureSample{}, err + } + stressed, err := readDarwinMemoryPressure(ctx) + if err != nil { + return HostPressureSample{}, err + } + + return HostPressureSample{ + TotalBytes: total, + AvailableBytes: available, + AvailablePercent: percentage(available, total), + Stressed: stressed, + }, nil +} + +func readDarwinVMStat(ctx context.Context) (int64, int64, error) { + out, err := exec.CommandContext(ctx, "vm_stat").Output() + if err != nil { + return 0, 0, fmt.Errorf("run vm_stat: %w", err) + } + lines := strings.Split(string(out), "\n") + pageSize := int64(4096) + var freePages, speculativePages int64 + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.Contains(line, "page size of") { + parts := strings.Fields(line) + for i := 0; i < len(parts); i++ { + if parts[i] == "of" && i+1 < len(parts) { + n, err := strconv.ParseInt(strings.TrimSuffix(parts[i+1], " bytes)"), 10, 64) + if err == nil && n > 0 { + pageSize = n + } + break + } + } + } + if strings.HasPrefix(line, "Pages free:") { + n, err := parseDarwinPageCount(line) + if err != nil { + return 0, 0, err + } 
+ freePages = n + } + if strings.HasPrefix(line, "Pages speculative:") { + n, err := parseDarwinPageCount(line) + if err != nil { + return 0, 0, err + } + speculativePages = n + } + } + + memsizeOut, err := exec.CommandContext(ctx, "sysctl", "-n", "hw.memsize").Output() + if err != nil { + return 0, 0, fmt.Errorf("run sysctl hw.memsize: %w", err) + } + total, err := strconv.ParseInt(strings.TrimSpace(string(memsizeOut)), 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("parse hw.memsize: %w", err) + } + + available := (freePages + speculativePages) * pageSize + return total, available, nil +} + +func parseDarwinPageCount(line string) (int64, error) { + parts := strings.Split(line, ":") + if len(parts) != 2 { + return 0, fmt.Errorf("parse vm_stat line %q", line) + } + value := strings.TrimSpace(strings.TrimSuffix(parts[1], ".")) + return strconv.ParseInt(value, 10, 64) +} + +func readDarwinMemoryPressure(ctx context.Context) (bool, error) { + out, err := exec.CommandContext(ctx, "memory_pressure", "-Q").Output() + if err != nil { + return false, fmt.Errorf("run memory_pressure -Q: %w", err) + } + for _, line := range strings.Split(string(out), "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "System-wide memory free percentage:") { + fields := strings.Fields(line) + if len(fields) == 0 { + break + } + last := strings.TrimSuffix(fields[len(fields)-1], "%") + value, err := strconv.ParseInt(last, 10, 64) + if err != nil { + return false, fmt.Errorf("parse memory_pressure free percentage: %w", err) + } + return value <= 10, nil + } + } + return false, nil +} diff --git a/lib/guestmemory/pressure_linux.go b/lib/guestmemory/pressure_linux.go new file mode 100644 index 00000000..25665785 --- /dev/null +++ b/lib/guestmemory/pressure_linux.go @@ -0,0 +1,99 @@ +//go:build linux + +package guestmemory + +import ( + "bufio" + "context" + "fmt" + "os" + "strconv" + "strings" +) + +type linuxPressureSampler struct{} + +func newHostPressureSampler() 
PressureSampler { + return &linuxPressureSampler{} +} + +func (s *linuxPressureSampler) Sample(ctx context.Context) (HostPressureSample, error) { + _ = ctx + + total, available, err := readLinuxMeminfo() + if err != nil { + return HostPressureSample{}, err + } + stressed, err := readLinuxPSI() + if err != nil { + return HostPressureSample{}, err + } + + return HostPressureSample{ + TotalBytes: total, + AvailableBytes: available, + AvailablePercent: percentage(available, total), + Stressed: stressed, + }, nil +} + +func readLinuxMeminfo() (int64, int64, error) { + file, err := os.Open("/proc/meminfo") + if err != nil { + return 0, 0, fmt.Errorf("open /proc/meminfo: %w", err) + } + defer file.Close() + + var total, available int64 + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + fields := strings.Fields(line) + if len(fields) < 2 { + continue + } + switch fields[0] { + case "MemTotal:": + value, err := strconv.ParseInt(fields[1], 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("parse MemTotal: %w", err) + } + total = value * 1024 + case "MemAvailable:": + value, err := strconv.ParseInt(fields[1], 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("parse MemAvailable: %w", err) + } + available = value * 1024 + } + } + if err := scanner.Err(); err != nil { + return 0, 0, fmt.Errorf("scan /proc/meminfo: %w", err) + } + if total <= 0 || available < 0 { + return 0, 0, fmt.Errorf("missing memory totals from /proc/meminfo") + } + return total, available, nil +} + +func readLinuxPSI() (bool, error) { + data, err := os.ReadFile("/proc/pressure/memory") + if err != nil { + return false, fmt.Errorf("read /proc/pressure/memory: %w", err) + } + for _, line := range strings.Split(string(data), "\n") { + if strings.HasPrefix(line, "some ") { + fields := strings.Fields(line) + for _, field := range fields[1:] { + if strings.HasPrefix(field, "avg10=") { + value, err := strconv.ParseFloat(strings.TrimPrefix(field, "avg10="), 64) + if err != 
nil { + return false, fmt.Errorf("parse psi avg10: %w", err) + } + return value > 0, nil + } + } + } + } + return false, nil +} diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go index 2d3f77bb..a3deef6b 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -5,6 +5,7 @@ package cloudhypervisor import ( "context" "fmt" + "sync" "time" "github.com/kernel/hypeman/lib/hypervisor" @@ -13,9 +14,12 @@ import ( // CloudHypervisor implements hypervisor.Hypervisor for Cloud Hypervisor VMM. type CloudHypervisor struct { - client *vmm.VMM + client *vmm.VMM + socketPath string } +var balloonTargetCache sync.Map + // New creates a new Cloud Hypervisor client for an existing VMM socket. func New(socketPath string) (*CloudHypervisor, error) { client, err := vmm.NewVMM(socketPath) @@ -23,7 +27,8 @@ func New(socketPath string) (*CloudHypervisor, error) { return nil, fmt.Errorf("create vmm client: %w", err) } return &CloudHypervisor{ - client: client, + client: client, + socketPath: socketPath, }, nil } @@ -39,6 +44,7 @@ func capabilities() hypervisor.Capabilities { return hypervisor.Capabilities{ SupportsSnapshot: true, SupportsHotplugMemory: true, + SupportsBalloonControl: true, SupportsPause: true, SupportsVsock: true, SupportsGPUPassthrough: true, @@ -57,6 +63,7 @@ func (c *CloudHypervisor) DeleteVM(ctx context.Context) error { if resp.StatusCode() != 204 { return fmt.Errorf("delete vm failed with status %d: %s", resp.StatusCode(), string(resp.Body)) } + balloonTargetCache.Delete(c.socketPath) return nil } @@ -70,6 +77,7 @@ func (c *CloudHypervisor) Shutdown(ctx context.Context) error { if resp.StatusCode() != 204 { return fmt.Errorf("shutdown vmm failed with status %d", resp.StatusCode()) } + balloonTargetCache.Delete(c.socketPath) return nil } @@ -207,3 +215,55 @@ func (c *CloudHypervisor) ResizeMemoryAndWait(ctx context.Context, bytes int64, // 
Timeout reached, but resize was requested successfully return nil } + +func (c *CloudHypervisor) SetTargetGuestMemoryBytes(ctx context.Context, bytes int64) error { + info, err := c.client.GetVmInfoWithResponse(ctx) + if err != nil { + return fmt.Errorf("get vm info for balloon update: %w", err) + } + if info.StatusCode() != 200 || info.JSON200 == nil { + return fmt.Errorf("get vm info for balloon update failed with status %d", info.StatusCode()) + } + if info.JSON200.Config.Balloon == nil { + return hypervisor.ErrNotSupported + } + + assigned := info.JSON200.Config.Memory.Size + if info.JSON200.MemoryActualSize != nil { + assigned = *info.JSON200.MemoryActualSize + info.JSON200.Config.Balloon.Size + } + if bytes < 0 || bytes > assigned { + return fmt.Errorf("target guest memory %d is outside valid range [0,%d]", bytes, assigned) + } + + desiredBalloon := assigned - bytes + resp, err := c.client.PutVmResizeWithResponse(ctx, vmm.VmResize{DesiredBalloon: &desiredBalloon}) + if err != nil { + return fmt.Errorf("set balloon target: %w", err) + } + if resp.StatusCode() != 204 { + return fmt.Errorf("set balloon target failed with status %d", resp.StatusCode()) + } + balloonTargetCache.Store(c.socketPath, bytes) + return nil +} + +func (c *CloudHypervisor) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { + info, err := c.client.GetVmInfoWithResponse(ctx) + if err != nil { + return 0, fmt.Errorf("get vm info for balloon read: %w", err) + } + if info.StatusCode() != 200 || info.JSON200 == nil { + return 0, fmt.Errorf("get vm info for balloon read failed with status %d", info.StatusCode()) + } + if info.JSON200.Config.Balloon == nil { + return 0, hypervisor.ErrNotSupported + } + if target, ok := balloonTargetCache.Load(c.socketPath); ok { + if value, ok := target.(int64); ok { + return value, nil + } + } + assigned := info.JSON200.Config.Memory.Size + return assigned - info.JSON200.Config.Balloon.Size, nil +} diff --git 
a/lib/hypervisor/firecracker/firecracker.go b/lib/hypervisor/firecracker/firecracker.go index 4c2b2a69..5fd6ce93 100644 --- a/lib/hypervisor/firecracker/firecracker.go +++ b/lib/hypervisor/firecracker/firecracker.go @@ -54,6 +54,7 @@ func capabilities() hypervisor.Capabilities { return hypervisor.Capabilities{ SupportsSnapshot: true, SupportsHotplugMemory: false, + SupportsBalloonControl: true, SupportsPause: true, SupportsVsock: true, SupportsGPUPassthrough: false, @@ -125,6 +126,40 @@ func (f *Firecracker) ResizeMemoryAndWait(ctx context.Context, bytes int64, time return hypervisor.ErrNotSupported } +func (f *Firecracker) SetTargetGuestMemoryBytes(ctx context.Context, bytes int64) error { + cfg, err := f.getVMConfig(ctx) + if err != nil { + return err + } + desiredBalloonMiB := cfg.MachineConfig.MemSizeMiB - bytesToMiB(bytes) + if desiredBalloonMiB < 0 { + return fmt.Errorf("target guest memory %d exceeds configured memory %d MiB", bytes, cfg.MachineConfig.MemSizeMiB) + } + + body := map[string]int64{"amount_mib": desiredBalloonMiB} + if _, err := f.do(ctx, http.MethodPatch, "/balloon", body, http.StatusNoContent); err != nil { + if strings.Contains(err.Error(), "Invalid request method and/or path") { + if _, putErr := f.do(ctx, http.MethodPut, "/balloon", body, http.StatusNoContent); putErr != nil { + if strings.Contains(putErr.Error(), "Invalid request method and/or path") { + return hypervisor.ErrNotSupported + } + return fmt.Errorf("set balloon target: %w", putErr) + } + return nil + } + return fmt.Errorf("set balloon target: %w", err) + } + return nil +} + +func (f *Firecracker) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { + cfg, err := f.getVMConfig(ctx) + if err != nil { + return 0, err + } + return (cfg.MachineConfig.MemSizeMiB - cfg.Balloon.AmountMiB) * 1024 * 1024, nil +} + func (f *Firecracker) configureForBoot(ctx context.Context, cfg hypervisor.VMConfig) error { if cfg.SerialLogPath != "" { if err := 
os.MkdirAll(filepath.Dir(cfg.SerialLogPath), 0755); err != nil { @@ -200,6 +235,31 @@ func (f *Firecracker) postAction(ctx context.Context, action string) error { return nil } +type firecrackerVMConfig struct { + MachineConfig struct { + MemSizeMiB int64 `json:"mem_size_mib"` + } `json:"machine-config"` + Balloon struct { + AmountMiB int64 `json:"amount_mib"` + } `json:"balloon"` +} + +func (f *Firecracker) getVMConfig(ctx context.Context) (*firecrackerVMConfig, error) { + body, err := f.do(ctx, http.MethodGet, "/vm/config", nil, http.StatusOK) + if err != nil { + if strings.Contains(err.Error(), "Invalid request method and/or path") { + return nil, hypervisor.ErrNotSupported + } + return nil, fmt.Errorf("get vm config: %w", err) + } + + var cfg firecrackerVMConfig + if err := json.Unmarshal(body, &cfg); err != nil { + return nil, fmt.Errorf("decode vm config: %w", err) + } + return &cfg, nil +} + func (f *Firecracker) do(ctx context.Context, method, path string, reqBody any, expectedStatus ...int) ([]byte, error) { var bodyReader io.Reader if reqBody != nil { diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index 5777f381..e3e1dc00 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -188,6 +188,16 @@ type Hypervisor interface { // Check Capabilities().SupportsHotplugMemory before calling. ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error + // SetTargetGuestMemoryBytes adjusts the runtime balloon target so the guest + // sees the requested amount of RAM. + // Check Capabilities().SupportsBalloonControl before calling. + SetTargetGuestMemoryBytes(ctx context.Context, bytes int64) error + + // GetTargetGuestMemoryBytes returns the current guest-visible RAM target after + // runtime ballooning is applied. + // Check Capabilities().SupportsBalloonControl before calling. 
+ GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) + // Capabilities returns what features this hypervisor supports. Capabilities() Capabilities } @@ -201,6 +211,9 @@ type Capabilities struct { // SupportsHotplugMemory indicates if ResizeMemory is available SupportsHotplugMemory bool + // SupportsBalloonControl indicates if runtime balloon target changes are available. + SupportsBalloonControl bool + // SupportsPause indicates if Pause/Resume are available SupportsPause bool diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index 4db8a05b..e73c0dda 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "path/filepath" + "sync" "time" "github.com/digitalocean/go-qemu/qemu" @@ -17,6 +18,8 @@ type QEMU struct { socketPath string // for self-removal from pool on error } +var balloonTargetCache sync.Map + // New returns a QEMU client for the given socket path. // Uses a connection pool to ensure only one connection per socket exists. 
func New(socketPath string) (*QEMU, error) { @@ -44,6 +47,7 @@ func capabilities() hypervisor.Capabilities { return hypervisor.Capabilities{ SupportsSnapshot: true, // Uses QMP migrate file:// for snapshot SupportsHotplugMemory: false, // Not implemented - balloon not configured + SupportsBalloonControl: true, SupportsPause: true, SupportsVsock: true, SupportsGPUPassthrough: true, @@ -60,6 +64,7 @@ func (q *QEMU) DeleteVM(ctx context.Context) error { Remove(q.socketPath) return err } + balloonTargetCache.Delete(q.socketPath) return nil } @@ -71,6 +76,7 @@ func (q *QEMU) Shutdown(ctx context.Context) error { } // Connection is gone after quit, remove from pool Remove(q.socketPath) + balloonTargetCache.Delete(q.socketPath) return nil } @@ -175,3 +181,30 @@ func (q *QEMU) ResizeMemory(ctx context.Context, bytes int64) error { func (q *QEMU) ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error { return fmt.Errorf("memory resize not supported by QEMU implementation") } + +func (q *QEMU) SetTargetGuestMemoryBytes(ctx context.Context, bytes int64) error { + if bytes < 0 { + return fmt.Errorf("target guest memory %d must be non-negative", bytes) + } + if err := q.client.Balloon(bytes); err != nil { + Remove(q.socketPath) + return fmt.Errorf("set balloon target: %w", err) + } + balloonTargetCache.Store(q.socketPath, bytes) + return nil +} + +func (q *QEMU) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { + if target, ok := balloonTargetCache.Load(q.socketPath); ok { + if value, ok := target.(int64); ok { + return value, nil + } + } + + bytes, err := q.client.QueryAssignedMemory() + if err != nil { + Remove(q.socketPath) + return 0, fmt.Errorf("query balloon target: %w", err) + } + return bytes, nil +} diff --git a/lib/hypervisor/qemu/qmp.go b/lib/hypervisor/qemu/qmp.go index f28fcb72..1be742fb 100644 --- a/lib/hypervisor/qemu/qmp.go +++ b/lib/hypervisor/qemu/qmp.go @@ -105,6 +105,34 @@ func (c *Client) Run(cmd qmp.Command) 
([]byte, error) { return c.domain.Run(cmd) } +// Balloon updates the guest-visible target RAM for the balloon device. +func (c *Client) Balloon(bytes int64) error { + return c.raw.Balloon(bytes) +} + +// QueryBalloon returns the current guest-visible RAM target for the balloon device. +func (c *Client) QueryBalloon() (int64, error) { + info, err := c.raw.QueryBalloon() + if err != nil { + return 0, err + } + return info.Actual, nil +} + +// QueryAssignedMemory returns the currently configured RAM size for the guest. +func (c *Client) QueryAssignedMemory() (int64, error) { + info, err := c.raw.QueryMemorySizeSummary() + if err != nil { + return 0, err + } + + total := int64(info.BaseMemory) + if info.PluggedMemory != nil { + total += int64(*info.PluggedMemory) + } + return total, nil +} + // Migrate initiates a migration to the given URI (typically "file:///path"). // This is used for saving VM state to a file for snapshot/standby. func (c *Client) Migrate(uri string) error { diff --git a/lib/hypervisor/vz/client.go b/lib/hypervisor/vz/client.go index 51c1f283..3d371331 100644 --- a/lib/hypervisor/vz/client.go +++ b/lib/hypervisor/vz/client.go @@ -70,6 +70,10 @@ type snapshotRequest struct { DestinationPath string `json:"destination_path"` } +type balloonResponse struct { + TargetGuestMemoryBytes int64 `json:"target_guest_memory_bytes"` +} + func (c *Client) Capabilities() hypervisor.Capabilities { return capabilities() } @@ -78,6 +82,7 @@ func capabilities() hypervisor.Capabilities { return hypervisor.Capabilities{ SupportsSnapshot: runtime.GOARCH == "arm64", SupportsHotplugMemory: false, + SupportsBalloonControl: true, SupportsPause: true, SupportsVsock: true, SupportsGPUPassthrough: false, @@ -198,3 +203,24 @@ func (c *Client) ResizeMemory(ctx context.Context, bytes int64) error { func (c *Client) ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error { return hypervisor.ErrNotSupported } + +func (c *Client) 
SetTargetGuestMemoryBytes(ctx context.Context, targetBytes int64) error { + reqBody, err := json.Marshal(balloonResponse{TargetGuestMemoryBytes: targetBytes}) + if err != nil { + return fmt.Errorf("marshal balloon target: %w", err) + } + return c.doPut(ctx, "/api/v1/vm.balloon", bytes.NewReader(reqBody)) +} + +func (c *Client) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { + body, err := c.doGet(ctx, "/api/v1/vm.balloon") + if err != nil { + return 0, fmt.Errorf("get balloon target: %w", err) + } + + var resp balloonResponse + if err := json.Unmarshal(body, &resp); err != nil { + return 0, fmt.Errorf("decode balloon target: %w", err) + } + return resp.TargetGuestMemoryBytes, nil +} diff --git a/lib/instances/guestmemory_active_ballooning_test_helpers_test.go b/lib/instances/guestmemory_active_ballooning_test_helpers_test.go new file mode 100644 index 00000000..f2916fd1 --- /dev/null +++ b/lib/instances/guestmemory_active_ballooning_test_helpers_test.go @@ -0,0 +1,131 @@ +package instances + +import ( + "context" + "log/slog" + "os" + "testing" + "time" + + "github.com/kernel/hypeman/lib/guestmemory" + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/stretchr/testify/require" +) + +type staticBalloonSource struct { + vms []guestmemory.BalloonVM +} + +type fixedPressureSampler struct { + sample guestmemory.HostPressureSample +} + +func (s *staticBalloonSource) ListBalloonVMs(ctx context.Context) ([]guestmemory.BalloonVM, error) { + _ = ctx + return s.vms, nil +} + +func (s *fixedPressureSampler) Sample(ctx context.Context) (guestmemory.HostPressureSample, error) { + _ = ctx + return s.sample, nil +} + +func newActiveBallooningTestController(t *testing.T, inst *Instance) guestmemory.Controller { + t.Helper() + + cfg := guestmemory.DefaultActiveBallooningConfig() + cfg.Enabled = true + cfg.MinAdjustmentBytes = 1 + cfg.PerVMMaxStepBytes = inst.Size + inst.HotplugSize + cfg.PerVMCooldown = 1 * time.Millisecond + + return 
guestmemory.NewControllerWithSampler( + guestmemory.Policy{ + Enabled: true, + ReclaimEnabled: true, + }, + cfg, + &staticBalloonSource{ + vms: []guestmemory.BalloonVM{ + { + ID: inst.Id, + Name: inst.Name, + HypervisorType: inst.HypervisorType, + SocketPath: inst.SocketPath, + AssignedMemoryBytes: inst.Size + inst.HotplugSize, + }, + }, + }, + &fixedPressureSampler{ + sample: guestmemory.HostPressureSample{ + TotalBytes: 64 * 1024 * 1024 * 1024, + AvailableBytes: 32 * 1024 * 1024 * 1024, + AvailablePercent: 50, + Stressed: false, + }, + }, + slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelWarn})), + ) +} + +func requireRuntimeGuestMemoryTarget(t *testing.T, ctx context.Context, inst *Instance) int64 { + t.Helper() + + hv, err := hypervisor.NewClient(inst.HypervisorType, inst.SocketPath) + require.NoError(t, err) + + target, err := hv.GetTargetGuestMemoryBytes(ctx) + require.NoError(t, err) + return target +} + +func requireRuntimeGuestMemoryTargetEventually(t *testing.T, ctx context.Context, inst *Instance, expected int64) { + t.Helper() + + deadline := time.Now().Add(60 * time.Second) + var last int64 + var lastErr error + + for time.Now().Before(deadline) { + hv, err := hypervisor.NewClient(inst.HypervisorType, inst.SocketPath) + if err == nil { + last, err = hv.GetTargetGuestMemoryBytes(ctx) + lastErr = err + if err == nil && last == expected { + return + } + } else { + lastErr = err + } + time.Sleep(250 * time.Millisecond) + } + + require.NoError(t, lastErr) + require.Equal(t, expected, last) +} + +func requireManualReclaimApplied(t *testing.T, ctx context.Context, controller guestmemory.Controller, inst *Instance, reclaimBytes int64, holdFor time.Duration) guestmemory.ManualReclaimResponse { + t.Helper() + + resp, err := controller.TriggerReclaim(ctx, guestmemory.ManualReclaimRequest{ + ReclaimBytes: reclaimBytes, + HoldFor: holdFor, + Reason: "integration-test", + }) + require.NoError(t, err) + 
requireRuntimeGuestMemoryTargetEventually(t, ctx, inst, inst.Size+inst.HotplugSize-resp.AppliedReclaimBytes) + return resp +} + +func requireManualReclaimCleared(t *testing.T, ctx context.Context, controller guestmemory.Controller, inst *Instance) guestmemory.ManualReclaimResponse { + t.Helper() + + resp, err := controller.TriggerReclaim(ctx, guestmemory.ManualReclaimRequest{ + ReclaimBytes: 0, + HoldFor: 0, + Reason: "integration-test-clear", + }) + require.NoError(t, err) + requireRuntimeGuestMemoryTargetEventually(t, ctx, inst, inst.Size+inst.HotplugSize) + return resp +} diff --git a/lib/instances/guestmemory_darwin_test.go b/lib/instances/guestmemory_darwin_test.go index 560e7735..c944034a 100644 --- a/lib/instances/guestmemory_darwin_test.go +++ b/lib/instances/guestmemory_darwin_test.go @@ -69,6 +69,7 @@ func TestGuestMemoryPolicyVZ(t *testing.T) { require.NoError(t, err) require.NotNil(t, instMeta.HypervisorPID) assertLowIdleVZHostMemoryFootprint(t, *instMeta.HypervisorPID, 192*1024) + assertActiveBallooningLifecycleVZ(t, ctx, inst) } func forceEnableGuestMemoryPolicyForVZTest(mgr *manager) { @@ -175,3 +176,30 @@ func mustReadDarwinRSSBytes(t *testing.T, pid int) int64 { require.NoError(t, err) return kb * 1024 } + +func assertActiveBallooningLifecycleVZ(t *testing.T, ctx context.Context, inst *Instance) { + t.Helper() + + assigned := inst.Size + inst.HotplugSize + initialTarget := requireRuntimeGuestMemoryTarget(t, ctx, inst) + assert.Equal(t, assigned, initialTarget, "runtime balloon target should start at full assigned memory") + + controller := newActiveBallooningTestController(t, inst) + + reclaimResp := requireManualReclaimApplied(t, ctx, controller, inst, 1*1024*1024*1024, 5*time.Minute) + require.Len(t, reclaimResp.Actions, 1) + assert.NotNil(t, reclaimResp.HoldUntil) + assert.Equal(t, int64(1*1024*1024*1024), reclaimResp.Actions[0].AppliedReclaimBytes) + assert.Equal(t, assigned-int64(1*1024*1024*1024), 
reclaimResp.Actions[0].TargetGuestMemoryBytes) + + clearResp := requireManualReclaimCleared(t, ctx, controller, inst) + assert.Nil(t, clearResp.HoldUntil) + + floorResp := requireManualReclaimApplied(t, ctx, controller, inst, assigned, 5*time.Minute) + require.Len(t, floorResp.Actions, 1) + expectedFloor := assigned / 2 + assert.Equal(t, expectedFloor, floorResp.Actions[0].TargetGuestMemoryBytes) + assert.Equal(t, assigned-expectedFloor, floorResp.Actions[0].AppliedReclaimBytes) + + requireManualReclaimCleared(t, ctx, controller, inst) +} diff --git a/lib/instances/guestmemory_linux_test.go b/lib/instances/guestmemory_linux_test.go index 4cb8986a..7d8d8a03 100644 --- a/lib/instances/guestmemory_linux_test.go +++ b/lib/instances/guestmemory_linux_test.go @@ -66,6 +66,7 @@ func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { pid := requireHypervisorPID(t, ctx, mgr, inst.Id) assertLowIdleHostMemoryFootprint(t, "cloud-hypervisor", pid, 512*1024) + assertActiveBallooningLifecycle(t, ctx, inst) } func TestGuestMemoryPolicyQEMU(t *testing.T) { @@ -104,6 +105,7 @@ func TestGuestMemoryPolicyQEMU(t *testing.T) { assert.Contains(t, joined, "virtio-balloon-pci", "qemu cmdline should include virtio balloon device") assertLowIdleHostMemoryFootprint(t, "qemu", pid, 640*1024) + assertActiveBallooningLifecycle(t, ctx, inst) } func TestGuestMemoryPolicyFirecracker(t *testing.T) { @@ -141,6 +143,7 @@ func TestGuestMemoryPolicyFirecracker(t *testing.T) { pid := requireHypervisorPID(t, ctx, mgr, inst.Id) assertLowIdleHostMemoryFootprint(t, "firecracker", pid, 512*1024) + assertActiveBallooningLifecycle(t, ctx, inst) } func guestMemoryIdleScript() string { @@ -296,3 +299,30 @@ func getFirecrackerVMConfig(socketPath string) (*firecrackerVMConfig, error) { } return &cfg, nil } + +func assertActiveBallooningLifecycle(t *testing.T, ctx context.Context, inst *Instance) { + t.Helper() + + assigned := inst.Size + inst.HotplugSize + initialTarget := requireRuntimeGuestMemoryTarget(t, ctx, 
inst) + assert.Equal(t, assigned, initialTarget, "runtime balloon target should start at full assigned memory") + + controller := newActiveBallooningTestController(t, inst) + + reclaimResp := requireManualReclaimApplied(t, ctx, controller, inst, 1*1024*1024*1024, 5*time.Minute) + require.Len(t, reclaimResp.Actions, 1) + assert.NotNil(t, reclaimResp.HoldUntil) + assert.Equal(t, int64(1*1024*1024*1024), reclaimResp.Actions[0].AppliedReclaimBytes) + assert.Equal(t, assigned-int64(1*1024*1024*1024), reclaimResp.Actions[0].TargetGuestMemoryBytes) + + clearResp := requireManualReclaimCleared(t, ctx, controller, inst) + assert.Nil(t, clearResp.HoldUntil) + + floorResp := requireManualReclaimApplied(t, ctx, controller, inst, assigned, 5*time.Minute) + require.Len(t, floorResp.Actions, 1) + expectedFloor := assigned / 2 + assert.Equal(t, expectedFloor, floorResp.Actions[0].TargetGuestMemoryBytes) + assert.Equal(t, assigned-expectedFloor, floorResp.Actions[0].AppliedReclaimBytes) + + requireManualReclaimCleared(t, ctx, controller, inst) +} diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index e4290fb0..2a7d98b5 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -115,6 +115,20 @@ const ( InstanceStateUnknown InstanceState = "Unknown" ) +// Defines values for MemoryReclaimActionHypervisor. +const ( + MemoryReclaimActionHypervisorCloudHypervisor MemoryReclaimActionHypervisor = "cloud-hypervisor" + MemoryReclaimActionHypervisorFirecracker MemoryReclaimActionHypervisor = "firecracker" + MemoryReclaimActionHypervisorQemu MemoryReclaimActionHypervisor = "qemu" + MemoryReclaimActionHypervisorVz MemoryReclaimActionHypervisor = "vz" +) + +// Defines values for MemoryReclaimResponseHostPressureState. +const ( + Healthy MemoryReclaimResponseHostPressureState = "healthy" + Pressure MemoryReclaimResponseHostPressureState = "pressure" +) + // Defines values for RestoreSnapshotRequestTargetHypervisor. 
// Defines values for SnapshotSourceHypervisor.
// Renamed to carry the type-name prefix so they cannot collide with other
// hypervisor enums in this generated package (e.g. MemoryReclaimActionHypervisor).
const (
	SnapshotSourceHypervisorCloudHypervisor SnapshotSourceHypervisor = "cloud-hypervisor"
	SnapshotSourceHypervisorFirecracker     SnapshotSourceHypervisor = "firecracker"
	SnapshotSourceHypervisorQemu            SnapshotSourceHypervisor = "qemu"
	SnapshotSourceHypervisorVz              SnapshotSourceHypervisor = "vz"
)

// MemoryReclaimAction defines model for MemoryReclaimAction.
// Generated from openapi.yaml by oapi-codegen — regenerate rather than hand-edit.
type MemoryReclaimAction struct {
	AppliedReclaimBytes int64 `json:"applied_reclaim_bytes"`
	AssignedMemoryBytes int64 `json:"assigned_memory_bytes"`

	// Error Error message when status is error or unsupported.
	Error                          *string                       `json:"error,omitempty"`
	Hypervisor                     MemoryReclaimActionHypervisor `json:"hypervisor"`
	InstanceId                     string                        `json:"instance_id"`
	InstanceName                   string                        `json:"instance_name"`
	PlannedTargetGuestMemoryBytes  int64                         `json:"planned_target_guest_memory_bytes"`
	PreviousTargetGuestMemoryBytes int64                         `json:"previous_target_guest_memory_bytes"`
	ProtectedFloorBytes            int64                         `json:"protected_floor_bytes"`

	// Status Result of this VM's reclaim step.
	Status                 string `json:"status"`
	TargetGuestMemoryBytes int64  `json:"target_guest_memory_bytes"`
}

// MemoryReclaimActionHypervisor defines model for MemoryReclaimAction.Hypervisor.
type MemoryReclaimActionHypervisor string

// MemoryReclaimRequest defines model for MemoryReclaimRequest.
type MemoryReclaimRequest struct {
	// DryRun Calculate a reclaim plan without applying balloon changes or creating a hold.
	DryRun *bool `json:"dry_run,omitempty"`

	// HoldFor How long to keep the reclaim hold active (Go duration string). Defaults to 5m when omitted.
	HoldFor *string `json:"hold_for,omitempty"`

	// Reason Optional operator-provided reason attached to logs and traces.
	Reason *string `json:"reason,omitempty"`

	// ReclaimBytes Total bytes of guest memory to reclaim across eligible VMs.
	ReclaimBytes int64 `json:"reclaim_bytes"`
}

// MemoryReclaimResponse defines model for MemoryReclaimResponse.
type MemoryReclaimResponse struct {
	Actions             []MemoryReclaimAction `json:"actions"`
	AppliedReclaimBytes int64                 `json:"applied_reclaim_bytes"`

	// HoldUntil When the current manual reclaim hold expires.
	HoldUntil             *time.Time                             `json:"hold_until,omitempty"`
	HostAvailableBytes    int64                                  `json:"host_available_bytes"`
	HostPressureState     MemoryReclaimResponseHostPressureState `json:"host_pressure_state"`
	PlannedReclaimBytes   int64                                  `json:"planned_reclaim_bytes"`
	RequestedReclaimBytes int64                                  `json:"requested_reclaim_bytes"`
}

// MemoryReclaimResponseHostPressureState defines model for MemoryReclaimResponse.HostPressureState.
// NOTE(review): its generated constants ("Healthy"/"Pressure") lack the
// type-name prefix, unlike the hypervisor enums; a future enum with the same
// values would collide. Consider x-enum-varnames in the OpenAPI spec.
type MemoryReclaimResponseHostPressureState string

// ReclaimMemoryJSONRequestBody defines body for ReclaimMemory for application/json ContentType.
type ReclaimMemoryJSONRequestBody = MemoryReclaimRequest
// ReclaimMemory sends POST /resources/memory/reclaim with a JSON-encoded body.
// Generated by oapi-codegen — regenerate rather than hand-edit.
func (c *Client) ReclaimMemory(ctx context.Context, body ReclaimMemoryJSONRequestBody, reqEditors ...RequestEditorFn) (*http.Response, error) {
	req, err := NewReclaimMemoryRequest(c.Server, body)
	if err != nil {
		return nil, err
	}
	req = req.WithContext(ctx)
	if err := c.applyEditors(ctx, req, reqEditors); err != nil {
		return nil, err
	}
	return c.Client.Do(req)
}

// ReclaimMemoryWithBody sends POST /resources/memory/reclaim with a
// caller-provided body and Content-Type.
func (c *Client) ReclaimMemoryWithBody(ctx context.Context, contentType string, body io.Reader, reqEditors ...RequestEditorFn) (*http.Response, error) {
	req, err := NewReclaimMemoryRequestWithBody(c.Server, contentType, body)
	if err != nil {
		return nil, err
	}
	req = req.WithContext(ctx)
	if err := c.applyEditors(ctx, req, reqEditors); err != nil {
		return nil, err
	}
	return c.Client.Do(req)
}

// NewReclaimMemoryRequest calls the generic ReclaimMemory builder with application/json body
func NewReclaimMemoryRequest(server string, body ReclaimMemoryJSONRequestBody) (*http.Request, error) {
	var bodyReader io.Reader
	buf, err := json.Marshal(body)
	if err != nil {
		return nil, err
	}
	bodyReader = bytes.NewReader(buf)
	return NewReclaimMemoryRequestWithBody(server, "application/json", bodyReader)
}

// NewReclaimMemoryRequestWithBody generates requests for ReclaimMemory with any type of body
func NewReclaimMemoryRequestWithBody(server string, contentType string, body io.Reader) (*http.Request, error) {
	var err error

	serverURL, err := url.Parse(server)
	if err != nil {
		return nil, err
	}

	// Sprintf with no arguments is a codegen artifact (constant path); kept
	// as generated.
	operationPath := fmt.Sprintf("/resources/memory/reclaim")
	if operationPath[0] == '/' {
		// Relative path so serverURL.Parse preserves any base path prefix.
		operationPath = "." + operationPath
	}

	queryURL, err := serverURL.Parse(operationPath)
	if err != nil {
		return nil, err
	}

	req, err := http.NewRequest("POST", queryURL.String(), body)
	if err != nil {
		return nil, err
	}

	req.Header.Add("Content-Type", contentType)

	return req, nil
}
// ReclaimMemoryResponse is the typed wrapper for POST /resources/memory/reclaim.
// Generated by oapi-codegen — regenerate rather than hand-edit. Exactly one of
// the JSONxxx fields is populated, matching the response status code.
type ReclaimMemoryResponse struct {
	Body         []byte
	HTTPResponse *http.Response
	JSON200      *MemoryReclaimResponse
	JSON400      *Error
	JSON401      *Error
	JSON500      *Error
}

// Status returns HTTPResponse.Status
func (r ReclaimMemoryResponse) Status() string {
	if r.HTTPResponse != nil {
		return r.HTTPResponse.Status
	}
	return http.StatusText(0)
}

// StatusCode returns HTTPResponse.StatusCode
func (r ReclaimMemoryResponse) StatusCode() int {
	if r.HTTPResponse != nil {
		return r.HTTPResponse.StatusCode
	}
	return 0
}

// ReclaimMemoryWithBodyWithResponse request with arbitrary body returning *ReclaimMemoryResponse
func (c *ClientWithResponses) ReclaimMemoryWithBodyWithResponse(ctx context.Context, contentType string, body io.Reader, reqEditors ...RequestEditorFn) (*ReclaimMemoryResponse, error) {
	rsp, err := c.ReclaimMemoryWithBody(ctx, contentType, body, reqEditors...)
	if err != nil {
		return nil, err
	}
	return ParseReclaimMemoryResponse(rsp)
}

// ReclaimMemoryWithResponse sends the typed request and parses the result.
func (c *ClientWithResponses) ReclaimMemoryWithResponse(ctx context.Context, body ReclaimMemoryJSONRequestBody, reqEditors ...RequestEditorFn) (*ReclaimMemoryResponse, error) {
	rsp, err := c.ReclaimMemory(ctx, body, reqEditors...)
	if err != nil {
		return nil, err
	}
	return ParseReclaimMemoryResponse(rsp)
}

// ParseReclaimMemoryResponse parses an HTTP response from a ReclaimMemoryWithResponse call
func ParseReclaimMemoryResponse(rsp *http.Response) (*ReclaimMemoryResponse, error) {
	bodyBytes, err := io.ReadAll(rsp.Body)
	defer func() { _ = rsp.Body.Close() }()
	if err != nil {
		return nil, err
	}

	response := &ReclaimMemoryResponse{
		Body:         bodyBytes,
		HTTPResponse: rsp,
	}

	// Decode into the field matching the status code; non-JSON bodies are
	// left raw in Body.
	switch {
	case strings.Contains(rsp.Header.Get("Content-Type"), "json") && rsp.StatusCode == 200:
		var dest MemoryReclaimResponse
		if err := json.Unmarshal(bodyBytes, &dest); err != nil {
			return nil, err
		}
		response.JSON200 = &dest

	case strings.Contains(rsp.Header.Get("Content-Type"), "json") && rsp.StatusCode == 400:
		var dest Error
		if err := json.Unmarshal(bodyBytes, &dest); err != nil {
			return nil, err
		}
		response.JSON400 = &dest

	case strings.Contains(rsp.Header.Get("Content-Type"), "json") && rsp.StatusCode == 401:
		var dest Error
		if err := json.Unmarshal(bodyBytes, &dest); err != nil {
			return nil, err
		}
		response.JSON401 = &dest

	case strings.Contains(rsp.Header.Get("Content-Type"), "json") && rsp.StatusCode == 500:
		var dest Error
		if err := json.Unmarshal(bodyBytes, &dest); err != nil {
			return nil, err
		}
		response.JSON500 = &dest

	}

	return response, nil
}
/snapshots) ListSnapshots(w http.ResponseWriter, r *http.Request, params ListSnapshotsParams) @@ -8206,6 +8442,12 @@ func (_ Unimplemented) GetResources(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNotImplemented) } +// Trigger proactive guest memory reclaim +// (POST /resources/memory/reclaim) +func (_ Unimplemented) ReclaimMemory(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotImplemented) +} + // List snapshots // (GET /snapshots) func (_ Unimplemented) ListSnapshots(w http.ResponseWriter, r *http.Request, params ListSnapshotsParams) { @@ -9400,6 +9642,26 @@ func (siw *ServerInterfaceWrapper) GetResources(w http.ResponseWriter, r *http.R handler.ServeHTTP(w, r) } +// ReclaimMemory operation middleware +func (siw *ServerInterfaceWrapper) ReclaimMemory(w http.ResponseWriter, r *http.Request) { + + ctx := r.Context() + + ctx = context.WithValue(ctx, BearerAuthScopes, []string{}) + + r = r.WithContext(ctx) + + handler := http.Handler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + siw.Handler.ReclaimMemory(w, r) + })) + + for _, middleware := range siw.HandlerMiddlewares { + handler = middleware(handler) + } + + handler.ServeHTTP(w, r) +} + // ListSnapshots operation middleware func (siw *ServerInterfaceWrapper) ListSnapshots(w http.ResponseWriter, r *http.Request) { @@ -9957,6 +10219,9 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl r.Group(func(r chi.Router) { r.Get(options.BaseURL+"/resources", wrapper.GetResources) }) + r.Group(func(r chi.Router) { + r.Post(options.BaseURL+"/resources/memory/reclaim", wrapper.ReclaimMemory) + }) r.Group(func(r chi.Router) { r.Get(options.BaseURL+"/snapshots", wrapper.ListSnapshots) }) @@ -11505,6 +11770,50 @@ func (response GetResources500JSONResponse) VisitGetResourcesResponse(w http.Res return json.NewEncoder(w).Encode(response) } +type ReclaimMemoryRequestObject struct { + Body *ReclaimMemoryJSONRequestBody +} + +type 
ReclaimMemoryResponseObject interface { + VisitReclaimMemoryResponse(w http.ResponseWriter) error +} + +type ReclaimMemory200JSONResponse MemoryReclaimResponse + +func (response ReclaimMemory200JSONResponse) VisitReclaimMemoryResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(200) + + return json.NewEncoder(w).Encode(response) +} + +type ReclaimMemory400JSONResponse Error + +func (response ReclaimMemory400JSONResponse) VisitReclaimMemoryResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(400) + + return json.NewEncoder(w).Encode(response) +} + +type ReclaimMemory401JSONResponse Error + +func (response ReclaimMemory401JSONResponse) VisitReclaimMemoryResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(401) + + return json.NewEncoder(w).Encode(response) +} + +type ReclaimMemory500JSONResponse Error + +func (response ReclaimMemory500JSONResponse) VisitReclaimMemoryResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(500) + + return json.NewEncoder(w).Encode(response) +} + type ListSnapshotsRequestObject struct { Params ListSnapshotsParams } @@ -11993,6 +12302,9 @@ type StrictServerInterface interface { // Get host resource capacity and allocations // (GET /resources) GetResources(ctx context.Context, request GetResourcesRequestObject) (GetResourcesResponseObject, error) + // Trigger proactive guest memory reclaim + // (POST /resources/memory/reclaim) + ReclaimMemory(ctx context.Context, request ReclaimMemoryRequestObject) (ReclaimMemoryResponseObject, error) // List snapshots // (GET /snapshots) ListSnapshots(ctx context.Context, request ListSnapshotsRequestObject) (ListSnapshotsResponseObject, error) @@ -13047,6 +13359,37 @@ func (sh *strictHandler) GetResources(w http.ResponseWriter, r *http.Request) { } } +// ReclaimMemory operation 
middleware +func (sh *strictHandler) ReclaimMemory(w http.ResponseWriter, r *http.Request) { + var request ReclaimMemoryRequestObject + + var body ReclaimMemoryJSONRequestBody + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + sh.options.RequestErrorHandlerFunc(w, r, fmt.Errorf("can't decode JSON body: %w", err)) + return + } + request.Body = &body + + handler := func(ctx context.Context, w http.ResponseWriter, r *http.Request, request interface{}) (interface{}, error) { + return sh.ssi.ReclaimMemory(ctx, request.(ReclaimMemoryRequestObject)) + } + for _, middleware := range sh.middlewares { + handler = middleware(handler, "ReclaimMemory") + } + + response, err := handler(r.Context(), w, r, request) + + if err != nil { + sh.options.ResponseErrorHandlerFunc(w, r, err) + } else if validResponse, ok := response.(ReclaimMemoryResponseObject); ok { + if err := validResponse.VisitReclaimMemoryResponse(w); err != nil { + sh.options.ResponseErrorHandlerFunc(w, r, err) + } + } else if response != nil { + sh.options.ResponseErrorHandlerFunc(w, r, fmt.Errorf("unexpected response type: %T", response)) + } +} + // ListSnapshots operation middleware func (sh *strictHandler) ListSnapshots(w http.ResponseWriter, r *http.Request, params ListSnapshotsParams) { var request ListSnapshotsRequestObject @@ -13298,201 +13641,213 @@ func (sh *strictHandler) GetVolume(w http.ResponseWriter, r *http.Request, id st // Base64 encoded, gzipped, json marshaled Swagger object var swaggerSpec = []string{ - "H4sIAAAAAAAC/+x97XLbOJboq6B0d2vkHUmWP+I42ura68RJ2ttx4hvH3rvTylUgEpLQJgE2AMpRUvk7", - "DzCPOE9yCwcAvwRKlGM78SZTUx2ZBPFxcHBwvs/nVsDjhDPClGwNPrdkMCMxhp9HSuFgdsmjNCZvyZ8p", - "kUo/TgRPiFCUQKOYp0yNEqxm+q+QyEDQRFHOWoPWGVYzdD0jgqA59ILkjKdRiMYEwXckbHVa5COOk4i0", - "Bq3tmKntECvc6rTUItGPpBKUTVtfOi1BcMhZtDDDTHAaqdZggiNJOpVhT3XXCEukP+nCN1l/Y84jglnr", - "C/T4Z0oFCVuD34vLeJ815uM/SKD04EdzTCM8jsgxmdOALIMhSIUgTI1CQedELIPimXkfLdCYpyxEph1q", - 
"szSKEJ0gxhnZKgGDzWlINSR0Ez10a6BESjyQCWFOIxp6duDZCTKv0ckxas/Ix/Igu4/Hh636LhmOyXKn", - "v6YxZl0NXD0t1z+0Lfb9at/XM+VxnI6mgqfJcs8nb05PLxC8RCyNx0QUezzczfqjTJEpEbrDJKAjHIaC", - "SOlfv3tZnFu/3+8P8O6g3+/1fbOcExZyUQtS89oP0p1+SFZ02Qiktv8lkL6+PDk+OULPuEi4wPDt0kgV", - "xC6Cp7iuItqUd8WH/09TGoXLWD/Wj4kYUSYVZjU4eGJfanDxCVIzgux36PIUtSdcoJCM0+mUsulWE3zX", - "BCsiioQjrJaHg6ki24ZyhhSNiVQ4Tlqd1oSLWH/UCrEiXf2m0YCC4DXD6RaNBls+aqnZyVEs63p3TRBl", - "KKZRRCUJOAtlcQzK1MF+/WIKB4YIwT0U6rl+jGIiJZ4S1NZkU9NuhqTCKpWISjTBNCJhoz3yIYJZzB98", - "jGhImKITWj7fBp26eBzs7O55aUeMp2QU0qm9icrdH8NzjWK6H4WgtX8h+qAtmq0DhhRksjzeCyDdMIgg", - "EyKIxvGvHC4RfE6YPi16vH+BcVv/azu/orft/bwNwDzLm3/ptP5MSUpGCZfUzHCJctk3Go0A1Ai+8M8Z", - "Xq3a6wJGSYXF6vMBLW7hJJr5NYLNuWn6pdNSeLr2k3e6TZV2Amm0Q5aoQC2JfD4nzMMkBZwp+6IMnVd8", - "iiLKCLIt7F5omqgH+CXiQBJvCQ4Z+JcPv573DYiXeVDTm37XaRGWxhqYEZ8WoTkjWKgxKQGz5gqzHeWz", - "qwX/Wen4VO4qLMloNQU5o4yREOmW9mCbliiVwKkuLR9O0RVVozkR0nvmYFq/UYVsi9quIh5cTWhERjMs", - "Z2bGOAzhvOLorLQSD7dWYn9xoomg6xC4CIkUR+e/Hu0+OkB2AA8MJU9FYGawvJLC17p70xYpLMY4iry4", - "UY9um9/Ryxjix4Dz7GDU3T0ZBjrENJSuZXdTd99pJamcmV9Au/Ws4O7TZECjV6R/v/cs+hkQCSMl1MpM", - "fh7wTWI2G00jrmG6QCmjf6YlBruHTrSsoJC+KGhIwg7C8EKTbJwq3p0SRoSmU2gieAzcVoEJRm3Sm/Y6", - "aKj5wq7mgrt4t9vvd/vDVpmNjfa70yTVoMBKEaEn+P9+x91PR92/9btP3uc/R73u+7/+iw8BmnLmjiu0", - "62y7s99BbrJFdr060XWs/I2pf3H6PopjtvpE04lNd/rZyTLjYNYa8uCKiB7l2xEdCywW22xK2cdBhBWR", - "qrzy1W1vFRawjhVAYFMNpg3BUBF6AI3bEb8mItAUOCIa8WRHE2GqZAdhLTcD8UL6lvx3FGCmz4JhLrhA", - "hIXomqoZwtCuDK140cUJ7VIz1VanFeOPrwibqllrcLC3hOcaydv2R/f9v7lHW//hRXWRRsSD5G95qiib", - "InhtbvUZlSifA1UkXrsjDrppBGxeTNmJ+WwnmwkWAi++fofdQlbttBHmarc6iD2c/5s5EYKG7lZ9dnqM", - "2hG9IhbdkUgZGqb9/l4ADeAnsU8CHseYhebZVg+9ianSt1maX9JGG9QrbvfvLRLMOPAZUcT1gjJQ1zAx", - "OQwNHfJs57HTpEhkpXO4VzHoyWB7X55dbGvKlmAp1UzwdDorz8qS1c3mQ+XViPLROPHNicordLL9Bmmi", - "jyKqoZMR+Z1+//Tpthy29B+P3B9bPXRsQAbT1/vHhb175AwLAhxQiDhDz84uEI4iHlj5c6IZ1QmdpoKE", - "vYraA3r3HQ7ClFgknPoY4Apm5E2XEaTbzd9ugAfbY8q2pd6GbrAZ3AmbfwUb9pzNqeAs1qzwHAuqaVxJ", - 
"CfW59frN8fPR89eXrYE+RGEaWI3O2Zu371qD1l6/32/5OB2NQWvO+Muzi2ewU7r9jKskSqcjST95yPBR", - "tj4Uk5gLI37Yb1B7VqbShjtDsDnD1t7Lpwa5dl4CXrlNCamE1q4X03EZY3ZfPvVhy2yREDGn0qej+DV7", - "53a+QFMNYSrjtiRiTkSGtIDFvQLvF0Q8DbuFITutCRUkEFijXavT+pPEmgmaf9Kok8/d851fddDo8l9z", - "q+MooYysuNa/k+v1mouriOOwu3PLtysjSve9vMTX5kV5fy1OkAwlWp0lUZCF1zRUs1HIr5mesoeu2jco", - "a5wR1496JTj659//cXma86g7L8eJpbQ7u4++ktJWaKvu2it/ZgtJE/8yLhL/Ii5P//n3f7iVfNtFEKbx", - "MyzZdYz6p7yU/5oRNSOicOO6DdaPjAABnyOHL4XhS/qkohFoibjyORERXhSIpZ1Ta6cPFKsyK0EVnC/7", - "nSZ9V0h/vIZ06t7cxfyyKtTs9v3E0TMpz5ye6vNtaXmTmWQT2dk9tT93l6dUM6Mrmoymmhcc4Wmm41pl", - "nju/ogmCL7rwhdnGKDKHN0x1z2jMueoN2X/NCEOwd7DB5CMJgE5pIR4dnZ1IdE2jCCRiIATL18GQvSuQ", - "AtNcKv1fkbIOGqcKCRJzRZBlNGGQFOYCjccEpQw7+19vyIpQsQus4pUFyxURjESjGcEhEbIhZMxHyH5U", - "CxxY6gRLRYSh0GlShtfxb6fnqH28YDimAfrN9HrKwzQi6DxN9BneKkOvM2SJIHPCQGbRTAW14/IJ4qnq", - "8klXCULcFGPoLNMpWOPU/OXZhTVvyq3ekL0lGrCEhSSEObtbQiI1wwqFnP1Fn1gSlrstjl8Buv8sbyL7", - "dFrzIEnLO7Jb3Y3XYIDUa59ToVIcafJW4uC89khj6fZw6saQXpQYLNnKkBOrsiGpqYBoegaz9zIf65fz", - "DHNSL+edM5zIGVe1ct4VZeG6eblOftNta/mUTO8lbfO7ZlUSQbppMhUYDLW3yajcWPoGaNbvxhofDJ+x", - "LYNqkErF44LJDbUrikJaVimWgTXnUTfECgNT15DzNNNdNl/HC9OVOSJ199toOvZon/U1Rhma0ikeL1RZ", - "ktrp+w7i16pC3Fx821LnBmIONglHiq82hNMJcm2b2L3AaWSk+Gg+oZ6eM9Yo16JSiYKKz4klN7qLbhJQ", - "S6Q76HpGNTMlkQMC0OnL06IWozdkXbhYBug4GyDrNutSH0zQmEMXbS4Kk6Bg/EDjxRbC6PK0h95ls/2L", - "RAwrOifOL2aGJRoTwlAKTDgJYXy4NIsTSKW+qaiqfm5vJONCswXKGm7f9ZAWImNsb3d9FGKsaAAK9zGt", - "rAeMomaj9EiadLMib9GIF1jlPvCWTKlUouI8gNpvXzzb29t7UuUKdx91+zvdnUfvdvqDvv7/35r7Gdy+", - "l5Cvr6MybbEmjCL1eXZxcrxrWdDyOOrTPn5y+PEjVk8O6LV88ikei+kfe/he/Ij8pOw4t72gdiqJ6Doy", - "qbHKZ3EpGDZqLCo3NpTckd0jN+Ouamsg8U63vAsHKZ/p3Rp+N3dhqhLMtcb7wuKW1qOfai4wPyUFBZK1", - "kQXUaw08pvLqqSD4KuTXzHNvayZMjsx95tfsplqyHi8Q+agZdhIiwbmaSKNBKjOjO/uP9w/3DvYP+32P", - "X9AywvOAjgJ9AzWawJtnJyjCCyIQfIPaIPqHaBzxcRnRH+0dHD7uP9nZbToPIzg3g0PGK7uvUNtC5K/O", - "x9S9KU1qd/fxwd7eXv/gYHe/0awsG99oUo7lL7Ekj/ce7+8c7u43goJPEfHc+WlVfUlCnxI3SSJq1C5d", - 
"mZCATmiAwNML6Q9QO4YrjGQ6gPKZHONwJCx76b07FKaRXKk7NoPZlsatL04jRZOImHewIY3kGVj5MfTk", - "08tTxogYZW5sG/RkvdvW6krdWrImqOSlWALdKZXAheTMEyVRODAndC2dg93MJ/a+Dg/sGhpiwystOnUj", - "MidREQnM1aUnG3NBUIYnZtNKq6JsjiMajihLUi9K1ILyRSqAFzWdIjzmqTLKG9iw4iBgOwfZY6LJdTM3", - "jxdcXK21QuqbeCRSxnQ3a/UuR1HEr/UWX2nYwC2Okf3aOboUmL5MyWJUUfa9RG/NF0ZVlT9OUoUoU1xL", - "pywcLzowEgmhHUOCSMWBkuLgSnOYtpum3KWfb3mtGRanCDfj5bTznqwA3YlRwt6uhC2mRI2kwmotx6Ix", - "5R20P4fmjZ0a9IdrFSAN4M7I9X0AHbw+uhptu5Lh5G4gvsosl+ka8kZwCwsakh6C0wX2AedlWjlp54on", - "CQkz/U9vyM7NUckeSRSnEnSeVwYOakaoQFzQKS0PbI/NPdj3NkFFh003Rsfih8scKrwEpXj9occTRYSB", - "oHOgL3rB2U1odVoW9q1Oy1KiMmjcQw9EcqPz0hRfnl1saqVLBJ/QyLNc0DDbt1Yyc/arV/v98+7O/zG2", - "aI1vwKJRZrTSMQ9JrxKjAu2b3Twvzy7O6uaUBQih4uyW1pTZETyUI1M3O4hYDXmAGRoTZCUYh/76YskG", - "yXnvJz5ediJwTMbpZELEKPYo117o98g0MAYjytDp0zI/q/nmplLzWWlzQGye4MDGdzSDvkchV1lGpwDN", - "9/7tekvMNVznFaq3Stg21jG0h15nIVno5dmFRLntx6OpK29vrefQ2WwhaYAj06Nx8qasqGAD5GzMIZ/l", - "H1pVpIdPjr28oTsIqD2fJikcw/O33ZM3l9txSOad0pzAXjPjEdHz3ipQi7nzDc3dnEpEYl6n6TCIIZse", - "oAKsshPcGEiF8+qBjuIKRyMZceWZzTv9EsFL1L58YXz39Aw6KCltpX5egEIJvw+8J0ZTpLphz2HAqsq0", - "dMC9smM5ktGoVwrLKw3qOyq/EhyZAM4yPudhBm7j+VV5o/nV2tNrO/GNe+JcZBr4ED47PTYMQ8CZwpQR", - "gWKisA0XLTh8ATvU6rS6+o4KMYnBgDn599XOXzUq+AxdVilxny1Ff92JArcmakETuWhOQhRjRidEKhu1", - "UBpZzvDuo4OBia0KyWT/0UGv19vUW+957p7XaCu2jTNTwXGvJ2dftw934JTXZC2fW2dH735tDVrbqRTb", - "EQ9wtC3HlA0Kf2d/5i/gh/lzTJnXma9ROB6dLIXhlU2a+s4yzwd6JYwEGUJyEODXmphq5BmNmhH9RELk", - "9VpXeKrlE4NxX+uefuMAtjyKWhUC14p2+gZBbPTTak2oY4ygjR0zZYpGeXzfsg70RhGacmUQy1IAS0JY", - "FrYSReZXwNlcnwpfDEuJgLt3X2U/uDYC3SikHkz+LyvthVrwUuBfuv68tbZxkqxHWz+jmNG/prF71sPe", - "cxN9c6p/ExtbefQ30//88//Ks8d/7Pz56vLyv+cv//P4Nf3vy+jszVf5kq4OrvimERK3FhQBhqVSZERT", - "VDrFKvAwVDMuVQ2E7RukOIr1xz30DAS/wZB10SuqiMDRAA1bOKE9C/hewONhC7XJRxwo8xXiDOmurP/Y", - "lv74zKh/9MefnWz5pdpHaB3FhN2QzKdTpuOQx5iyrSEbMtsXcguRYNPXv0IU4ESlgujd0zxstEBjgYPc", - "QSwfvIM+4yT5sjVkIOGSj0roFSRYqCwazI0ASGFnZXwGbHMSojmOUiKthDxk2b0EIr/uxOhoepkSBHTz", - 
"Fc1qDVC84gsXZQfHw37Hs49It9MbGVGpCEOZtoNKQHTUdp6qh/0SqTjsH/bXMvgZDq1APzgJy7laHFI2", - "OEsGgWFoQ7hHM6WSBrp0TZvMGUG/vnt3psGg/z1HrqMcFtkWGyEPJ0lEiTQ6QhUBr2Odg7dafoc/vbsN", - "F2SUZPBZ1MAX8zkMjN69OkeKiJgyQ+vbgQbnhAZ6fWD+p1KmGhUpRkfPTp9v9RokmwHYZvNfsY/vshVW", - "LMdOaVanC8wwXsO3g06OO5pNsyc0Z+DAreYFFygyBCY/1wN0IUnZPRG2ylj1zU5Gi1zzZm6AYWvL9ZhU", - "KcUAvc34RpxNJYtSzZHBdZmfS+jWGl6Mz89S753yXMGbycpFlrSBhw9WyNo54dquJwWrj78H4nDmOavq", - "NDc720VlqB7Mjxr53t85t7K3qYy6aaha2TO9EImQRas1DzO7i3CtZXntI1WjWiM80q+tyd1JJZenaIYl", - "+4uClxXZZGfvcaOkLXrUpubrouGaT8yUslPl3Nwzs6tx+L+iUWS8GSSdMhyhJ6h9fvLyt5NXr7ZQF715", - "c1rdilVf+PanQdSaQ+2XZxcQCoblyFmA6p0ece44TD5SqeRyFEAjQ+rqKLlfS5Fs3rCKrVsMb3PW56Vl", - "3Efg2rd06/v+guZWhrl9bayaZXbvKFStlrj6wrzKdNY8vt2gszuZTil8zEcfijyB87m+ccRYp0U9/qZH", - "UpNAEqKTszzTRq6Uct1X1vRkt7dzcNjb6fd7O/0mKroYByvGPj161nzw/q5RRAzweBCEAzL5ChWhRWzD", - "vOHoGi8kGjr2etgy/HyBkS8cW8uCNzK/Lgfm3SwOr8pQrIu02ySyrlnI3Ip0WeflRFmNebRHf/uqnFqk", - "6c1sXRfsV6NNlNcEBTyNQs0HjfXJM2IVCa30J4nKc5DBYb1gV4xfs/LSjQ5Tn98/UyIW6PL0tKTxFmRi", - "Uyw1WDi4PNTsA0822obdNazy2tncMHrtPiLWqlSzcFvdenxaUeXmXCgNhjZQveXco9fsTZnZGo0nK9ZU", - "UZqEZD5KUx9TpF+5wImLi5PjEnJgfLBz2D980j0c7xx098P+Thfv7B10dx/h/mQveLxXk+SwudvLzT1Z", - "yqe5PlAJAA8KSBOHFg70ectcUcapQpmbmj7IzzR3iQpsrAnLAZ3ACaOK4oh+omyquwER3XK51zO9JogJ", - "RpRRpd+ZIF3K9JJBF6I7sc5HA/QS2sIrHEO4kJuElm3KagAcLowaVBMGN3QCf62e8vksVZrtgm/kLFVI", - "/wXL1mCw0sbqLgyNGaDXHL4RzkeU8arYYpqD79Vy86qI07ZeQc57FAazBHOAXmREMiOzlqy2JbE/De22", - "js3gtL1Vcp2zO97S2JLvXMErrNMyEG11Wg5Q4D227Edm5+UNkSiios8+QHAEJDT300kVjegnc4r1SqhU", - "NDBCH4bNrTvJNjMBCUfmBq+z9hnnD3vLZx85QnF5itoQjfhXZGVC/ddWZhksnsr93Sf7Tw4e7z45aBRz", - "kE9wPYF/Bq5Jy5NbS+2DJB25/LE1S392dgF3n75XZRobId+uveDimQgeaGaTMpQnpM0Hf9J7Ugy1CHk6", - "jgpKIxuXBf78TbIH15i3/qTRnE4m7M9PwdXuH4LGOx8P5O7YK5tlA/kZ2ZOionNJ6iPjrskm4/eGB4QS", - "sjZg5C2RsAJ0ThQC/OkiHMAlnXkUWZRzYSUW4l7E2t/b2zt8/Gi3EV7Z2RUOzgjEz+VZntoZFI4YtETt", - "t+fnaLuAcKZP52aZCCL14ky8pPecIZvVq1/ywNSiz54PS2r4pRxrbN/zuBbkl5YJsouyQAfHqIxBWjrl", - 
"Xmjv7fUf7z86fNTsGFuBayQ+rqYwtp11KBAkIHRe2vk2KMffHZ0h3buY4KAsYOzs7u0/Onh8uNGs1Eaz", - "UgIzGVOlNprY4eODR/t7uzvNIp98CnAb01c6sGXa5Tl0HqTw7IYHFMukt1N3W/gYz2VvzJUOoLlHadV9", - "cBN/4Tzmm0rolRZcVVFb82VFHrcQt7zVRM3hJ5F6nLqs9JoDberKu9pz9wyr2Qmb8GULxybypvWHcprv", - "RPNBEnLwhoRREjralQmelrUCD6tIEhSmxELOsEoCW4BjY+VJsJoB/wsfUjYt+5YvDdhECjRzWB3hD+Pa", - "hk0UVtLvl/NOpAAro2KWCOceOo305VSO/ILKcseCTNMIC1R1V18xZbmII8qumvQuF/GYRzRA+oOqNmHC", - "o4hfj/Qr+QusZavR6vQHo9zAXNEOmMlZ9wKzIZVx8yX8ole5VXFugpt/23y/DWVHmuj/vFanF1p2Mh7d", - "F4x+LCB6OQR2f7df5/dW02nJ4205GmBT2m5R1nfinaP+UZY8zGPdNPajilBc5oNL6/WtFgyUq7z8ljkB", - "1HYqRRdiXIZrIdS30UXczEZaVZ672WxLEpRH3z989PigYaz1V7HaKwozfAVjPY9XMNQ1O3XahGs7fHT4", - "5Mne/qMnuxvxR87OUrM/dbaW4v5UcgRWeLZHffjfRpMylhb/lGqsLeUJlfL93XhCX1Yc3TzGpkbqXlUU", - "Kd9JJ+aXGfBmLO4KbumoxHIVUtq2yWRCAkXnZGTg1s0nU/HNajSHACc4oGrhkQDxNbiroKxJJVakQe+V", - "yXpAavu24X6acsl0nLsDtN3g6N+MZFfBhcPGKRtkOq6TIt9URzUypPHvCisaigYKAoMRPpv8dQZMdI1l", - "yaigfweKhJ1CyuKq9cm0aF7YwuF6Vtsit6v74p38dSyK21/ZzoLUUWKSqxBfdYXWH0HNEYDzWBOdvedG", - "9njWBut9Oir0wV6AN/tqNC4mU1mZraaUeSW/dTcft1my5eXvzA22+XgFB4JNPqzmlQB8tHOwIM/77pRQ", - "ogabFBfr0wDeQXS4UWnfKD7casPvJUTcPr6TsPCl7TgveEE19/lzX/nLk5XsmAfd/l63f/BuZ2/w6GCw", - "s3MXAQqZDaNOlfv4087142gXT/ajw8XjP3dmj6e78Z7X6+MO0k9WcuRWslHaNSREVDOCVDPpSBJRRroy", - "M3+sN0SvCD0ySrkEL4DJWyGRbSIGuEI/K07teXmRxcOLVQ6car7S+/BPs7NfKctUp39yvHraN7InVCfi", - "R7DqVACfmk0GAuZ2bjXTKOhV4fR4AVmzKB/KlOzxJSR+v4KC/WYPbh2lsm7hdoZ5dgp3mJx9sYQ1+esl", - "QPlI7OoEGpVLyNhIi/lKMpfU282e8c5uaZ0DcyGpy+6jg3JWl6Pu30wWFzTqDbZ/+ev/7r7/t3/xZ/Iq", - "iY6SiG5IJsAxX5FFFwJqkEauXjn6FZyrW1Jhm/9KERwDtQuuiKGuMf5YnO+jfqbLXrzG8dISQNSIKcv+", - "Xrsgf5GpJUQzziZ1qVljV+e4kjCLmsqRNvQeFRqjNokTtXDhpU6nvrWZ88tR1qGXjb5lx/3+k9sIM7xY", - "GVf4AyYGLvomuQmt9Upa2v/aYB6/Uu646iNsNN822WHZp7WSwk2qFcVUVxXuNhW0Qa1tA+mmaTWjwAbF", - "uusMGfkpc446rlr3Ov38SqthYWWFmdTvjXFM+8rK5lS6kuY3BJlVMq+PTDOOOZoT7VazYZqELYKC1toC", - "yABWgyAzRCxbO1b71p7ij9kIwIFiucQtwzoKladePoUkTG9dVkQ6cV3ANKq1QZ5+Xcl3h1XLm7GqBrxz", - 
"ffQePEurVlC/urNVQc58jM7qMvOazJEgFVQtzjUZshEABAsijlKDhkCfYBHwOB8cojO/fAFl/MSjk3up", - "pTEaoKOzE8CSGDO4y9HlKYrohASLICI2uG7JoQ3k6TfPTromKjgr4QBVWRUAxGXVPjo7gSS9th5qq9/b", - "7UEVK54QhhPaGrT2ejuQsliDAZa4Dckc4Kc1t+lzCLfeSWhv56emif5K4JgoKKjxu8dspYgwySEkOCzg", - "aYGxSTAVlrNJIjCmGYmM6m/Bn9gR+IG5JToG4LipY65UC6taJMkbu63vNTrIhDNpNnS336/UBMZ58tbt", - "P6SxfeXjNuIyTIH2ZefaJZbPcToW5F86rf3+zkbzWZtv1TfsBcOpmnFBPxGY5qMNgXCjQU+YsXe4MmDE", - "NszPGaBQ8YT9/l7vl0zjGIuFA1cOq4TLOhaNSIQh4+PYlZrtISv6QTignPE0CqGcTGJy2msyipHCojf9", - "hLAIZnROhszeHiZ3LhYQLh0jfWsYvVX5aJihze4bskOkesrDRQW6WXfbujvgtsoA3rhYcqY3SWqqJvso", - "usk3LQPuTbRNGGYqT19sEk1fEXAvm9CP3g4b+UlqggfbQqCuQRZ+v7vlt9BCNJnfueE4e+dqdpcvOS0j", - "UBZEaZhzAuVayd6sS6bmr83HfUU8jNNLaGGBUgy8c1cu4yExQVTJQs04M7/TccpUan6PBb+WROiL2QZT", - "W1jbZLQWdaEwAo0hoNmkatFjbpspbn++IosvvSE7CmOXhsdWYcKR5DZRufFHpRJltcAAd2vK6/uFhWe2", - "cIlJElzMq2qmyVOVpKqHzEKIshHg0BzS7soZCYdMcfRZmCoLiy/bn/MRvwBHTXCo8aTQxCxp+zMNv9TN", - "Wo6wXv0ImnpkEgIAGLb07TJs6d9TgTVHncoZwgF4zeqHxS1tm4PNBXArW1UIB5ihhCdppHk/QCqTf73U", - "B2TTwFGEFBwl963mgWAna9Zjje2+1JDW0m5Mo5VjBEkiC4epv3/oP0+SBIL4xO7/PH/zGsFVBSXFoVke", - "4QkwokzfolnlKz16b8ie42CGDN8Ezv/DFg2Hrbx09BbMNZXWFNDtAuP1C9TUN8N0aPhLr6e7MjzdAP3+", - "2fQy0GcpiUeKXxE2bH3poMKLKVWzdJy9e+8HaJ3B8rxECFDb0P4tlwtJr7BwDZp7A7MQcUtrowXCKKdA", - "Rel+TBkWKxM5eUBvIagFTDyVRWB8HoKKZ9gaDJ2SZ9jqDFuEzeGZ1QQNW1/8ELCJx+o9zU0uK9ssR6KD", - "fn9rvSeRha+HhS411MfvyxL3tXtrjIdlupYZD1vf3obJ6B20ReaB3boHzucpDl2ei58s3hoWz8rTBeYN", - "vi/eAwZ9I2J0xxUOTAvgkePAVkonBi0gTgwkDuf3ZwQO6ji4HHmL4kdVyFwWK/brTlkAU4wc/u3fA/7B", - "uHlmfxj3yX2NiyNTg8rluX5Y6Aib5RCx45eIXxL1PWBc/75IqStA8g3x96Hgz0ti+b4caBVqtg01Povq", - "lmrssyA4lrYX01jLqucwp+45YQo9h6c9+6+TeCBU9EPEpx8GyIAw4lMUUUak9cnIbBj6UrSwhI9Mqsbs", - "O5vtNJhhNiUStc39+c+//wMmRdn0n3//h+amzS847tvG3x8iIT/MCBZqTLD6MEC/EZJ0cUTnxC0GYpXI", - "nIgF2uvbqsbwypM7VQ7ZkL0lKhVMZp7+el0AE9OhLeWh10NZSiSSAEIoTjexLuhG7ekR4d1ZNqC81xPd", - "WZK57AoKC9C3osMB8CmkJhzUyl8tv/bMrLmkP6tqcJd0+uvpiyIflcHerpnghgQGQOw7d/DCLhq1z8+f", - 
"b/UQyBgGKyDMADjmvBvLPPd+0qT1NMlQlDJBASgb2lRIm1+r/z22bZopgG2PP5IGuK4OQL0K2Kg8iCCh", - "g9dPWaGJOtgPN6ca9ulnj13ZwHoF7c3XWxzC+Wk2EoRvb58d7i3D3NbPzEH2LURg1LblzLIUlqUind8K", - "6e/l1ijUds2uDsRN4sx7E8uecTaJaKBQ180FsmTEJBPVygjyUMjBWztrhN26qgG9xfttuxSfUnvTZaEq", - "+ZV397dHZdBNrpE86DjHtZ83yTrUOaYy4PrbArZ0A5zYBJ6GfcnOaRGL1imkjuF5duWsZJeOs6rP9kDe", - "n2rKDp2y6t1wD0TxuEIQvyEhrCQlLITpPyRsvsh20ZVIXqG5+r5Qs39/XNB9a7F8aP6Q1FhhBWyaCs6y", - "UlV16GWLWd3hRtsRPAs/J8KdajNRk+AuX5b5FAUzElyZBdlK3qs4ghNX7LuJ6Gv6+5EkX1NFbAOOxYL8", - "J4vSQNjNYbVKwD2xmRrvTr6FETYSb2/PzmsRzANkcDYZO421SYKI5YIFWz+UqfdebrNqtfAHdJLO0ihy", - "Fo85ESqvpVa8A7Y/g1vSet7enbaV18HF21ddwgIOfmiZD5WfiXIljm6XwzcbZpbyE02ayIQAKocY9Qz0", - "V+y/cRdEWb78f919YTPm/+vuC5Mz/1/3jkzW/K07Q5b+fZHm++a4HzDyaYabloEGpMkUIlrHoWatGjKp", - "rv0PxafaonabcKoZXH8yq02Y1SK4VvKrWX3BO+RYbSm2b2OSyZDNB2145fwTfzBO9X61fBYjC1X7S2YP", - "m3KSi7z8ma35/fAcKGmGccVro6G6Oj+QK68Ph7onxx1b2c7Uo8sCRO5Jee3mce/MrR33/jXXR/GYTlOe", - "ymLsCRQyJNIGK0WkTIAfGtudX8+1jPd3jKX9+7w67p2v/on3d8TxVzfUEG9jgVrH87tWTXl+2x5KBppq", - "FCZ27a2rcmHTqGzVOBW6OjBN0bhUsmjZ2dE3L58sgi60oJKLCwgkiMGQ/YeWP35XBMfvf3FBMmm/v3sA", - "zwmbv//FxcmwU4cqhClBiURYEHT0+hjMflOIXodkaHlIXnUeJsWZqQ1ty5b+jxOQcstncwnJYeFPCamR", - "hFQA12oJKauicpcikhnkm8lIDt98ALepNX5KSfchJcl0MqEBJUzlGYCXnMRsAvEHGFvGrH2o4NxRumgb", - "S0l5aaPVDGie9u7eHXuywe9fOHIZ9h6mjzw3UTGhE0fyy7BeHvne8KF/v8T5/uWQh4xihuGvgm6ZEG1P", - "bAJiP4PwgourppjnycN56wh4+9xJcYXfIW+ip0cKVQ6/IYsCl7fxrddIU+Zc7uFALiVX/ZYunQ4SVrg1", - "QZGUTfM6l1TNeGqyqozsQ5OVTZ8KW00GWJ7A9vqtyYse/R4Y0NdcIRonEYkJZG3rGmyC4qJpknCR1R+j", - "spCKeDPyp49N0cHWJLexVYA7yCZsBmWd27A26O2Xt8tLNSM+XR9Umw3uIkg9UbVDdiFNkpcPhhX+gDIi", - "ixRHkkQkUOh6RoMZRNjqZ9C/CcDFSfIhS6mx5YqlFjOLwOBtSQTFEVR55JGpV/phHscfBssZ4C5PT+Ej", - "E1xrcr19GCCX9S27IKRuVYyY1auIsFTotY0DbmtMEjyKzI5+0LdQYX1bNpY2T3kyZL64WkaubYd0gj4U", - "Qmw/1MTYOoL6ik/lt+KXOvWJqsxaFEcCAGdwk7CwVafYoZE/unan3/flT2kY6WumcceBvkuTecWnWZKs", - "EirjJGmKvnaagMXzOF6Bw6hdSGYuVchT9VepQiIEfGyxuw65URsH5g+FrzSiMluKzKWDB/Tzqi9N1hov", - 
"qDRRLeSTNn/N47jVadn5eKrnfn3EdLXDZTWb3plCWPRPTnuTgOcysS9EPFduDlu3op7ltuU4fnh5z5W7", - "/sZo+A30Y/ksKHOsCuxtXkf8YUVOmkotVV7MJM/3nZGs1Ev9KSkrlc/zNP3/A0VUs9ZqfZ57FlIzEPsk", - "s1J5i28unWbVNn5KqJmEygUKUzNcpd7NDyt2ZgQFpawkeVr29KayZ5ZkLgMz1CFkKw0COc3b/ux+ntyA", - "XfhOKGGntupLXTqjfNHfA8mtqYnWiOZ+Iz7JXqsFBuEbkmBXne2+KXAGFS3uZVTuuyDD5sBl1LhIc6Dy", - "PnWFF38S45Ia0GhKb0qMHfO5pAsskGfKukmE6+iy5VNrCbCtAvXDy2u5rPKDS2wBF8K4joEz2kMKXSzY", - "DAuiZzvBqSSd7MB0nN368vR0q+7QCLXyyIjvw6B9M86hUpYzDv11kQUNXZL6Z6fHNqU9lUikrIfexBQy", - "x18RkkBKSspTicAHsFesN1ZXBS0rKEaYEouEU6bWziJvejeT+XKjJN33TKds8PYPr1ayhXYfGpEC2qFv", - "b7uA1UKVMmX2vGY6Z7aizGTW18wHHvNU975UDw1NaETkQioSG5vdJI3gEEF6D5v91X5nfNc6iCoJ1cM7", - "4OuTEBFTKSlncsjGZKK5koQIPTYUnKQRKZgffJatc4UzqnlmSN/3YdqCEmlgzcGqDmrl6mg4SVx1NJ/5", - "JCvoduMpvQBbFZKLeMwjGqCIsiuJ2hG9Mjw4mksU6R9bK41dI/jutnPb3vxkaUifsAn3pv8zOJsh849A", - "4U4qZM0Z8x8cWXtJiofF0R/YaD9Zk2vpmiA4giKgmZstShWN6CdD6nQnVCoamJpJOIMdlHsx4/WG7JQo", - "odtgQVDAo4gEyukathPBg+1h2u/vBQmFeIg9ApMDglf/OoYRn51dQDtTkqYzZPoP6Pjd0RmiGqYTbEXm", - "wkRtYXt0sv1mjfn/HMD0P1geMwtcdSz8G/7Tsru5D2XtGZI1R5QnqwQgnvzwCgPLwf3UFjxMbQE4sWer", - "aU8FDoAplrNUhfya+TUDpkKq3P5sfpysC4VQOJhdulLR3we3a6vFrhvGLfBBHEq7ppCY9KTfRF9vC/o+", - "0HROGnBuCcDEFIM6/LeAKRT+o2H37RvrinD8Di11FqIu9e93c7bu++azc3ARfkV4PJRjbjDNrQRKVha1", - "T1k441rZLEiFIExBKpictQxwggOqFh2EI1dN1ZZHynRIeSH4sSD4St+0vSF7mwVS2vJMWrrqONEKhVRe", - "mR6s9NRDb+ZEyHScTQ4BYTJyHgDfFlQNcBSYSqRkMiGBonNiSoTKGukrm8pdpuXNB/FstHtpQffQRA4/", - "TsDu5WhhpY6Sp1xt+obzrFWz9A1ZrwVvmIKnyEqf55FraKrgb6Ky8wx+RWvd4u2rzbzXftMfNRy77CXl", - "n4R99ZWr/FGy4p0XnFOaJn3IMfyh5V8ozLx0VEsOXusDwRt7dN2lh9W6QPBs8PsOBD/3Ovk8sHRUuOS2", - "VRcB/v0hQv9+vYvvOwL8YeOWZiXkEujqKVGDSPDvAgPvJgT8G3vX3yAE/Lvy94QQ3m/nd/9deXpaj8XM", - "0/NnkPddOniaSG8IaK1z8DRUz2qeVwpKl7ZNMzHJ9vgjcfBWWbkB/+7A/jNlWwORoQAsdwtXyA3QfmkR", - "nsSJWjhtFJ+A302eU1DST+C95wucy5TOdxevdgN97O2hh8PTWm3sz1Rv96bwzfNhnxw//PxuxTNXuli2", - "9a3TxSKY0XkpXmvVCbYgSgTpJjwBPWtoAGbh4e4yhUVv+gnZ7ntD9m5G3F+IumwZJEQhFSRQ0QJRpjhQ", - 
"BDPGXyQSXEsC8J6LhU99Wzy5LwSPj+xq1tyH9kxZZVju5hcvuiFWuDt31GaFCu0rTFan+CON0xgIHqIM", - "vXyK2uSjEiZ5A5poyQfRSQZS8jEgJJSAk1vFCe/0azSb9BMZTcdNZrkiDccbm+YEBalUPHZ7f3KM2jhV", - "vDslTO+FZvUnwMkmgs9paHLk5kCd88hAdacGoJvqXTVTYf3Bc+HCTO6b8DBNLqTpJ5qUyYJxe2wNWmPK", - "MExubcKL8pkyHrh6PEzBDy4/Ow5zWj+vsGqVbY2JWshxQFSco0hz9Fs/r7mHfM0VPRncnVa67ZplMW3m", - "3NDQ5+AuMphmji/3q7a+/H7s8YWqxA9QdT7PBNI6tfn3hYL9+7sf7ltdfvmA/bdeEid8F1Tl0IHu0Ycw", - "r3iAIxSSOYl4Emu20rRtdVqpiFqD1kypZLC9Hel2My7V4LB/2G99ef/l/wcAAP//du3TWocdAQA=", + "H4sIAAAAAAAC/+x9a3MbOZLgX0HwdmOoHZKiHpZlbXTsyZLt1rZl6yxbeztNHw1WgSRaVUA1gKJEO/x1", + "fsD8xPklF0gA9SKKLMp6WGtvbEzLLDwTiUS+80sr4HHCGWFKtg6+tGQwJTGGPw+VwsH0gkdpTN6RP1Mi", + "lf45ETwhQlECjWKeMjVMsJrqf4VEBoIminLWOmidYTVFV1MiCJrBKEhOeRqFaEQQ9CNhq9Mi1zhOItI6", + "aG3GTG2GWOFWp6Xmif5JKkHZpPW10xIEh5xFczPNGKeRah2McSRJpzLtqR4aYYl0ly70ycYbcR4RzFpf", + "YcQ/UypI2Dr4vbiNj1ljPvqDBEpPfjjDNMKjiByTGQ3IIhiCVAjC1DAUdEbEIiiOzPdojkY8ZSEy7VCb", + "pVGE6BgxzshGCRhsRkOqIaGb6KlbB0qkxAOZENY0pKHnBI5OkPmMTo5Re0quy5NsPx3tt+qHZDgmi4P+", + "msaYdTVw9bLc+NC2OPbrXd/IlMdxOpwIniaLI5+8PT39gOAjYmk8IqI44v52Nh5likyI0AMmAR3iMBRE", + "Sv/+3cfi2vr9fv8Abx/0+72+b5UzwkIuakFqPvtButUPyZIhG4HUjr8A0jcXJ8cnh+iIi4QLDH0XZqog", + "dhE8xX0V0aZ8Kj78f57SKFzE+pH+mYghZVJhVoODJ/ajBhcfIzUlyPZDF6eoPeYChWSUTiaUTTaa4Lsm", + "WBFRJBxitTgdLBXZNpQzpGhMpMJx0uq0xlzEulMrxIp09ZdGEwqCV0ynWzSabPGqpeYkh7GsG901QZSh", + "mEYRlSTgLJTFOShTe7v1mylcGCIE91CoF/pnFBMp8YSgtiabmnYzJBVWqURUojGmEQkbnZEPEcxm/uAj", + "REPCFB3T8v026NTFo2Bre8dLO2I8IcOQTuxLVB7+GH7XKKbHUQha+zeiL9q82T5gSkHGi/O9BNINkwgy", + "JoJoHP/G6RLBZ4Tp26Ln+xeYt/W/NvMnetO+z5sAzLO8+ddO68+UpGSYcEnNChcol/2i0QhAjaCHf83w", + "adlZFzBKKiyW3w9ocQs30ayvEWzOTdOvnZbCk5Vd3us2VdoJpNFOWaICtSTyxYwwD5MUcKbshzJ0XvMJ", + "iigjyLawZ6Fpop7gl4gDSbwlOGTgX7z8et03IF7mh5rR9LdOi7A01sCM+KQIzSnBQo1ICZg1T5gdKF9d", + "LfjPSten8lZhSYbLKcgZZYyESLe0F9u0RKkETnVh+3CLLqkazoiQ3jsHy/qNKmRb1A4V8eByTCMynGI5", + "NSvGYQj3FUdnpZ14uLUS+4sTTQTdgMBFSKQ4Ov/1cPvJHrITeGAoeSoCs4LFnRR66+FNW6SwGOEo8uJG", + 
"Pbqt/0YvYogfA86zi1H39mQY6BDTULqWPU09fKeVpHJq/gLarVcFb58mAxq9Iv33R8+mj4BIGCmhVmby", + "84BvE3PYaBJxDdM5Shn9My0x2D10omUFhfRDQUMSdhCGD5pk41Tx7oQwIjSdQmPBY+C2CkwwapPepNdB", + "A80XdjUX3MXb3X6/2x+0ymxstNudJKkGBVaKCL3A//c77n4+7P6t3332Mf9z2Ot+/Ou/+BCgKWfuuEK7", + "z7a7+x3kFltk16sLXcXK35j6F5fvozjmqE80nVj3pI9OFhkHs9eQB5dE9CjfjOhIYDHfZBPKrg8irIhU", + "5Z0vb3ursIB9LAECm2gwrQmGitADaNyO+BURgabAEdGIJzuaCFMlOwhruRmIF9Kv5L+jADN9FwxzwQUi", + "LERXVE0RhnZlaMXzLk5ol5qltjqtGF+/Jmyipq2DvZ0FPNdI3rZ/dD/+m/tp4z+8qC7SiHiQ/B1PFWUT", + "BJ/Nqz6lEuVroIrEK0/EQTeNgM2LKTsx3baylWAh8PzbT9htZNlJG2Gu9qiD2MP5v50RIWjoXtWj02PU", + "juglseiORMrQIO33dwJoAH8S+0vA4xiz0Py20UNvY6r0a5bmj7TRBvWKx/17iwRTDnxGFHG9oQzUNUxM", + "DkNDhzzHeew0KRJZ6RzeVQx6MjjeV2cfNjVlS7CUaip4OpmWV2XJ6nrrofJySPlwlPjWROUlOtl8izTR", + "RxHV0MmI/Fa/f/p8Uw5a+h9P3D82eujYgAyWr8+PC/v2yCkWBDigEHGGjs4+IBxFPLDy51gzqmM6SQUJ", + "exW1B4zuuxyEKTFPOPUxwBXMyJsuIki3m39dAw82R5RtSn0M3WA9uBM2+wY27AWbUcFZrFnhGRZU07iS", + "EupL683b4xfDF28uWgf6EoVpYDU6Z2/fvW8dtHb6/X7Lx+loDFpxx1+dfTiCk9Ltp1wlUToZSvrZQ4YP", + "s/2hmMRcGPHD9kHtaZlKG+4MweEMWjuvnhvk2noFeOUOJaQSWrtRzMBljNl+9dyHLdN5QsSMSp+O4tfs", + "mzv5Ak01hKmM25KIGREZ0gIW9wq8XxDxNOwWpuy0xlSQQGCNdq1O608SayZo9lmjTr52Tz+/6qDR47/i", + "VcdRQhlZ8qx/J8/rFReXEcdhd+uWX1dGlB57cYtvzIfy+VqcIBlKtDoLoiALr2iopsOQXzG9ZA9dtV9Q", + "1jgjrtd6Jzj659//cXGa86hbr0aJpbRb20++kdJWaKse2it/ZhtJE/82PiT+TVyc/vPv/3A7edhNEKbx", + "MyzZdYz6p7yV/5oSNSWi8OK6A9Y/GQECuiOHL4XpS/qkohFogbjyGRERnheIpV1Ta6sPFKuyKkEV3C/b", + "T5O+S6Q7ryCdejT3ML+qCjXbfT9x9CzKs6bn+n5bWt5kJdlCtrZP7Z/bi0uqWdElTYYTzQsO8STTcS0z", + "z51f0gRBjy70MMcYRebyhqkeGY04V70B+68pYQjODg6YXJMA6JQW4tHh2YlEVzSKQCIGQrD4HAzY+wIp", + "MM2l0v8rUtZBo1QhQWKuCLKMJkySwlqg8YiglGFn/+sNWBEqdoNVvLJguSSCkWg4JTgkQjaEjOmEbKda", + "4MBWx1gqIgyFTpMyvI5/Oz1H7eM5wzEN0G9m1FMephFB52mi7/BGGXqdAUsEmREGMotmKqidl48RT1WX", + "j7tKEOKWGMNgmU7BGqdmr84+WPOm3OgN2DuiAUtYSEJYs3slJFJTrFDI2V/0jSVhedji/BWg++/yOrJP", + "pzULkrR8ItvV03gDBki99xkVKsWRJm8lDs5rjzSWbg+nbgzpRYnBkq0MObEqG5KaCohmZDB7L/KxfjnP", + 
"MCf1ct45w4mcclUr511SFq5alxvkN922lk/J9F7SNr9rViURpJsmE4HBUHubjMqNpW+AZv1prPDB8Bnb", + "MqgGqVQ8LpjcULuiKKRllWIZWDMedUOsMDB1DTlPs9xF83U8N0OZK1L3vg0nI4/2WT9jlKEJneDRXJUl", + "qa2+7yJ+qyrErcV3LHVuIOZik3Co+HJDOB0j17aJ3QucRoaKD2dj6hk5Y41yLSqVKKj4nFhyo4foJgG1", + "RLqDrqZUM1MSOSAAnb44LWoxegPWhYflAB1nE2TDZkPqiwkacxiizUVhERSMH2g030AYXZz20PtstX+R", + "iGFFZ8T5xUyxRCNCGEqBCSchzA+PZnEBqdQvFVXV7vZFMi40G6Cs4fZbD2khMsb2dddXIcaKBqBwH9HK", + "fsAoag5Kz6RJNyvyFo14gWXuA+/IhEolKs4DqP3u5dHOzs6zKle4/aTb3+puPXm/1T/o6///W3M/g9v3", + "EvKNdVimLdaEUaQ+Rx9OjrctC1qeR33exc/2r6+xerZHr+Szz/FITP7YwffiR+QnZce57QW1U0lE15FJ", + "jVU+i0vBsFFjUbmxoeSO7B65GXdZWwOJ97rlXThI+Uzv1vC7vgtTlWCuNN4XNrewH/2r5gLzW1JQIFkb", + "WUC91sBjKi+fC4IvQ37FPO+2ZsLk0Lxnfs1uqiXr0RyRa82wkxAJztVYGg1SmRnd2n26u7+zt7vf73v8", + "ghYRngd0GOgXqNEC3h6doAjPiUDQB7VB9A/RKOKjMqI/2dnbf9p/trXddB1GcG4Gh4xXdr1Q20Lkr87H", + "1H0pLWp7++nezs5Of29ve7fRqiwb32hRjuUvsSRPd57ubu1v7zaCgk8R8cL5aVV9SUKfEjdJImrULl2Z", + "kICOaYDA0wvpDqgdwxNGMh1A+U6OcDgUlr30vh0K00gu1R2byWxL49YXp5GiSUTMNziQRvIM7PwYRvLp", + "5SljRAwzN7Y1RrLebSt1pW4vWRNU8lIsge6USuBCcuaJkig8MDd0JZ2D08wX9rEOD+weGmLDay06dSMy", + "I1ERCczTpRcbc0FQhifm0Eq7omyGIxoOKUtSL0rUgvJlKoAXNYMiPOKpMsobOLDiJGA7B9ljrMl1MzeP", + "l1xcrrRC6pd4KFLG9DAr9S6HUcSv9BFfatjAK46R7e0cXQpMX6ZkMaoo+12id6aHUVXlPyepQpQprqVT", + "Fo7mHZiJhNCOIUGk4kBJcXCpOUw7TFPu0s+3vNEMi1OEm/ly2nlPVoDu2Chhb1fCFhOihlJhtZJj0Zjy", + "HtqfQ/PGTg2640oFSAO4M3J1H0AHr4+uRtuuZDi5G4gvM8tluoa8EbzCgoakh+B2gX3AeZlWbtq54klC", + "wkz/0xuwc3NVsp8kilMJOs9LAwc1JVQgLuiElie21+Ye7HvroKLDphujY7HjIocKH0EpXn/p8VgRYSDo", + "HOiLXnD2EFqdloV9q9OylKgMGvejByK50Xlhia/OPqxrpUsEH9PIs13QMNuvVjJz9qvXu/3z7tb/MbZo", + "jW/AolFmtNIxD0mvEqMC7Zu9PK/OPpzVrSkLEELF1S3sKbMjeChHpm52ELEa8gAzNCLISjAO/fXDkk2S", + "897PfLzsWOCYjNLxmIhh7FGuvdTfkWlgDEaUodPnZX5W881Npeaz0uGA2DzGgY3vaAZ9j0Kuso1OAZof", + "/cf1jphnuM4rVB+VsG2sY2gPvclCstCrsw8S5bYfj6aufLy1nkNn07mkAY7MiMbJm7Kigg2QszGHfJZ3", + "tKpID58ce3lDdxFQezZJUriG5++6J28vNuOQzDqlNYG9Zsojote9UaAWM+cbmrs5lYjErE7TYRBDNr1A", + 
"BVhlN7gxkAr31QMdxRWOhjLiyrOa9/ojgo+offHS+O7pFXRQUjpK/XsBCiX83vPeGE2R6qY9hwmrKtPS", + "BffKjuVIRqNeKWyvNKnvqvxKcGQCOMv4nIcZuIPnl+WD5pcrb68dxDfviXORaeBDeHR6bBiGgDOFKSMC", + "xURhGy5acPgCdqjVaXX1GxViEoMBc/zvy52/alTwGbosU+IeLUR/3YkCtyZqQRO5aEZCFGNGx0QqG7VQ", + "mllO8faTvQMTWxWS8e6TvV6vt6633ovcPa/RUWwaZ6aC415PTr/tHO7AKa/JXr60zg7f/9o6aG2mUmxG", + "PMDRphxRdlD4d/bP/AP8Yf45oszrzNcoHI+OF8LwyiZN/WaZ3w/0ThgJMoTkIMCvNDHVyDMaNSP6mYTI", + "67Wu8ETLJwbjvtU9/cYBbHkUtSoErhXt9A2C2Ojn5ZpQxxhBGztnyhSN8vi+RR3ojSI05dIgloUAloSw", + "LGwlisxfAWczfSt8MSwlAu6+fZP94MoIdMOQejD5v6y0F2rBS4F/6er71trESbIabf2MYkb/msbuWQ97", + "z0v04FT/Jja28uxvJ//55/+VZ0//2Prz9cXFf89e/efxG/rfF9HZ22/yJV0eXPGgERK3FhQBhqVSZERT", + "VDrFKvAwVFMuVQ2E7RekOIp15x46AsHvYMC66DVVRODoAA1aOKE9C/hewONBC7XJNQ6U6YU4Q3oo6z+2", + "oTufGfWP7vzFyZZfq2OE1lFM2APJfDplOgp5jCnbGLABs2MhtxEJNn39V4gCnKhUEH16moeN5mgkcJA7", + "iOWTd9AXnCRfNwYMJFxyrYTeQYKFyqLB3AyAFHZVxmfANichmuEoJdJKyAOWvUsg8utBjI6mlylBQDdf", + "0azWAMUrvnBRdnDc73c854h0O32QEZWKMJRpO6gEREdt56m63y+Riv3+fn8lg5/h0BL0g5uwmKvFIWWD", + "u2QQGKY2hHs4VSppoEvXtMncEfTr+/dnGgz6v+fIDZTDIjtiI+ThJIkokUZHqCLgdaxz8EbL7/CnT7fh", + "hoySDLpFDXwxX8DE6P3rc6SIiCkztL4daHCOaaD3B+Z/KmWqUZFidHh0+mKj1yDZDMA2W/+Sc3yf7bBi", + "OXZKszpdYIbxGr4ddHLc0WyavaE5AwduNS+5QJEhMPm9PkAfJCm7J8JRGau+OclonmvezAswaG24EZMq", + "pThA7zK+EWdLyaJUc2RwQ+b3Eoa1hhfj87Mweqe8VvBmsnKRJW3g4YMVsnZOeLbrScHy6++BONx5zqo6", + "zfXudlEZqifzo0Z+9nfOreysK6OuG6pW9kwvRCJk0WrNw8zuIlxrUV67pmpYa4RH+rM1uTup5OIUTbFk", + "f1HwsSKbbO08bZS0Rc/a1HxdNFzzsVlSdqucm3tmdjUO/5c0iow3g6QThiP0DLXPT179dvL69Qbqordv", + "T6tHsayH73waRK051H519gFCwbAcOgtQvdMjzh2HyTWVSi5GATQypC6Pkvu1FMnmDavYuMXwNmd9XtjG", + "fQSuPaRb3/cXNLc0zO1bY9Uss3tHoWq1xNUX5lWms+bn2w06u5PllMLHfPShyBM4n+sbR4x1WtTjb3oo", + "NQkkITo5yzNt5EopN3xlT8+2e1t7+72tfr+31W+iootxsGTu08Oj5pP3t40i4gCPDoLwgIy/QUVoEdsw", + "bzi6wnOJBo69HrQMP19g5AvX1rLgjcyvi4F5N4vDqzIUqyLt1omsaxYytyRd1nk5UVZjHu3J374ppxZp", + "+jJb1wXba7iO8pqggKdRqPmgkb55RqwioZX+JFF5DjK4rB/YJeNXrLx1o8PU9/fPlIg5ujg9LWm8BRnb", + 
"FEsNNg4uDzXnwJO1jmF7Bau8cjU3jF67j4i1KtUsvFa3Hp9WVLk5F0qDoQ1Ubzn36DV7U2aORuPJkj1V", + "lCYhmQ3T1McU6U8ucOLDh5PjEnJgvLe1399/1t0fbe11d8P+Vhdv7ex1t5/g/ngneLpTk+SwudvLzT1Z", + "yre5PlAJAA8KSBOHFh7o+5a5ooxShTI3NX2RjzR3iQpsrAnLAZ3ACaOK4oh+pmyihwER3XK5V1O9J4gJ", + "RpRRpb+ZIF3K9JZBF6IHsc5HB+gVtIVPOIZwIbcILduU1QA4nBs1qCYMbuoE/rV8yefTVGm2C/rIaaqQ", + "/hdsW4PBShvLhzA05gC94dBHOB9Rxqtii2kOvleLzasiTtt6BTnvUZjMEswD9DIjkhmZtWS1LYn909Bu", + "69gMTtsbJdc5e+ItjS35yRW8wjotA9FWp+UABd5ji35kdl3eEIkiKvrsAwRHQEJzP51U0Yh+NrdY74RK", + "RQMj9GE43LqbbDMTkHBoXvA6a59x/rCvfNbJEYqLU9SGaMS/IisT6n9tZJbB4q3c3X62+2zv6fazvUYx", + "B/kCVxP4I3BNWlzcSmofJOnQ5Y+t2frR2Qd4+/S7KtPYCPl27wUXz0TwQDOblKE8IW0++bPes2KoRcjT", + "UVRQGtm4LPDnb5I9uMa89SeNZnQ8Zn9+Di63/xA03rrek9sjr2yWTeRnZE+Kis4FqY+MuiabjN8bHhBK", + "yNqAkXdEwg7QOVEI8KeLcACPdOZRZFHOhZVYiHsRa3dnZ2f/6ZPtRnhlV1e4OEMQPxdXeWpXULhi0BK1", + "352fo80CwpkxnZtlIojUmzPxkt57hmxWr37JA1OLPjs+LKnhl3KssWPP4lqQX1gmyG7KAh0cozIGaeGW", + "e6G9s9N/uvtk/0mza2wFrqG4Xk5hbDvrUCBIQOisdPJtUI6/PzxDenQxxkFZwNja3tl9svd0f61VqbVW", + "pQRmMqZKrbWw/ad7T3Z3treaRT75FOA2pq90Ycu0y3PpPEjhOQ0PKBZJb6futfAxngbB3pEgwjQ+DJzz", + "SuX1AetWOBSmWX4ITR4GK+MvPFwN+jYS0SrJnw1rwAVKmTSpQky6txXazJspJ+vJtHkPVpPxRR46wkyD", + "y7rom/Q0N4BdIsiM8lTewkBckUAj0zjiXKzVt84b6B2RaaSMBpFKdHH6FyAiGrmQVCQpe7pb9FsSyHDD", + "za11gUs44cfqOmA1Oo0mR79sw52aa9pZ5sVauv618UKhJlUpW215PsJRkEZgX87OU+8KPP95qsBOPjc+", + "GlHEOUPBFLMJgaSUJn8MmyCMpjwKey2/pSMKh2OvBYJfoYibTAeXhCRA8N0idDfNs9AZQe1XPC9LYFCp", + "kifsSWyoCjevRxkbn8Q1JV6kz+8viy/S8MSKF4LwTZeSNB/xiQQpUIH3Sa+a+yXBwjiVYMZBQTyLjfBY", + "Dpza1q+9Z4kV6u17Qs3TycdWorU8huIZJHEguJSIRHRCR5F+UsvLXOY+GFNGY01nV1uTy4ttgLoy4Uz6", + "0prAm2Y8D5oofHwPoscv61ueRMBhcL/0av6NHd560qMYM83slRCZXCdUGPRoZtuecqmGWTDImouVaqh5", + "ZJkKkkeMufdyCu73c0PioI33XXSk7Sbgsk4PN+q9gFX+oeoWWE9TvRD1Q6uT4aAPjRfDYZZG4OQhPdX4", + "jXUCtvKkO1TCqLQQK4TajKsSWSokjtloYmfyy6h6nrqyQK93++dNY6mWh06dYTU9YWO+SAnWUfhbh3Tn", + "epAQEVMJRRBCwigJnfCYaf6tbgtc3CNJUJgSCznDkApsAY7N9U6wmoICEjpSNqnQ+uqETdTwZg3LUyzB", + 
"vLZhE4uh9DtGvxcpwMrY+CXCuYt0I4cFKod+TfHiwIJM0ggLVI0XXLJkOY8jyi6bjC7n8YhHNEC6Q9Wc", + "M+ZRxK+G+pP8Bfay0Wh3usMw9/CrmGfM4qx/pzmQyrz5Fn7Ru9yoeJeD6mXT9N+Eum9NDLBet5+XNCI2", + "pO4Do9cFRC/nINnd7tcFHtQMWgo5WAzHXJdyW5T13XgXKXmYZW/1uJcZB56KVaKsiCzt17db8BBbFmax", + "qIpBbWfTdTleynAt5FpppAlp5qRW9V5wq9mUJCjPvrv/5Olew2Q336TrXFIZ6xs0m7N4iUaz5qROm6jN", + "9p/sP3u2s/vk2fZaCirn6FJzPnXOLsXzqSRprijNnvTh/9ZalHF18S+pxt2lvKBSwuUbL+jrkqubBznX", + "mD2WVaXMT9LZWcoa0GY6xiXc0mGJ5SrUFGiT8ZiAUDk0cOvmi6k4xzdaQ4ATHFA19yhM8BX4C6OsSSVY", + "t4k2rbxYD0jt2DbfgqZcMh3l/phtNzn6N6Nar+DCfuOcWTId1anx31ZnNUr8XAdUNBE1sNAYjPCrC7L9", + "XGFZ8urQfweKhJ1CzYiq+49p0byymMP1rLhY7tjoCzj3FxIrHn/lOAtq3xKTXIX4sie0/gquJUN7XmSP", + "CB2sdqqt0Af7AN6s13BUzGa3NF1gKfVd/uquP2+zaheL/cwLtv58BQ/OdTpWE3sBPto1WJDnY3dKKFGD", + "TYqL1XmY7yA9j/EpuFGCHuuOcC85euzPd5KXZ+E4zgtu6M2DLlwvf33YkiPZXre/0+3vvd/aOXiyd7C1", + "dRcRopkTSZ0t/ennraun0TYe70b786d/bk2fTrbjHa/b7R3k/64UKaikA7d7SIiopmSrpjKUJKKMdGXm", + "f7LaE3BJ7LdR6SZ4DkzeEolsHTHAVVpccmvPy5ssXl6scuBUE8bfR4CAXf1SWaa6/JPj5cu+kUNHdSF+", + "BKsuBfCp2WIgY8HWraZ6B7sY3B4vIGs25UOZkkNkCYk/LqFgv9mLW0epbFyeXWGeHsxdJufgVcKa/PMC", + "oHwkdnkGs8ojZJzUignjspig201f9t4eaV0EWdU4VEyrd9j9m0mjh4a9g81f/vq/ux//7V/8qVRLoqMk", + "ohuSMXDMl2TehYhmpJGrV04/AtFtLamwTUCqCI6B2gWXxFDXGF8X1/ukn+my529wvLAFEDViyrJ/r9yQ", + "v8rnAqIZb9+63PixviGejKXUlO52FptCY9QmcaLmLr+H06lvrOd9fJgN6GWjbzlysv/sNvI8fFia2OEH", + "rMxQdA53C1rpFr5w/rXR1H6l3HE1SMtovm226XJQUSWHrlRLqtnHPGVqCKrnRf2a/mbU2jaTwSStpnTa", + "jJnatHlTfKb0UHP0yw0Z+S1zntJd6LRaP7/U66Ows8JK6s/GRAYspitYAqAzDZqrKRGkcBDQIU/+sCbI", + "rJK5gYOGyW6QENGtpiM3GfMEBa21BZABrAZBZohYtHYsD246xdfZDMCBYrnALcM+CqU/Xz2HLJjvXFpq", + "OnZDwDKqxdmer8aiZTBxWLV4GEWsWty3ae+9eJZWLaF+dXergpz5HCXUXMRHTeZIkAqq5ueaDNkQTIIF", + "EYepQUOgT7AJ+DmfHNJjfP0KyvixRyf3SktjNECHZyeAJTFm8Jaji1MU0TEJ5kFEbHaDhYgCkKffHp10", + "TVqWrIYWlMVXABBX1uTw7ASqJNiC9K1+b7sHZUR5QhhOaOugtdPbgpoRGgywxU3IpgV/WnOb8bShnJ2E", + "9nV+bproXgLHREFFs989ZitFhMnOJcFjFE8KjE2CqbCcTRKBMc1IZFT3hYAuR+APzCvRMQDHTSOjpJpb", + 
"1SJJ3tpj/ajRwbi2wBa3+32TOp0p+xzgPHv+5h/WBSmftxGXAeDxRDctsHyO07Eg/9pp7fa31lrPyoT3", + "vmk/MJyqKRf0M4FlPlkTCDea9IQZe4erw0psw/yeAQoVb9jvH/V5yTSOsZg7cOWwSrisY9GIRBhSbo9c", + "rf8esqIf5GOQU55GIdTzS0xRIU1GMVJY9CafERbBlM7IgNnXwxQvwALy1cRIvxpGb1W+GmZqc/qZm81z", + "Hs4r0M2G29TDAbdVBnA17luSIcS/DevyPmZ6k4QyBvnj80rILgHaAkU3BT9kwL2VTgjDTOX1I0ylj0sC", + "/v1jeu0dsFGgiiZ4cCwECktl+Y+2N/wWWgjn9zs3HGffkAVv+ZHTMgJlQZSGOSfgjAZYjHAUeT2ZJxEf", + "4cgWRLkkHsbpFbSwQClmPnBPLuMhMVHsyVxNOTN/p6OUqdT8PRL8ShKhH2abzcbC2lYDsKgLlaloDBll", + "TK48PeemWeLml0sy/9obsMMwdnkQbRlMHEluK8WYgCAqUVaMFXDXn2+hRm9yZCvHmSoNxcT2Zpk8VUmq", + "eshshCibggeaQ90DOSXhgCmOvghT5mr+dfNLPuNX4KgJDjWeFJqYLW1+oeHXulXLIda7H0JTj0xCAACD", + "ln5dBi3990RgzVGncopwAGFL+sfikbYz71LNrWxUIRxghhKeGM9cQCpTAKc0BqQzw1GEFFwl11fzQHCS", + "NfuxxnZfbm5raTem0co1gizdhcvU39333ydJAkF8Yvd/nr99g+Cp0mdgmuUpNgBGlOlXNCs9qmfvDdgL", + "HEyR4Zsg+nLQouGglfG84QasNZXWFNDtAuP1i17aL2aaDg1/6fX0UIanO0C/fzGjHOi7lMRDxS8JG7S+", + "dlDhw4SqaTrKvn30A7TOYHleIgSobWj/hktGCY7T+TNo3g3MQsQtrY3mCKOcAhWl+xFlWCzNpOkBvYWg", + "FjDxRBaB8WUAKp5B62DglDyDVmfQImwGv1lN0KD11Q8Bm/m1PtTPJBO1zXIk2uv3N1Z7Eln4eljoUkN9", + "/b4ucF/bt8Z4WKZrkfEwm3NxyvoETVpYw27dA+fzHIcu0dhPFm8Fi2fl6QLzBv2L74BB34gY3XGFA9MC", + "eOQ4sKXSiUELCNQHicP5/RmBgzoOLkfeovhRFTIXxYrdulsWwBIjh3+794B/MG9eWgnmfXZf8+LIFAF1", + "hUYeFzrCYTlE7Pgl4ldEfQ8Y178vUuoqwD0g/j4W/HlFLN+XA61CzTahyHpR3VJNPiMIjqUdxTTWsuo5", + "rKl7TphCL+DXnv2vk3ggV8eniE8+HSADwohPUEQZkdYnI7Nh6EfRwhI6mQC3rJ8NBnWhZW3zfv7z7/+A", + "RVE2+eff/6G5afMXXPdN4+8PqSg+TQkWakSw+nSAfiMk6eKIzojbDASLkxkRc7TTBzbTVp73JK+XAzZg", + "74hKBZOZp7+N8pJ2QFtLTe+HspRIJAGEUB14bF3QjdrTI8K7u2xAea83urMgc9kdFDagX0WHA+BTSE0+", + "Dit/tfzaM7Pnkv6sqsFd0Omvpi+KXCuDvV2zwDUJDIDYd+/gg900ap+fv9joIZAxDFZAmAFwzPkwlnnu", + "/aRJq2mSoShlggJQNrSpULeoVv97bNs0UwDbEX8kDXBdIaZ6FbBReRBBQgevn7JCE3WwH25ONezTzx67", + "us31Ctqb77c4hfPTbCQI3945O9xbhLktYJ6D7CFEYNS29WSzHOKlKukPhfT38moUiutnTwfiJnP5vYll", + "R5yNIxoo1HVrgTRlMclEtTKCPBZy8M6uGmG3r2pAb/F92yzFp9S+dFmoSv7k3f3rUZl0nWckDzrOce3n", + 
"S7IKdY6pDLjuW8CWboATm0HdsC/ZPS1i0SqF1DH8nj05S9klS55Pjt2FvD/VlJ06ZdW34R6I4nGFID4g", + "IaxkhS6E6T8mbP6QnaLd1zLN1feFmv3744LuW4vlQ/PHpMYKK2DTVHCa1QqtQy9bTfQOD9rO4Nn4ORHu", + "VrusKrDrbFumKwqmJLg0GwKD9HLZ98Q0aSb6mvF+JMnXlHFdg2OxIP/JojQQdnNYLRNwT2yq7LuTb2GG", + "tcTb27PzWgTzABmcTUZOY22yUGM5Z8HGD2XqvZfXzAD7UT5mZ2kUOYvHjAiVF7MtvgGbX8AtaTVv727b", + "0ufgw7vXXcICDn5omQ+Vn4lyNSZvl8M3B2a28hNNmsiEACqHGPUM9Decv3EXRFnBon/dfmlLFv3r9ktT", + "tOhfdw5N2aKNO0OW/n2R5vvmuB8x8mmGm5aBBqTJVIJcxaFmrRoyqa79D8Wn2qrC63CqGVx/MqtNmNUi", + "uJbyq1mB5zvkWG0t3IcxyWTI5oM2fHL+iT8Yp3q/Wj6LkS4jCZVls4dNOclFXn+WMpRK8ggdKGmGccVn", + "o6G6Or+QS58Ph7onxx1bWtgUBM4CRO5Jee3Wce/MrZ33/jXXh/GITlKeymLsCVSSJtIGK0WkTIAfG9ud", + "P8+1jPd3jKX9+3w67p2v/on3d8TxVw/UEG9jgVrF87tWTXl+2x5qNptyYCZ27Z0rM2bTqGzUOBW6XNxN", + "0bhUM3LR2dG3Lp8sgj5oQSUXFxBIEAcD9h9a/vhdERx//MUFyaT9/vYe/E7Y7OMvLk6GnTpUIUwJSiTC", + "gqDDN8dg9ptA9DokQ8tD8qrrMCnOAPVc3fj/cQJSbvlsLiE5LPwpITWSkArgWi4hZWXs7lJEMpM8mIzk", + "8M0HcJta46eUdB9SkkzHYxpQwlSeAXjBScwmEH+EsWXM2ocKzh2lh7axlJTXllzOgOZp7+7dsSeb/P6F", + "I5dh73H6yHMTFRM6cSR/DOvlke8NH/r3S5zvXw55zChmGP4q6BYJ0ebYJiD2MwgvubhsinmePJy3joC3", + "z50Ud/gd8iZ6eaRQZvoBWRR4vI1vvUaaMudyDxdyIbnqQ7p0OkhY4dYERVI2yQuN2xJ6Y8Hjof3RZGXT", + "t8JWkwGWJ7CjPjR50bPfAwP6hitE4yQiMYGsbV2DTVDd3VUANQVgqSykIl6P/OlrU3SwNcltpNE5dJBN", + "2AzKOndgbdDbLx6Xl2pGfLI6qDab3EWQeqJqB+yDNElePhlW+BPKiCxSHEkSkUChqykNphBhq3+D8U0A", + "Lk6ST1lKjQ1Xrb6YWQQmb0siKI6gzDaPTMH4T7M4/nSwmAHu4vQUOpngWpPr7dMBclnfsgdC6lbFiFm9", + "iwhLhd7YOOC2xiTBo8ic6Cf9ChX2t2FjafOUJwPmi6tl5MoOSMfoUyHE9lNNjK0jqK/5RD4Uv9SpT1Rl", + "9gJFGjXgDG4SFrbqFDs08kfXbvW9lRkbRvqaZdxxoO/CYl7zSZYkq4TKOEmaoq9dJmDxLI6X4DBqF5KZ", + "SxXyVP1VqpAIAZ0tdtchN2rbUoBI4UuNqMyWInPp4AH9vOpLk7XGCypNVAv5pM2/ZnFsi9bGmHlqMn57", + "xHR1wEU1mz6ZQlj0T057nYDnMrEvRDxXXg5bt6Ke5bblOH54ec8C6qF1Cg+gH8tXQZljVeBseZ6P/lFF", + "TppKLVVezCTP992RrNRL/S0pK5XP8zT9/wNFVLPXan2eexZSMxD7JLNSeYsHl06zahs/JdRMQuUChamZ", + "rlLv5ocVOzOCglJWkjwte3pT2TNLMpeB2ZSAX2oQyGne5hf358kN2IXvhBJ2aqu+1KUzyjf9PZDcmppo", + 
"jWjuA/FJ9lktMAgPSIJddbb7psAZVLS4l1G574IMmwuXUeMizVECM0ld4cWfxLikBjSa0psSY8d8LugC", + "C+SZsm4S4Tq6bPnUWgJsq0D98PJaLqv84BJbwIUwrmPgjPaYQhcLNsOC6NlOcCpJJ7swHWe3vjg93ai7", + "NEItvTLi+zBo34xzqJTljEN/XWRBQ5ek/uj02Ka0pxKJlPXQ25hC5vhLQhJISUl5KhH4APaK9cbqqqBl", + "BcUIU2KecMrUylXkTe9mMV9vlKT7numUDd7+4dVKttDuYyNSQDv06203sFyoUqbMntdM58xWlJnM+pr5", + "wCOe6tEX6qGhMY2InEtFYmOzG6cRXCJI72Gzv9p+xnetg6iSUD28A74+CRExlZJyJgdsRMaaK0mI0HND", + "wUkakYL5wWfZOlc4o5pnhvR9H6YtKJEG1hys6qBWro6Gk8RVR/OZT7KCbjde0kuwVSE5j0c8ogGKKLuU", + "qB3RS8ODo5lEkf5jY6mxawj9bju37c1vlob0CRtzb/o/g7MZMv8IFO6kQtacMf/RkbVXpHhZHP2Bg/aT", + "NbmSrgmCIygCmrnZolTRiH42pE4PQqWigamZhDPYQbkXM19vwE6JEroNFgQFPIpIoJyuYTMRPNgcpP3+", + "TpBQiIfYIbA4IHj1n2OY8ejsA7QzJWk6A6b/AQO/PzxDVMN0jK3IXFioLWyPTjbfrjD/nwOY/gfLY2aD", + "y66F/8B/WnbX96GsvUOy5oryZJkAxJMfXmFgObif2oLHqS0AJ/ZsN+2JwAEwxXKaqpBfMb9mwFRIlZtf", + "zB8nq0IhFA6mF65U9PfB7dpqsaumcRt8FJfS7ikkJj3pg+jrbUHfR5rOSQPObQGYmGJQh/8VMIXCfzTs", + "vn1jXRGO36GlzkLUpf79bu7Wfb98dg0uwq8Ij8dyzQ2muZ1Aycqi9ikLZ1wpmwWpEIQpSAWTs5YBTnBA", + "1byDcOSqqdrySJkOKS8EPxIEX+qXtjdg77JASlueSUtXHSdaoZDKSzOClZ566O2MCJmOssUhIExGzgPg", + "24KqAY4CU4mUjMckUHRGTIlQWSN9ZUu5y7S8+SSeg3YfLegem8jhxwk4vRwtZAXjNs1RbwoSRJjGRdGj", + "ChwgkKD4B+XASA/KGaJsHFklaCC4lMgO1SURndBRZFV6sofeTwmSOCYDlkSYMQLVV8FYq5feTQSRMgXj", + "PwwAec8NRnVQHiiTCK6sMiHiXEgj/2sMvzhFUpFkCZq9MyObirV3FLZuBrczPdCzUlmDmcWP8gbU+kAM", + "phiAazxKIyXvzR3kJHMAMQv6WZm02cV/L+hkQoS+FdgQWaNAN9fagdNc+pJ7bG3OlvOsVbOcLdmoBRe4", + "gnvY0kCHoWs4BPZzHT29Z/JLWhsLYz+t57L6m+7UcO6ya6R/EfbTN+7yR0mFeV7wSGua6SXH8MeWdKWw", + "8tJVLXl1rs7+0NiN8y7dKldlf8gmv+/sD+dez75HloMOl3w169I+fH+I0L/fkIL7TvvwuHFLyw9yAXT1", + "lKhB+ofvAgPvJu/DA4fU3CDvw3fl5A1x+w8XbPNduXdbN+XMvftnZoe79Oo26R0gir3Oq9tQPWtuWioo", + "Xdg2zcQkO+KPxMFbC8Ua/LsD+0+pv4HIUACWX2VnAsqkRXgSJ2ruVNB8DM52eSJRST+Dy64vWjazNN1d", + "kOoNjDC3hx4OT2tNMD/zO96blSdPgn9y/PiTOhbvXOlh2dSvTheLYEpnpF7pXr7BFkSJIN2EJ2BcCQ3A", + "LDzcW6aw6E0+Izt8b8DeT4n7F6IuRQ4JUUgFCVQ0R5QpDhTBzPEXiQTXkgB852LuU6YXb+5LweNDu5sV", + 
"76G9U1YZlvv2xvNuiBXuzhy1WaJC+wY79Sm+pnEaA8FDlKFXz1GbXCthMragsZZ8EB1nICXXASGhBJzc", + "KC54q1+j2aSfyXAyarLKJbl33trcRihIpeKxO/uTY9TGqeLdCWH6LDSrPwZONhF8RkOTGDsH6oxHBqpb", + "NQBdV++qmQobBJILF2ZxD8LDNHmQJp9pUiYLxte5ddAaUYZhcSuz3JTvlHG71/NhCs6v+d1xmNP6+YRV", + "S+trTNRCjgOi4hxFmqPf+PnMPeZnrui+5N600mvXLHVxM4+mho5Gd5G2OPN2u1+19cX344RTKEX+CFXn", + "s0wgrVObf18o2L+/9+G+1eUXj9hp8xVxwndBVQ4D6BF9CPOaBzhCIZmRiCexZitN21anlYqoddCaKpUc", + "bG5Gut2US3Ww39/vt75+/Pr/AwAA//9vAMjF/SoBAA==", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 815f9c3b..756c13f8 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -135,6 +135,51 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima return instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, meter, tracer, memoryPolicy), nil } +// ProvideGuestMemoryController provides the active ballooning controller. 
+func ProvideGuestMemoryController(instanceManager instances.Manager, cfg *config.Config, log *slog.Logger) (guestmemory.Controller, error) { + pollInterval, err := time.ParseDuration(cfg.Hypervisor.Memory.ActiveBallooning.PollInterval) + if err != nil { + return nil, fmt.Errorf("parse active ballooning poll interval: %w", err) + } + perVMCooldown, err := time.ParseDuration(cfg.Hypervisor.Memory.ActiveBallooning.PerVmCooldown) + if err != nil { + return nil, fmt.Errorf("parse active ballooning per-vm cooldown: %w", err) + } + protectedFloorMinBytes, err := parseByteSize(cfg.Hypervisor.Memory.ActiveBallooning.ProtectedFloorMinBytes) + if err != nil { + return nil, fmt.Errorf("parse active ballooning protected floor: %w", err) + } + minAdjustmentBytes, err := parseByteSize(cfg.Hypervisor.Memory.ActiveBallooning.MinAdjustmentBytes) + if err != nil { + return nil, fmt.Errorf("parse active ballooning min adjustment: %w", err) + } + perVMMaxStepBytes, err := parseByteSize(cfg.Hypervisor.Memory.ActiveBallooning.PerVmMaxStepBytes) + if err != nil { + return nil, fmt.Errorf("parse active ballooning per-vm max step: %w", err) + } + + policy := guestmemory.Policy{ + Enabled: cfg.Hypervisor.Memory.Enabled, + KernelPageInitMode: guestmemory.KernelPageInitMode(cfg.Hypervisor.Memory.KernelPageInitMode), + ReclaimEnabled: cfg.Hypervisor.Memory.ReclaimEnabled, + VZBalloonRequired: cfg.Hypervisor.Memory.VZBalloonRequired, + } + + controllerCfg := guestmemory.ActiveBallooningConfig{ + Enabled: cfg.Hypervisor.Memory.ActiveBallooning.Enabled, + PollInterval: pollInterval, + PressureHighWatermarkAvailablePercent: cfg.Hypervisor.Memory.ActiveBallooning.PressureHighWatermarkAvailablePercent, + PressureLowWatermarkAvailablePercent: cfg.Hypervisor.Memory.ActiveBallooning.PressureLowWatermarkAvailablePercent, + ProtectedFloorPercent: cfg.Hypervisor.Memory.ActiveBallooning.ProtectedFloorPercent, + ProtectedFloorMinBytes: protectedFloorMinBytes, + MinAdjustmentBytes: minAdjustmentBytes, + 
PerVMMaxStepBytes: perVMMaxStepBytes, + PerVMCooldown: perVMCooldown, + } + + return guestmemory.NewController(policy, controllerCfg, &guestMemoryInstanceSource{manager: instanceManager}, log), nil +} + // ProvideVolumeManager provides the volume manager func ProvideVolumeManager(p *paths.Paths, cfg *config.Config) (volumes.Manager, error) { // Parse max total volume storage (empty or "0" means unlimited) @@ -192,6 +237,40 @@ func ProvideVMMetricsManager(instanceManager instances.Manager, cfg *config.Conf return mgr, nil } +type guestMemoryInstanceSource struct { + manager instances.Manager +} + +func (s *guestMemoryInstanceSource) ListBalloonVMs(ctx context.Context) ([]guestmemory.BalloonVM, error) { + insts, err := s.manager.ListInstances(ctx, nil) + if err != nil { + return nil, err + } + + vms := make([]guestmemory.BalloonVM, 0, len(insts)) + for _, inst := range insts { + if inst.State != instances.StateRunning && inst.State != instances.StateInitializing { + continue + } + vms = append(vms, guestmemory.BalloonVM{ + ID: inst.Id, + Name: inst.Name, + HypervisorType: inst.HypervisorType, + SocketPath: inst.SocketPath, + AssignedMemoryBytes: inst.Size + inst.HotplugSize, + }) + } + return vms, nil +} + +func parseByteSize(value string) (int64, error) { + var size datasize.ByteSize + if err := size.UnmarshalText([]byte(value)); err != nil { + return 0, err + } + return int64(size), nil +} + // ProvideIngressManager provides the ingress manager func ProvideIngressManager(p *paths.Paths, cfg *config.Config, instanceManager instances.Manager) (ingress.Manager, error) { // Parse DNS provider - fail if invalid diff --git a/openapi.yaml b/openapi.yaml index cc61fc8f..070da4bb 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1289,6 +1289,111 @@ components: items: $ref: "#/components/schemas/ResourceAllocation" + MemoryReclaimRequest: + type: object + required: [reclaim_bytes] + properties: + reclaim_bytes: + type: integer + format: int64 + minimum: 0 + description: Total 
bytes of guest memory to reclaim across eligible VMs. + example: 536870912 + hold_for: + type: string + description: How long to keep the reclaim hold active (Go duration string). Defaults to 5m when omitted. + example: 5m + dry_run: + type: boolean + description: Calculate a reclaim plan without applying balloon changes or creating a hold. + default: false + reason: + type: string + maxLength: 256 + description: Optional operator-provided reason attached to logs and traces. + example: prepare for another vm start + + MemoryReclaimAction: + type: object + required: + - instance_id + - instance_name + - hypervisor + - assigned_memory_bytes + - protected_floor_bytes + - previous_target_guest_memory_bytes + - planned_target_guest_memory_bytes + - target_guest_memory_bytes + - applied_reclaim_bytes + - status + properties: + instance_id: + type: string + instance_name: + type: string + hypervisor: + type: string + enum: [cloud-hypervisor, firecracker, qemu, vz] + assigned_memory_bytes: + type: integer + format: int64 + protected_floor_bytes: + type: integer + format: int64 + previous_target_guest_memory_bytes: + type: integer + format: int64 + planned_target_guest_memory_bytes: + type: integer + format: int64 + target_guest_memory_bytes: + type: integer + format: int64 + applied_reclaim_bytes: + type: integer + format: int64 + status: + type: string + description: Result of this VM's reclaim step. + example: applied + error: + type: string + description: Error message when status is error or unsupported. 
+ + MemoryReclaimResponse: + type: object + required: + - requested_reclaim_bytes + - planned_reclaim_bytes + - applied_reclaim_bytes + - host_available_bytes + - host_pressure_state + - actions + properties: + requested_reclaim_bytes: + type: integer + format: int64 + planned_reclaim_bytes: + type: integer + format: int64 + applied_reclaim_bytes: + type: integer + format: int64 + hold_until: + type: string + format: date-time + description: When the current manual reclaim hold expires. + host_available_bytes: + type: integer + format: int64 + host_pressure_state: + type: string + enum: [healthy, pressure] + actions: + type: array + items: + $ref: "#/components/schemas/MemoryReclaimAction" + paths: /health: get: @@ -1325,6 +1430,48 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + + /resources/memory/reclaim: + post: + summary: Trigger proactive guest memory reclaim + description: | + Requests runtime balloon inflation across reclaim-eligible guests. The same + planner used by host-pressure reclaim is applied, including protected floors + and per-VM step limits. 
+ operationId: reclaimMemory + security: + - bearerAuth: [] + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/MemoryReclaimRequest" + responses: + 200: + description: Reclaim plan and applied results + content: + application/json: + schema: + $ref: "#/components/schemas/MemoryReclaimResponse" + 400: + description: Invalid reclaim request + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + 401: + description: Unauthorized + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + 500: + description: Internal server error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /images: get: From 472e9664fca5e46a19e17ebff81ae0bd8361f3d4 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 19 Mar 2026 17:18:55 -0400 Subject: [PATCH 02/22] test: stabilize ci boot timing and config defaults --- cmd/api/api/api_test.go | 7 +++++++ cmd/api/api/cp_test.go | 4 ++-- cmd/api/api/exec_test.go | 4 ++-- cmd/api/config/config.go | 6 +++--- config.example.darwin.yaml | 6 +++--- config.example.yaml | 6 +++--- lib/guestmemory/README.md | 8 ++++---- lib/instances/manager_test.go | 8 ++++++++ 8 files changed, 32 insertions(+), 17 deletions(-) diff --git a/cmd/api/api/api_test.go b/cmd/api/api/api_test.go index 38b7eadd..c227a5b4 100644 --- a/cmd/api/api/api_test.go +++ b/cmd/api/api/api_test.go @@ -112,6 +112,13 @@ func ctx() context.Context { return context.Background() } +func integrationTestTimeout(timeout time.Duration) time.Duration { + if os.Getenv("CI") == "true" && timeout < 45*time.Second { + return 45 * time.Second + } + return timeout +} + // ctxWithInstance creates a context with a resolved instance (simulates ResolveResource middleware) func ctxWithInstance(svc *ApiService, idOrName string) context.Context { inst, err := svc.InstanceManager.GetInstance(ctx(), idOrName) diff --git a/cmd/api/api/cp_test.go b/cmd/api/api/cp_test.go index 
4a9d5646..3492a490 100644 --- a/cmd/api/api/cp_test.go +++ b/cmd/api/api/cp_test.go @@ -65,7 +65,7 @@ func TestCpToAndFromInstance(t *testing.T) { // Wait for guest-agent to be ready t.Log("Waiting for guest-agent to start...") agentReady := false - agentTimeout := time.After(15 * time.Second) + agentTimeout := time.After(integrationTestTimeout(15 * time.Second)) agentTicker := time.NewTicker(500 * time.Millisecond) defer agentTicker.Stop() @@ -203,7 +203,7 @@ func TestCpDirectoryToInstance(t *testing.T) { // Wait for guest-agent t.Log("Waiting for guest-agent...") agentReady := false - agentTimeout := time.After(15 * time.Second) + agentTimeout := time.After(integrationTestTimeout(15 * time.Second)) agentTicker := time.NewTicker(500 * time.Millisecond) defer agentTicker.Stop() diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index 2f30b1e5..c20672ef 100644 --- a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -66,7 +66,7 @@ func TestExecInstanceNonTTY(t *testing.T) { // Wait for nginx to be fully started (poll console logs) t.Log("Waiting for nginx to start...") nginxReady := false - nginxTimeout := time.After(15 * time.Second) + nginxTimeout := time.After(integrationTestTimeout(15 * time.Second)) nginxTicker := time.NewTicker(500 * time.Millisecond) defer nginxTicker.Stop() @@ -223,7 +223,7 @@ func TestExecWithDebianMinimal(t *testing.T) { // This is the key difference: we wait for guest-agent, not the app (which exits immediately) t.Log("Waiting for guest-agent to start...") execAgentReady := false - agentTimeout := time.After(15 * time.Second) + agentTimeout := time.After(integrationTestTimeout(15 * time.Second)) agentTicker := time.NewTicker(500 * time.Millisecond) defer agentTicker.Stop() diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 2b90a9b8..f9e25b1d 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -352,9 +352,9 @@ func defaultConfig() *Config { PressureHighWatermarkAvailablePercent: 
10, PressureLowWatermarkAvailablePercent: 15, ProtectedFloorPercent: 50, - ProtectedFloorMinBytes: "512MiB", - MinAdjustmentBytes: "64MiB", - PerVmMaxStepBytes: "256MiB", + ProtectedFloorMinBytes: "512MB", + MinAdjustmentBytes: "64MB", + PerVmMaxStepBytes: "256MB", PerVmCooldown: "5s", }, }, diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index 35d1e8e1..886469cc 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -45,9 +45,9 @@ hypervisor: pressure_high_watermark_available_percent: 10 pressure_low_watermark_available_percent: 15 protected_floor_percent: 50 - protected_floor_min_bytes: 512MiB - min_adjustment_bytes: 64MiB - per_vm_max_step_bytes: 256MiB + protected_floor_min_bytes: 512MB + min_adjustment_bytes: 64MB + per_vm_max_step_bytes: 256MB per_vm_cooldown: 5s # ============================================================================= diff --git a/config.example.yaml b/config.example.yaml index 7af5fd5a..32ca583d 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -39,9 +39,9 @@ data_dir: /var/lib/hypeman # pressure_high_watermark_available_percent: 10 # pressure_low_watermark_available_percent: 15 # protected_floor_percent: 50 -# protected_floor_min_bytes: 512MiB -# min_adjustment_bytes: 64MiB -# per_vm_max_step_bytes: 256MiB +# protected_floor_min_bytes: 512MB +# min_adjustment_bytes: 64MB +# per_vm_max_step_bytes: 256MB # per_vm_cooldown: 5s # ============================================================================= diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md index 1f548470..9466febf 100644 --- a/lib/guestmemory/README.md +++ b/lib/guestmemory/README.md @@ -35,9 +35,9 @@ hypervisor: pressure_high_watermark_available_percent: 10 pressure_low_watermark_available_percent: 15 protected_floor_percent: 50 - protected_floor_min_bytes: 512MiB - min_adjustment_bytes: 64MiB - per_vm_max_step_bytes: 256MiB + protected_floor_min_bytes: 512MB + min_adjustment_bytes: 64MB + 
per_vm_max_step_bytes: 256MB per_vm_cooldown: 5s ``` @@ -77,7 +77,7 @@ Active reclaim never shrinks a guest below its protected floor: Example: - a 4 GiB guest with `protected_floor_percent=50` has a 2 GiB floor -- if `protected_floor_min_bytes=512MiB`, the effective floor is still 2 GiB +- if `protected_floor_min_bytes=512MB`, the effective floor is still 2 GiB - Hypeman can reclaim at most 2 GiB from that guest Reclaim is also rate-limited: diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 3003781c..7fc7b5c3 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -112,6 +112,7 @@ func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duratio // waitForInstanceState polls GetInstance until the expected state is observed or timeout expires. func waitForInstanceState(ctx context.Context, mgr Manager, instanceID string, expected State, timeout time.Duration) (*Instance, error) { + timeout = integrationTestTimeout(timeout) deadline := time.Now().Add(timeout) lastState := StateUnknown lastErr := error(nil) @@ -136,6 +137,13 @@ func waitForInstanceState(ctx context.Context, mgr Manager, instanceID string, e return nil, fmt.Errorf("instance %s did not reach %s within %v (last state: %s)", instanceID, expected, timeout, lastState) } +func integrationTestTimeout(timeout time.Duration) time.Duration { + if os.Getenv("CI") == "true" && timeout < 45*time.Second { + return 45 * time.Second + } + return timeout +} + // waitForLogMessage polls instance logs until the message appears or times out func waitForLogMessage(ctx context.Context, mgr *manager, instanceID, message string, timeout time.Duration) error { deadline := time.Now().Add(timeout) From 0e242a377f78c5d0f17806d01424c0a5fce9d5a9 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 19 Mar 2026 17:23:58 -0400 Subject: [PATCH 03/22] test: relax exec-agent wait under ci --- lib/instances/exec_test.go | 1 + 1 file changed, 1 insertion(+) diff 
--git a/lib/instances/exec_test.go b/lib/instances/exec_test.go index a0ff212c..982c9371 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -20,6 +20,7 @@ import ( // waitForExecAgent polls until exec-agent is ready func waitForExecAgent(ctx context.Context, mgr *manager, instanceID string, timeout time.Duration) error { + timeout = integrationTestTimeout(timeout) deadline := time.Now().Add(timeout) lastState := StateUnknown var lastErr error From 4c067479905cdbbea6e3df198d527d00acf7a60a Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 09:21:41 -0400 Subject: [PATCH 04/22] auth: scope manual memory reclaim endpoint --- lib/scopes/scopes.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/scopes/scopes.go b/lib/scopes/scopes.go index 1ebbc49a..7737d03a 100644 --- a/lib/scopes/scopes.go +++ b/lib/scopes/scopes.go @@ -209,8 +209,9 @@ var RouteScopes = map[string]Scope{ "GET /devices/{id}": DeviceRead, // Health & Resources - "GET /health": ResourceRead, - "GET /resources": ResourceRead, + "GET /health": ResourceRead, + "GET /resources": ResourceRead, + "POST /resources/memory/reclaim": ResourceRead, // Images "GET /images": ImageRead, From 699f18c4e883f41cf541063b6cf34ce57447a50c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 09:26:24 -0400 Subject: [PATCH 05/22] test: make initrd latest updates race-safe --- lib/system/initrd.go | 12 +++++------- lib/system/symlink.go | 27 +++++++++++++++++++++++++++ lib/system/symlink_test.go | 28 ++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 7 deletions(-) create mode 100644 lib/system/symlink.go create mode 100644 lib/system/symlink_test.go diff --git a/lib/system/initrd.go b/lib/system/initrd.go index 3ef4c103..27c45412 100644 --- a/lib/system/initrd.go +++ b/lib/system/initrd.go @@ -75,8 +75,9 @@ func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) return "", fmt.Errorf("download kernel headers: 
%w", err) } - // Generate timestamp for this build - timestamp := strconv.FormatInt(time.Now().Unix(), 10) + // Use a unique build identifier so concurrent rebuilds against a shared + // prewarmed cache do not collide on the output directory. + timestamp := strconv.FormatInt(time.Now().UnixNano(), 10) // Package as cpio.gz outputPath := m.paths.SystemInitrdTimestamp(timestamp, arch) @@ -95,12 +96,9 @@ func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) return "", fmt.Errorf("write hash file: %w", err) } - // Update 'latest' symlink + // Update 'latest' symlink atomically so parallel rebuilds can safely race. latestLink := m.paths.SystemInitrdLatest(arch) - // Remove old symlink if it exists - os.Remove(latestLink) - // Create new symlink (relative path) - if err := os.Symlink(timestamp, latestLink); err != nil { + if err := replaceSymlinkAtomic(latestLink, timestamp); err != nil { return "", fmt.Errorf("create latest symlink: %w", err) } diff --git a/lib/system/symlink.go b/lib/system/symlink.go new file mode 100644 index 00000000..83488a9d --- /dev/null +++ b/lib/system/symlink.go @@ -0,0 +1,27 @@ +package system + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "time" +) + +func replaceSymlinkAtomic(linkPath, target string) error { + tmpLink := filepath.Join( + filepath.Dir(linkPath), + ".tmp-"+filepath.Base(linkPath)+"-"+strconv.FormatInt(time.Now().UnixNano(), 10), + ) + + if err := os.Symlink(target, tmpLink); err != nil { + return err + } + + if err := os.Rename(tmpLink, linkPath); err != nil { + _ = os.Remove(tmpLink) + return fmt.Errorf("rename temp symlink: %w", err) + } + + return nil +} diff --git a/lib/system/symlink_test.go b/lib/system/symlink_test.go new file mode 100644 index 00000000..befddfae --- /dev/null +++ b/lib/system/symlink_test.go @@ -0,0 +1,28 @@ +package system + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestReplaceSymlinkAtomic(t *testing.T) { 
+ t.Parallel() + + tmpDir := t.TempDir() + linkPath := filepath.Join(tmpDir, "latest") + + require.NoError(t, replaceSymlinkAtomic(linkPath, "first")) + + target, err := os.Readlink(linkPath) + require.NoError(t, err) + require.Equal(t, "first", target) + + require.NoError(t, replaceSymlinkAtomic(linkPath, "second")) + + target, err = os.Readlink(linkPath) + require.NoError(t, err) + require.Equal(t, "second", target) +} From 7289e56ba8a14b3d85ecca2c66c94ca0c3565e4a Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 10:05:34 -0400 Subject: [PATCH 06/22] obs: instrument active ballooning --- cmd/api/api/resources_reclaim.go | 53 ++++++ lib/guestmemory/README.md | 31 ++++ lib/guestmemory/active_ballooning.go | 11 ++ lib/guestmemory/controller.go | 122 +++++++++++- lib/guestmemory/metrics.go | 247 +++++++++++++++++++++++++ lib/guestmemory/observability.go | 168 +++++++++++++++++ lib/guestmemory/pressure_darwin.go | 70 +------ lib/guestmemory/pressure_linux.go | 58 +----- lib/guestmemory/pressure_parse.go | 147 +++++++++++++++ lib/guestmemory/pressure_parse_test.go | 97 ++++++++++ lib/otel/README.md | 16 ++ lib/providers/providers.go | 2 +- 12 files changed, 907 insertions(+), 115 deletions(-) create mode 100644 lib/guestmemory/metrics.go create mode 100644 lib/guestmemory/observability.go create mode 100644 lib/guestmemory/pressure_parse.go create mode 100644 lib/guestmemory/pressure_parse_test.go diff --git a/cmd/api/api/resources_reclaim.go b/cmd/api/api/resources_reclaim.go index 12675e93..3a66343a 100644 --- a/cmd/api/api/resources_reclaim.go +++ b/cmd/api/api/resources_reclaim.go @@ -6,7 +6,12 @@ import ( "time" "github.com/kernel/hypeman/lib/guestmemory" + "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/oapi" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" ) const ( @@ -16,6 +21,7 @@ const ( // ReclaimMemory triggers proactive guest 
memory reclaim via runtime ballooning. func (s *ApiService) ReclaimMemory(ctx context.Context, request oapi.ReclaimMemoryRequestObject) (oapi.ReclaimMemoryResponseObject, error) { + log := logger.FromContext(ctx) if request.Body == nil { return oapi.ReclaimMemory400JSONResponse{ Code: "bad_request", @@ -37,6 +43,20 @@ func (s *ApiService) ReclaimMemory(ctx context.Context, request oapi.ReclaimMemo }, nil } + tracer := otel.Tracer("hypeman/guestmemory") + ctx, span := tracer.Start(ctx, "guestmemory.manual_reclaim", + traceAttrsForManualReclaim(request.Body.ReclaimBytes, holdFor, request.Body.DryRun != nil && *request.Body.DryRun, request.Body.Reason != nil)) + defer span.End() + + log.InfoContext(ctx, + "manual guest memory reclaim requested", + "operation", "manual_reclaim", + "requested_reclaim_bytes", request.Body.ReclaimBytes, + "hold_for_seconds", holdFor.Seconds(), + "dry_run", request.Body.DryRun != nil && *request.Body.DryRun, + "reason_present", request.Body.Reason != nil, + ) + resp, err := s.GuestMemoryController.TriggerReclaim(ctx, guestmemory.ManualReclaimRequest{ ReclaimBytes: request.Body.ReclaimBytes, HoldFor: holdFor, @@ -46,11 +66,17 @@ func (s *ApiService) ReclaimMemory(ctx context.Context, request oapi.ReclaimMemo if err != nil { switch { case errors.Is(err, guestmemory.ErrGuestMemoryDisabled), errors.Is(err, guestmemory.ErrActiveBallooningDisabled): + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + log.WarnContext(ctx, "manual guest memory reclaim rejected", "operation", "manual_reclaim", "error", err) return oapi.ReclaimMemory400JSONResponse{ Code: "feature_disabled", Message: err.Error(), }, nil default: + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + log.ErrorContext(ctx, "manual guest memory reclaim failed", "operation", "manual_reclaim", "error", err) return oapi.ReclaimMemory500JSONResponse{ Code: "internal_error", Message: err.Error(), @@ -58,9 +84,36 @@ func (s *ApiService) ReclaimMemory(ctx 
context.Context, request oapi.ReclaimMemo } } + span.SetAttributes( + attribute.Int64("planned_reclaim_bytes", resp.PlannedReclaimBytes), + attribute.Int64("applied_reclaim_bytes", resp.AppliedReclaimBytes), + attribute.Int64("host_available_bytes", resp.HostAvailableBytes), + attribute.String("host_pressure_state", string(resp.HostPressureState)), + attribute.Int("action_count", len(resp.Actions)), + ) + span.SetStatus(codes.Ok, "") + log.InfoContext(ctx, + "manual guest memory reclaim completed", + "operation", "manual_reclaim", + "planned_reclaim_bytes", resp.PlannedReclaimBytes, + "applied_reclaim_bytes", resp.AppliedReclaimBytes, + "host_available_bytes", resp.HostAvailableBytes, + "host_pressure_state", resp.HostPressureState, + "action_count", len(resp.Actions), + ) + return oapi.ReclaimMemory200JSONResponse(memoryReclaimResponseToOAPI(resp)), nil } +func traceAttrsForManualReclaim(reclaimBytes int64, holdFor time.Duration, dryRun bool, reasonPresent bool) trace.SpanStartOption { + return trace.WithAttributes( + attribute.Int64("requested_reclaim_bytes", reclaimBytes), + attribute.Float64("hold_for_seconds", holdFor.Seconds()), + attribute.Bool("dry_run", dryRun), + attribute.Bool("reason_present", reasonPresent), + ) +} + func parseMemoryReclaimHold(req *oapi.MemoryReclaimRequest) (time.Duration, error) { if req == nil { return 0, nil diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md index 9466febf..028b2a6c 100644 --- a/lib/guestmemory/README.md +++ b/lib/guestmemory/README.md @@ -103,6 +103,37 @@ Manual reclaim uses the same planner and protected floors as automatic reclaim. By design, Hypeman does not reclaim memory without a reason. Automatic reclaim only happens under real host pressure. Proactive reclaim without host pressure is only done when an operator explicitly asks for it through the API. 
+## Observability + +Active ballooning emits structured logs, metrics, and traces so operators can tell whether reclaim is healthy and effective. + +Logs: + +- manual reclaim requests log start, success, and failure +- pressure state transitions log the old and new state plus current host availability +- per-VM apply failures log the affected `instance_id`, hypervisor, and requested target +- automatic reconcile summaries log when pressure changes, reclaim is applied, or errors occur + +Metrics: + +- `hypeman_guestmemory_reconcile_total` and `hypeman_guestmemory_reconcile_duration_seconds` +- `hypeman_guestmemory_reclaim_actions_total` +- `hypeman_guestmemory_pressure_transitions_total` +- `hypeman_guestmemory_sampler_errors_total` +- `hypeman_guestmemory_reclaim_bytes` +- `hypeman_guestmemory_host_available_bytes` +- `hypeman_guestmemory_target_reclaim_bytes` +- `hypeman_guestmemory_applied_reclaim_bytes` +- `hypeman_guestmemory_manual_hold_active` +- `hypeman_guestmemory_eligible_vms_total` +- `hypeman_guestmemory_pressure_state` + +Traces: + +- manual API calls create a `guestmemory.manual_reclaim` span +- each reconcile creates a `guestmemory.reconcile` span +- child spans capture host pressure sampling, VM enumeration, and balloon target application + ## Passive Reclaim vs Active Ballooning Passive reclaim and active reclaim are complementary: diff --git a/lib/guestmemory/active_ballooning.go b/lib/guestmemory/active_ballooning.go index d63c8ec6..3d0dfcff 100644 --- a/lib/guestmemory/active_ballooning.go +++ b/lib/guestmemory/active_ballooning.go @@ -7,6 +7,8 @@ import ( "time" "github.com/kernel/hypeman/lib/hypervisor" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/trace" ) var ( @@ -165,6 +167,8 @@ type controller struct { source Source sampler PressureSampler log *slog.Logger + metrics *Metrics + tracer trace.Tracer reconcileMu syncState } @@ -198,12 +202,19 @@ func NewControllerWithSampler(policy Policy, cfg ActiveBallooningConfig, source sampler 
= newHostPressureSampler() } + metrics, err := NewMetrics(otel.GetMeterProvider().Meter("hypeman")) + if err != nil { + log.Warn("failed to initialize guest memory metrics", "error", err) + } + c := &controller{ policy: policy.Normalize(), config: cfg.Normalize(), source: source, sampler: sampler, log: log, + metrics: metrics, + tracer: otel.Tracer("hypeman/guestmemory"), reconcileMu: syncState{ mu: make(chan struct{}, 1), pressureState: HostPressureStateHealthy, diff --git a/lib/guestmemory/controller.go b/lib/guestmemory/controller.go index 6dcdd31f..89d2ea0f 100644 --- a/lib/guestmemory/controller.go +++ b/lib/guestmemory/controller.go @@ -4,6 +4,9 @@ import ( "context" "fmt" "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" ) func (c *controller) Start(ctx context.Context) error { @@ -25,7 +28,7 @@ func (c *controller) Start(ctx context.Context) error { return nil case <-ticker.C: if _, err := c.reconcile(ctx, reconcileRequest{}); err != nil { - c.log.WarnContext(ctx, "active ballooning reconcile failed", "error", err) + logFromContext(ctx, c.log).WarnContext(ctx, "active ballooning reconcile failed", "operation", "active_ballooning_reconcile", "trigger", "auto", "error", err) } } } @@ -59,36 +62,83 @@ type reconcileRequest struct { } func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (ManualReclaimResponse, error) { + trigger := reconcileTrigger(req) + start := time.Now() + ctx, span := c.startReconcileSpan(ctx, req) + defer span.End() + state := &c.reconcileMu <-state.mu defer func() { state.mu <- struct{}{} }() now := time.Now() - sample, err := c.sampler.Sample(ctx) + sampleCtx, sampleSpan := c.startChildSpan(ctx, "guestmemory.sample_host_pressure") + sample, err := c.sampler.Sample(sampleCtx) if err != nil { + c.metrics.RecordSamplerError(ctx, "host_pressure") + sampleSpan.RecordError(err) + sampleSpan.SetStatus(codes.Error, err.Error()) + sampleSpan.End() + c.recordReconcileError(ctx, trigger, start, 
span, err) return ManualReclaimResponse{}, err } + sampleSpan.SetAttributes( + attribute.Int64("host_available_bytes", sample.AvailableBytes), + attribute.Float64("host_available_percent", sample.AvailablePercent), + attribute.Bool("stressed", sample.Stressed), + ) + sampleSpan.SetStatus(codes.Ok, "") + sampleSpan.End() + + summary := reconcileSummary{ + hostAvailable: sample.AvailableBytes, + hostAvailablePerc: sample.AvailablePercent, + } if state.manualHold != nil && !state.manualHold.until.IsZero() && now.After(state.manualHold.until) { + logFromContext(ctx, c.log).InfoContext(ctx, + "guest memory manual reclaim hold expired", + "operation", "manual_reclaim", + ) state.manualHold = nil } if req.force && !req.dryRun { switch { case req.requestedReclaim <= 0 || req.holdFor <= 0: + if state.manualHold != nil { + logFromContext(ctx, c.log).InfoContext(ctx, + "guest memory manual reclaim hold cleared", + "operation", "manual_reclaim", + ) + } state.manualHold = nil default: state.manualHold = &manualHold{ reclaimBytes: req.requestedReclaim, until: now.Add(req.holdFor), } + logFromContext(ctx, c.log).InfoContext(ctx, + "guest memory manual reclaim hold set", + "operation", "manual_reclaim", + "requested_reclaim_bytes", req.requestedReclaim, + "hold_for_seconds", req.holdFor.Seconds(), + ) } } - vms, err := c.source.ListBalloonVMs(ctx) + listCtx, listSpan := c.startChildSpan(ctx, "guestmemory.list_balloon_vms") + vms, err := c.source.ListBalloonVMs(listCtx) if err != nil { + listSpan.RecordError(err) + listSpan.SetStatus(codes.Error, err.Error()) + listSpan.End() + c.recordReconcileError(ctx, trigger, start, span, err) return ManualReclaimResponse{}, err } + listSpan.SetAttributes(attribute.Int("vm_count", len(vms))) + listSpan.SetStatus(codes.Ok, "") + listSpan.End() candidates := make([]candidateState, 0, len(vms)) actions := make([]ManualReclaimAction, 0, len(vms)) @@ -130,8 +180,16 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua 
maxReclaimBytes: maxInt64(0, vm.AssignedMemoryBytes-protectedFloor), }) } + summary.eligibleVMs = len(candidates) + previousPressure := state.pressureState state.pressureState = nextPressureState(state.pressureState, c.config, sample) + summary.previousPressure = previousPressure + summary.currentPressure = state.pressureState + summary.pressureChanged = previousPressure != state.pressureState + if summary.pressureChanged { + c.metrics.RecordPressureTransition(ctx, previousPressure, state.pressureState) + } autoTarget := automaticTargetBytes(state.pressureState, c.config, sample, currentTotalReclaim) manualTarget := int64(0) @@ -141,6 +199,10 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua manualTarget = state.manualHold.reclaimBytes } totalTarget := maxInt64(autoTarget, manualTarget) + summary.autoTarget = autoTarget + summary.manualTarget = manualTarget + summary.effectiveTarget = totalTarget + summary.manualHoldActive = state.manualHold != nil plannedTargets := planGuestTargets(c.config, candidates, totalTarget) @@ -152,7 +214,18 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua Actions: make([]ManualReclaimAction, 0, len(actions)+len(candidates)), } resp.Actions = append(resp.Actions, actions...) 
+ for _, action := range actions { + switch action.Status { + case "error": + summary.errorCount++ + case "unsupported": + summary.unsupportedCount++ + default: + summary.unchangedCount++ + } + } + applyCtx, applySpan := c.startChildSpan(ctx, "guestmemory.apply_balloon_targets") for _, candidate := range candidates { plannedTarget := plannedTargets[candidate.vm.ID] if plannedTarget == 0 { @@ -195,9 +268,19 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua resp.PlannedReclaimBytes += candidate.vm.AssignedMemoryBytes - plannedTarget if !req.dryRun && appliedTarget != candidate.currentTargetGuestBytes { - if err := candidate.hv.SetTargetGuestMemoryBytes(ctx, appliedTarget); err != nil { + if err := candidate.hv.SetTargetGuestMemoryBytes(applyCtx, appliedTarget); err != nil { action.Status = "error" action.Error = err.Error() + logFromContext(ctx, c.log).WarnContext(ctx, + "guest memory reclaim action failed", + "operation", "active_ballooning_apply", + "trigger", trigger, + "instance_id", candidate.vm.ID, + "hypervisor", candidate.vm.HypervisorType, + "previous_target_guest_memory_bytes", candidate.currentTargetGuestBytes, + "target_guest_memory_bytes", appliedTarget, + "error", err, + ) resp.Actions = append(resp.Actions, action) continue } @@ -212,7 +295,38 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua action.AppliedReclaimBytes = candidate.vm.AssignedMemoryBytes - action.TargetGuestMemoryBytes resp.AppliedReclaimBytes += action.AppliedReclaimBytes resp.Actions = append(resp.Actions, action) + + switch action.Status { + case "applied": + summary.appliedCount++ + case "planned": + summary.plannedCount++ + case "error": + summary.errorCount++ + case "unsupported": + summary.unsupportedCount++ + default: + summary.unchangedCount++ + } + } + applySpan.SetAttributes( + attribute.Int("eligible_vms", summary.eligibleVMs), + attribute.Int("applied_vms", summary.appliedCount), + 
attribute.Int("planned_vms", summary.plannedCount), + attribute.Int("error_vms", summary.errorCount), + ) + if summary.errorCount > 0 { + applySpan.SetStatus(codes.Error, "one or more balloon target updates failed") + } else { + applySpan.SetStatus(codes.Ok, "") } + applySpan.End() + + summary.plannedReclaim = resp.PlannedReclaimBytes + summary.appliedReclaim = resp.AppliedReclaimBytes + c.recordReconcileSuccess(ctx, trigger, req, span, start, summary, resp.Actions) + c.logPressureTransition(ctx, summary) + c.logReconcileSummary(ctx, req, summary, reconcileStatus(summary)) return resp, nil } diff --git a/lib/guestmemory/metrics.go b/lib/guestmemory/metrics.go new file mode 100644 index 00000000..9bba181c --- /dev/null +++ b/lib/guestmemory/metrics.go @@ -0,0 +1,247 @@ +package guestmemory + +import ( + "context" + "time" + + "github.com/kernel/hypeman/lib/hypervisor" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +type Metrics struct { + reconcileTotal metric.Int64Counter + reconcileDuration metric.Float64Histogram + reclaimActionsTotal metric.Int64Counter + pressureTransitionsTotal metric.Int64Counter + samplerErrorsTotal metric.Int64Counter + reclaimBytes metric.Int64Histogram + + hostAvailableBytes metric.Int64Gauge + targetReclaimBytes metric.Int64Gauge + appliedReclaimBytes metric.Int64Gauge + manualHoldActive metric.Int64Gauge + eligibleVMsTotal metric.Int64Gauge + pressureState metric.Int64Gauge +} + +type GaugeObservation struct { + HostAvailableBytes int64 + AutoTargetBytes int64 + ManualTargetBytes int64 + EffectiveTarget int64 + AppliedReclaim int64 + EligibleVMs int + PressureState HostPressureState + ManualHoldActive bool +} + +func NewMetrics(meter metric.Meter) (*Metrics, error) { + if meter == nil { + return nil, nil + } + + reconcileTotal, err := meter.Int64Counter( + "hypeman_guestmemory_reconcile_total", + metric.WithDescription("Total number of guest memory reconcile cycles"), + ) + if err != nil { + return nil, 
err + } + + reconcileDuration, err := meter.Float64Histogram( + "hypeman_guestmemory_reconcile_duration_seconds", + metric.WithDescription("Guest memory reconcile duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + reclaimActionsTotal, err := meter.Int64Counter( + "hypeman_guestmemory_reclaim_actions_total", + metric.WithDescription("Total number of guest memory reclaim actions"), + ) + if err != nil { + return nil, err + } + + pressureTransitionsTotal, err := meter.Int64Counter( + "hypeman_guestmemory_pressure_transitions_total", + metric.WithDescription("Total number of guest memory pressure state transitions"), + ) + if err != nil { + return nil, err + } + + samplerErrorsTotal, err := meter.Int64Counter( + "hypeman_guestmemory_sampler_errors_total", + metric.WithDescription("Total number of guest memory host pressure sampler errors"), + ) + if err != nil { + return nil, err + } + + reclaimBytes, err := meter.Int64Histogram( + "hypeman_guestmemory_reclaim_bytes", + metric.WithDescription("Guest memory reclaim bytes observed per reconcile"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + hostAvailableBytes, err := meter.Int64Gauge( + "hypeman_guestmemory_host_available_bytes", + metric.WithDescription("Last observed host available memory"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + targetReclaimBytes, err := meter.Int64Gauge( + "hypeman_guestmemory_target_reclaim_bytes", + metric.WithDescription("Current guest memory reclaim target"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + appliedReclaimBytes, err := meter.Int64Gauge( + "hypeman_guestmemory_applied_reclaim_bytes", + metric.WithDescription("Current applied guest memory reclaim"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + manualHoldActive, err := meter.Int64Gauge( + "hypeman_guestmemory_manual_hold_active", + metric.WithDescription("Whether a manual guest memory 
reclaim hold is active"), + ) + if err != nil { + return nil, err + } + + eligibleVMsTotal, err := meter.Int64Gauge( + "hypeman_guestmemory_eligible_vms_total", + metric.WithDescription("Number of guest VMs eligible for active ballooning"), + ) + if err != nil { + return nil, err + } + + pressureState, err := meter.Int64Gauge( + "hypeman_guestmemory_pressure_state", + metric.WithDescription("Current guest memory host pressure state (0 healthy, 1 pressure)"), + ) + if err != nil { + return nil, err + } + + return &Metrics{ + reconcileTotal: reconcileTotal, + reconcileDuration: reconcileDuration, + reclaimActionsTotal: reclaimActionsTotal, + pressureTransitionsTotal: pressureTransitionsTotal, + samplerErrorsTotal: samplerErrorsTotal, + reclaimBytes: reclaimBytes, + hostAvailableBytes: hostAvailableBytes, + targetReclaimBytes: targetReclaimBytes, + appliedReclaimBytes: appliedReclaimBytes, + manualHoldActive: manualHoldActive, + eligibleVMsTotal: eligibleVMsTotal, + pressureState: pressureState, + }, nil +} + +func (m *Metrics) RecordReconcile(ctx context.Context, trigger, status string, duration time.Duration) { + if m == nil { + return + } + + opts := metric.WithAttributes( + attribute.String("trigger", trigger), + attribute.String("status", status), + ) + m.reconcileTotal.Add(ctx, 1, opts) + m.reconcileDuration.Record(ctx, duration.Seconds(), opts) +} + +func (m *Metrics) RecordReclaimAction(ctx context.Context, trigger, status string, hvType hypervisor.Type) { + if m == nil { + return + } + + m.reclaimActionsTotal.Add(ctx, 1, + metric.WithAttributes( + attribute.String("trigger", trigger), + attribute.String("status", status), + attribute.String("hypervisor", string(hvType)), + )) +} + +func (m *Metrics) RecordPressureTransition(ctx context.Context, from, to HostPressureState) { + if m == nil { + return + } + + m.pressureTransitionsTotal.Add(ctx, 1, + metric.WithAttributes( + attribute.String("from", string(from)), + attribute.String("to", string(to)), + )) +} + 
+func (m *Metrics) RecordSamplerError(ctx context.Context, sampler string) { + if m == nil { + return + } + + m.samplerErrorsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("sampler", sampler))) +} + +func (m *Metrics) RecordReclaimBytes(ctx context.Context, trigger, kind string, bytes int64) { + if m == nil || bytes < 0 { + return + } + + m.reclaimBytes.Record(ctx, bytes, + metric.WithAttributes( + attribute.String("trigger", trigger), + attribute.String("kind", kind), + )) +} + +func (m *Metrics) RecordGaugeState(ctx context.Context, obs GaugeObservation) { + if m == nil { + return + } + + m.hostAvailableBytes.Record(ctx, obs.HostAvailableBytes) + m.targetReclaimBytes.Record(ctx, obs.AutoTargetBytes, metric.WithAttributes(attribute.String("source", "auto"))) + m.targetReclaimBytes.Record(ctx, obs.ManualTargetBytes, metric.WithAttributes(attribute.String("source", "manual"))) + m.targetReclaimBytes.Record(ctx, obs.EffectiveTarget, metric.WithAttributes(attribute.String("source", "effective"))) + m.appliedReclaimBytes.Record(ctx, obs.AppliedReclaim) + m.manualHoldActive.Record(ctx, boolToInt64(obs.ManualHoldActive)) + m.eligibleVMsTotal.Record(ctx, int64(obs.EligibleVMs)) + m.pressureState.Record(ctx, pressureStateMetricValue(obs.PressureState)) +} + +func pressureStateMetricValue(state HostPressureState) int64 { + if state == HostPressureStatePressure { + return 1 + } + return 0 +} + +func boolToInt64(v bool) int64 { + if v { + return 1 + } + return 0 +} diff --git a/lib/guestmemory/observability.go b/lib/guestmemory/observability.go new file mode 100644 index 00000000..965b5304 --- /dev/null +++ b/lib/guestmemory/observability.go @@ -0,0 +1,168 @@ +package guestmemory + +import ( + "context" + "log/slog" + "time" + + "github.com/kernel/hypeman/lib/logger" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" +) + +type reconcileSummary struct { + eligibleVMs int + appliedCount int + plannedCount 
int + unchangedCount int + errorCount int + unsupportedCount int + plannedReclaim int64 + appliedReclaim int64 + effectiveTarget int64 + autoTarget int64 + manualTarget int64 + manualHoldActive bool + pressureChanged bool + previousPressure HostPressureState + currentPressure HostPressureState + hostAvailable int64 + hostAvailablePerc float64 +} + +func reconcileTrigger(req reconcileRequest) string { + if req.force { + return "manual" + } + return "auto" +} + +func logFromContext(ctx context.Context, fallback *slog.Logger) *slog.Logger { + if log := logger.FromContext(ctx); log != nil && log != slog.Default() { + return log + } + if fallback != nil { + return fallback + } + return slog.Default() +} + +func (c *controller) startReconcileSpan(ctx context.Context, req reconcileRequest) (context.Context, trace.Span) { + return c.tracer.Start(ctx, "guestmemory.reconcile", + trace.WithAttributes( + attribute.String("trigger", reconcileTrigger(req)), + attribute.Bool("force", req.force), + attribute.Bool("dry_run", req.dryRun), + attribute.Int64("requested_reclaim_bytes", req.requestedReclaim), + )) +} + +func (c *controller) startChildSpan(ctx context.Context, name string) (context.Context, trace.Span) { + return c.tracer.Start(ctx, name) +} + +func reconcileStatus(summary reconcileSummary) string { + if summary.errorCount > 0 { + if summary.appliedCount > 0 || summary.unchangedCount > 0 || summary.plannedCount > 0 { + return "partial" + } + return "error" + } + return "success" +} + +func (c *controller) recordReconcileError(ctx context.Context, trigger string, start time.Time, span trace.Span, err error) { + if err == nil { + return + } + + c.metrics.RecordReconcile(ctx, trigger, "error", time.Since(start)) + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) +} + +func (c *controller) recordReconcileSuccess(ctx context.Context, trigger string, req reconcileRequest, span trace.Span, start time.Time, summary reconcileSummary, actions []ManualReclaimAction) 
{ + status := reconcileStatus(summary) + c.metrics.RecordReconcile(ctx, trigger, status, time.Since(start)) + for _, action := range actions { + c.metrics.RecordReclaimAction(ctx, trigger, action.Status, action.Hypervisor) + } + + if !req.dryRun { + c.metrics.RecordReclaimBytes(ctx, trigger, "auto_target", summary.autoTarget) + c.metrics.RecordReclaimBytes(ctx, trigger, "manual_target", summary.manualTarget) + c.metrics.RecordReclaimBytes(ctx, trigger, "effective_target", summary.effectiveTarget) + c.metrics.RecordReclaimBytes(ctx, trigger, "planned", summary.plannedReclaim) + c.metrics.RecordReclaimBytes(ctx, trigger, "applied", summary.appliedReclaim) + c.metrics.RecordGaugeState(ctx, GaugeObservation{ + HostAvailableBytes: summary.hostAvailable, + AutoTargetBytes: summary.autoTarget, + ManualTargetBytes: summary.manualTarget, + EffectiveTarget: summary.effectiveTarget, + AppliedReclaim: summary.appliedReclaim, + EligibleVMs: summary.eligibleVMs, + PressureState: summary.currentPressure, + ManualHoldActive: summary.manualHoldActive, + }) + } + + span.SetAttributes( + attribute.String("status", status), + attribute.Int("eligible_vms", summary.eligibleVMs), + attribute.Int("applied_vms", summary.appliedCount), + attribute.Int("planned_vms", summary.plannedCount), + attribute.Int("error_vms", summary.errorCount), + attribute.Int("unsupported_vms", summary.unsupportedCount), + attribute.Int64("auto_target_reclaim_bytes", summary.autoTarget), + attribute.Int64("manual_target_reclaim_bytes", summary.manualTarget), + attribute.Int64("effective_target_reclaim_bytes", summary.effectiveTarget), + attribute.Int64("planned_reclaim_bytes", summary.plannedReclaim), + attribute.Int64("applied_reclaim_bytes", summary.appliedReclaim), + attribute.Int64("host_available_bytes", summary.hostAvailable), + attribute.Float64("host_available_percent", summary.hostAvailablePerc), + attribute.String("pressure_state", string(summary.currentPressure)), + attribute.Bool("manual_hold_active", 
summary.manualHoldActive), + ) + span.SetStatus(codes.Ok, "") +} + +func (c *controller) logPressureTransition(ctx context.Context, summary reconcileSummary) { + if !summary.pressureChanged { + return + } + + logFromContext(ctx, c.log).InfoContext(ctx, + "guest memory pressure state changed", + "operation", "active_ballooning_reconcile", + "from", summary.previousPressure, + "to", summary.currentPressure, + "host_available_bytes", summary.hostAvailable, + "host_available_percent", summary.hostAvailablePerc, + ) +} + +func (c *controller) logReconcileSummary(ctx context.Context, req reconcileRequest, summary reconcileSummary, status string) { + if !req.force && !summary.pressureChanged && summary.appliedCount == 0 && summary.errorCount == 0 { + return + } + + logFromContext(ctx, c.log).InfoContext(ctx, + "guest memory reconcile completed", + "operation", "active_ballooning_reconcile", + "trigger", reconcileTrigger(req), + "dry_run", req.dryRun, + "status", status, + "eligible_vms", summary.eligibleVMs, + "applied_vms", summary.appliedCount, + "planned_vms", summary.plannedCount, + "error_vms", summary.errorCount, + "unsupported_vms", summary.unsupportedCount, + "host_available_bytes", summary.hostAvailable, + "host_available_percent", summary.hostAvailablePerc, + "pressure_state", summary.currentPressure, + "planned_reclaim_bytes", summary.plannedReclaim, + "applied_reclaim_bytes", summary.appliedReclaim, + "manual_hold_active", summary.manualHoldActive, + ) +} diff --git a/lib/guestmemory/pressure_darwin.go b/lib/guestmemory/pressure_darwin.go index 3e74dad8..ff7d1b59 100644 --- a/lib/guestmemory/pressure_darwin.go +++ b/lib/guestmemory/pressure_darwin.go @@ -6,8 +6,6 @@ import ( "context" "fmt" "os/exec" - "strconv" - "strings" ) type darwinPressureSampler struct{} @@ -39,80 +37,28 @@ func readDarwinVMStat(ctx context.Context) (int64, int64, error) { if err != nil { return 0, 0, fmt.Errorf("run vm_stat: %w", err) } - lines := strings.Split(string(out), "\n") - 
pageSize := int64(4096) - var freePages, speculativePages int64 - for _, line := range lines { - line = strings.TrimSpace(line) - if strings.Contains(line, "page size of") { - parts := strings.Fields(line) - for i := 0; i < len(parts); i++ { - if parts[i] == "of" && i+1 < len(parts) { - n, err := strconv.ParseInt(strings.TrimSuffix(parts[i+1], " bytes)"), 10, 64) - if err == nil && n > 0 { - pageSize = n - } - break - } - } - } - if strings.HasPrefix(line, "Pages free:") { - n, err := parseDarwinPageCount(line) - if err != nil { - return 0, 0, err - } - freePages = n - } - if strings.HasPrefix(line, "Pages speculative:") { - n, err := parseDarwinPageCount(line) - if err != nil { - return 0, 0, err - } - speculativePages = n - } - } memsizeOut, err := exec.CommandContext(ctx, "sysctl", "-n", "hw.memsize").Output() if err != nil { return 0, 0, fmt.Errorf("run sysctl hw.memsize: %w", err) } - total, err := strconv.ParseInt(strings.TrimSpace(string(memsizeOut)), 10, 64) + + total, available, err := parseDarwinVMStatOutput(string(out), string(memsizeOut)) if err != nil { - return 0, 0, fmt.Errorf("parse hw.memsize: %w", err) + return 0, 0, err } - - available := (freePages + speculativePages) * pageSize return total, available, nil } -func parseDarwinPageCount(line string) (int64, error) { - parts := strings.Split(line, ":") - if len(parts) != 2 { - return 0, fmt.Errorf("parse vm_stat line %q", line) - } - value := strings.TrimSpace(strings.TrimSuffix(parts[1], ".")) - return strconv.ParseInt(value, 10, 64) -} - func readDarwinMemoryPressure(ctx context.Context) (bool, error) { out, err := exec.CommandContext(ctx, "memory_pressure", "-Q").Output() if err != nil { return false, fmt.Errorf("run memory_pressure -Q: %w", err) } - for _, line := range strings.Split(string(out), "\n") { - line = strings.TrimSpace(line) - if strings.HasPrefix(line, "System-wide memory free percentage:") { - fields := strings.Fields(line) - if len(fields) == 0 { - break - } - last := 
strings.TrimSuffix(fields[len(fields)-1], "%") - value, err := strconv.ParseInt(last, 10, 64) - if err != nil { - return false, fmt.Errorf("parse memory_pressure free percentage: %w", err) - } - return value <= 10, nil - } + + stressed, err := parseDarwinMemoryPressureOutput(string(out)) + if err != nil { + return false, err } - return false, nil + return stressed, nil } diff --git a/lib/guestmemory/pressure_linux.go b/lib/guestmemory/pressure_linux.go index 25665785..7de4416e 100644 --- a/lib/guestmemory/pressure_linux.go +++ b/lib/guestmemory/pressure_linux.go @@ -3,12 +3,9 @@ package guestmemory import ( - "bufio" "context" "fmt" "os" - "strconv" - "strings" ) type linuxPressureSampler struct{} @@ -38,40 +35,14 @@ func (s *linuxPressureSampler) Sample(ctx context.Context) (HostPressureSample, } func readLinuxMeminfo() (int64, int64, error) { - file, err := os.Open("/proc/meminfo") + data, err := os.ReadFile("/proc/meminfo") if err != nil { - return 0, 0, fmt.Errorf("open /proc/meminfo: %w", err) + return 0, 0, fmt.Errorf("read /proc/meminfo: %w", err) } - defer file.Close() - var total, available int64 - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - fields := strings.Fields(line) - if len(fields) < 2 { - continue - } - switch fields[0] { - case "MemTotal:": - value, err := strconv.ParseInt(fields[1], 10, 64) - if err != nil { - return 0, 0, fmt.Errorf("parse MemTotal: %w", err) - } - total = value * 1024 - case "MemAvailable:": - value, err := strconv.ParseInt(fields[1], 10, 64) - if err != nil { - return 0, 0, fmt.Errorf("parse MemAvailable: %w", err) - } - available = value * 1024 - } - } - if err := scanner.Err(); err != nil { - return 0, 0, fmt.Errorf("scan /proc/meminfo: %w", err) - } - if total <= 0 || available < 0 { - return 0, 0, fmt.Errorf("missing memory totals from /proc/meminfo") + total, available, err := parseLinuxMeminfo(string(data)) + if err != nil { + return 0, 0, err } return total, available, nil } @@ 
-81,19 +52,10 @@ func readLinuxPSI() (bool, error) { if err != nil { return false, fmt.Errorf("read /proc/pressure/memory: %w", err) } - for _, line := range strings.Split(string(data), "\n") { - if strings.HasPrefix(line, "some ") { - fields := strings.Fields(line) - for _, field := range fields[1:] { - if strings.HasPrefix(field, "avg10=") { - value, err := strconv.ParseFloat(strings.TrimPrefix(field, "avg10="), 64) - if err != nil { - return false, fmt.Errorf("parse psi avg10: %w", err) - } - return value > 0, nil - } - } - } + + stressed, err := parseLinuxPSI(string(data)) + if err != nil { + return false, err } - return false, nil + return stressed, nil } diff --git a/lib/guestmemory/pressure_parse.go b/lib/guestmemory/pressure_parse.go new file mode 100644 index 00000000..c0869896 --- /dev/null +++ b/lib/guestmemory/pressure_parse.go @@ -0,0 +1,147 @@ +package guestmemory + +import ( + "bufio" + "fmt" + "strconv" + "strings" +) + +func parseLinuxMeminfo(data string) (int64, int64, error) { + var total, available int64 + var sawTotal, sawAvailable bool + + scanner := bufio.NewScanner(strings.NewReader(data)) + for scanner.Scan() { + line := scanner.Text() + fields := strings.Fields(line) + if len(fields) < 2 { + continue + } + + switch fields[0] { + case "MemTotal:": + value, err := strconv.ParseInt(fields[1], 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("parse MemTotal: %w", err) + } + total = value * 1024 + sawTotal = true + case "MemAvailable:": + value, err := strconv.ParseInt(fields[1], 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("parse MemAvailable: %w", err) + } + available = value * 1024 + sawAvailable = true + } + } + if err := scanner.Err(); err != nil { + return 0, 0, fmt.Errorf("scan meminfo: %w", err) + } + if !sawTotal || !sawAvailable || total <= 0 || available < 0 { + return 0, 0, fmt.Errorf("missing memory totals from /proc/meminfo") + } + + return total, available, nil +} + +func parseLinuxPSI(data string) (bool, error) { + for 
_, line := range strings.Split(data, "\n") { + if !strings.HasPrefix(line, "some ") { + continue + } + + fields := strings.Fields(line) + for _, field := range fields[1:] { + if !strings.HasPrefix(field, "avg10=") { + continue + } + + value, err := strconv.ParseFloat(strings.TrimPrefix(field, "avg10="), 64) + if err != nil { + return false, fmt.Errorf("parse psi avg10: %w", err) + } + return value > 0, nil + } + } + + return false, nil +} + +func parseDarwinVMStatOutput(vmStatOut, memsizeOut string) (int64, int64, error) { + lines := strings.Split(vmStatOut, "\n") + pageSize := int64(4096) + var freePages, speculativePages int64 + + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.Contains(line, "page size of") { + parts := strings.Fields(line) + for i := 0; i < len(parts); i++ { + if parts[i] == "of" && i+1 < len(parts) { + n, err := strconv.ParseInt(strings.TrimSuffix(parts[i+1], " bytes)"), 10, 64) + if err == nil && n > 0 { + pageSize = n + } + break + } + } + } + if strings.HasPrefix(line, "Pages free:") { + n, err := parseDarwinPageCount(line) + if err != nil { + return 0, 0, err + } + freePages = n + } + if strings.HasPrefix(line, "Pages speculative:") { + n, err := parseDarwinPageCount(line) + if err != nil { + return 0, 0, err + } + speculativePages = n + } + } + + total, err := strconv.ParseInt(strings.TrimSpace(memsizeOut), 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("parse hw.memsize: %w", err) + } + + available := (freePages + speculativePages) * pageSize + return total, available, nil +} + +func parseDarwinPageCount(line string) (int64, error) { + parts := strings.Split(line, ":") + if len(parts) != 2 { + return 0, fmt.Errorf("parse vm_stat line %q", line) + } + + value := strings.TrimSpace(strings.TrimSuffix(parts[1], ".")) + return strconv.ParseInt(value, 10, 64) +} + +func parseDarwinMemoryPressureOutput(out string) (bool, error) { + for _, line := range strings.Split(out, "\n") { + line = strings.TrimSpace(line) + 
if !strings.HasPrefix(line, "System-wide memory free percentage:") { + continue + } + + fields := strings.Fields(line) + if len(fields) == 0 { + break + } + + last := strings.TrimSuffix(fields[len(fields)-1], "%") + value, err := strconv.ParseInt(last, 10, 64) + if err != nil { + return false, fmt.Errorf("parse memory_pressure free percentage: %w", err) + } + return value <= 10, nil + } + + return false, nil +} diff --git a/lib/guestmemory/pressure_parse_test.go b/lib/guestmemory/pressure_parse_test.go new file mode 100644 index 00000000..e64d9eab --- /dev/null +++ b/lib/guestmemory/pressure_parse_test.go @@ -0,0 +1,97 @@ +package guestmemory + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseLinuxMeminfo(t *testing.T) { + t.Parallel() + + total, available, err := parseLinuxMeminfo(` +MemTotal: 16384256 kB +MemFree: 1122334 kB +MemAvailable: 9988776 kB +Buffers: 123456 kB +`) + require.NoError(t, err) + assert.Equal(t, int64(16384256*1024), total) + assert.Equal(t, int64(9988776*1024), available) +} + +func TestParseLinuxMeminfoRequiresTotalAndAvailable(t *testing.T) { + t.Parallel() + + _, _, err := parseLinuxMeminfo("MemTotal: 1024 kB\n") + require.Error(t, err) + assert.Contains(t, err.Error(), "missing memory totals") +} + +func TestParseLinuxPSI(t *testing.T) { + t.Parallel() + + stressed, err := parseLinuxPSI(` +some avg10=0.25 avg60=0.12 avg300=0.05 total=12345 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 +`) + require.NoError(t, err) + assert.True(t, stressed) +} + +func TestParseLinuxPSIZeroAvg10IsHealthy(t *testing.T) { + t.Parallel() + + stressed, err := parseLinuxPSI(` +some avg10=0.00 avg60=0.01 avg300=0.10 total=12345 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 +`) + require.NoError(t, err) + assert.False(t, stressed) +} + +func TestParseDarwinVMStatOutput(t *testing.T) { + t.Parallel() + + total, available, err := parseDarwinVMStatOutput(` +Mach Virtual Memory 
Statistics: (page size of 16384 bytes) +Pages free: 100. +Pages active: 10000. +Pages inactive: 2000. +Pages speculative: 50. +`, "17179869184\n") + require.NoError(t, err) + assert.Equal(t, int64(17179869184), total) + assert.Equal(t, int64(150*16384), available) +} + +func TestParseDarwinPageCountRejectsMalformedLine(t *testing.T) { + t.Parallel() + + _, err := parseDarwinPageCount("Pages free 100") + require.Error(t, err) + assert.Contains(t, err.Error(), "parse vm_stat line") +} + +func TestParseDarwinMemoryPressureOutput(t *testing.T) { + t.Parallel() + + stressed, err := parseDarwinMemoryPressureOutput(` +The system has 1234 pages wired down. +System-wide memory free percentage: 8% +`) + require.NoError(t, err) + assert.True(t, stressed) +} + +func TestParseDarwinMemoryPressureOutputHealthy(t *testing.T) { + t.Parallel() + + stressed, err := parseDarwinMemoryPressureOutput(` +The system has 1234 pages wired down. +System-wide memory free percentage: 21% +`) + require.NoError(t, err) + assert.False(t, stressed) +} diff --git a/lib/otel/README.md b/lib/otel/README.md index 525da933..afff2054 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -84,6 +84,22 @@ This keeps pull and push views aligned because both are sourced from the same OT | `hypeman_vmm_api_duration_seconds` | histogram | operation, status | CH API latency | | `hypeman_vmm_api_errors_total` | counter | operation | CH API errors | +### Guest Memory +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_guestmemory_reconcile_total` | counter | trigger, status | Active ballooning reconcile cycles | +| `hypeman_guestmemory_reconcile_duration_seconds` | histogram | trigger, status | Reconcile latency | +| `hypeman_guestmemory_reclaim_actions_total` | counter | trigger, status, hypervisor | Per-VM reclaim action outcomes | +| `hypeman_guestmemory_pressure_transitions_total` | counter | from, to | Host pressure state transitions | +| 
`hypeman_guestmemory_sampler_errors_total` | counter | sampler | Host pressure sampling errors | +| `hypeman_guestmemory_reclaim_bytes` | histogram | trigger, kind | Reclaim byte targets and outcomes | +| `hypeman_guestmemory_host_available_bytes` | gauge | | Last observed host available memory | +| `hypeman_guestmemory_target_reclaim_bytes` | gauge | source | Current reclaim target (auto, manual, effective) | +| `hypeman_guestmemory_applied_reclaim_bytes` | gauge | | Current applied reclaim across eligible VMs | +| `hypeman_guestmemory_manual_hold_active` | gauge | | Whether a manual reclaim hold is active | +| `hypeman_guestmemory_eligible_vms_total` | gauge | | Eligible VM count seen by the controller | +| `hypeman_guestmemory_pressure_state` | gauge | | Current host pressure state (0 healthy, 1 pressure) | + ### Exec | Metric | Type | Labels | Description | |--------|------|--------|-------------| diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 756c13f8..d0931dd9 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -177,7 +177,7 @@ func ProvideGuestMemoryController(instanceManager instances.Manager, cfg *config PerVMCooldown: perVMCooldown, } - return guestmemory.NewController(policy, controllerCfg, &guestMemoryInstanceSource{manager: instanceManager}, log), nil + return guestmemory.NewController(policy, controllerCfg, &guestMemoryInstanceSource{manager: instanceManager}, log.With("component", "guestmemory")), nil } // ProvideVolumeManager provides the volume manager From 4cfa79db53b1e4fd3c9e62efd948674dee6ed885 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 10:17:15 -0400 Subject: [PATCH 07/22] sdk: expose memory reclaim in stainless --- stainless.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/stainless.yaml b/stainless.yaml index 53f516fd..3a7e39d5 100644 --- a/stainless.yaml +++ b/stainless.yaml @@ -153,6 +153,14 @@ resources: passthrough_device: 
"#/components/schemas/PassthroughDevice" methods: get: get /resources + subresources: + memory: + models: + reclaim_request: "#/components/schemas/MemoryReclaimRequest" + reclaim_action: "#/components/schemas/MemoryReclaimAction" + reclaim_response: "#/components/schemas/MemoryReclaimResponse" + methods: + reclaim: post /resources/memory/reclaim builds: models: From 763669b65bf16317f60db23db7e2d35a50a0ac30 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 10:21:25 -0400 Subject: [PATCH 08/22] sdk: flatten reclaim method in stainless --- stainless.yaml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/stainless.yaml b/stainless.yaml index 3a7e39d5..efb9fef7 100644 --- a/stainless.yaml +++ b/stainless.yaml @@ -151,16 +151,12 @@ resources: gpu_resource_status: "#/components/schemas/GPUResourceStatus" gpu_profile: "#/components/schemas/GPUProfile" passthrough_device: "#/components/schemas/PassthroughDevice" + memory_reclaim_request: "#/components/schemas/MemoryReclaimRequest" + memory_reclaim_action: "#/components/schemas/MemoryReclaimAction" + memory_reclaim_response: "#/components/schemas/MemoryReclaimResponse" methods: get: get /resources - subresources: - memory: - models: - reclaim_request: "#/components/schemas/MemoryReclaimRequest" - reclaim_action: "#/components/schemas/MemoryReclaimAction" - reclaim_response: "#/components/schemas/MemoryReclaimResponse" - methods: - reclaim: post /resources/memory/reclaim + reclaim_memory: post /resources/memory/reclaim builds: models: From ac2f9670fff59e8db09a4491b922e3a5d6ad9501 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 10:40:21 -0400 Subject: [PATCH 09/22] fix: address ballooning review feedback --- lib/guestmemory/controller.go | 12 ++++++- lib/guestmemory/controller_test.go | 34 +++++++++++++++++++ .../cloudhypervisor/cloudhypervisor.go | 15 +++++--- .../cloudhypervisor/cloudhypervisor_test.go | 34 +++++++++++++++++++ lib/hypervisor/qemu/qemu.go | 2 
+- 5 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go diff --git a/lib/guestmemory/controller.go b/lib/guestmemory/controller.go index 89d2ea0f..c2024bc1 100644 --- a/lib/guestmemory/controller.go +++ b/lib/guestmemory/controller.go @@ -105,7 +105,15 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua if req.force && !req.dryRun { switch { - case req.requestedReclaim <= 0 || req.holdFor <= 0: + case req.requestedReclaim <= 0 && req.holdFor <= 0: + if state.manualHold != nil { + logFromContext(ctx, c.log).InfoContext(ctx, + "guest memory manual reclaim hold cleared", + "operation", "manual_reclaim", + ) + } + state.manualHold = nil + case req.requestedReclaim > 0 && req.holdFor <= 0: if state.manualHold != nil { logFromContext(ctx, c.log).InfoContext(ctx, "guest memory manual reclaim hold cleared", @@ -195,6 +203,8 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua manualTarget := int64(0) if req.dryRun { manualTarget = req.requestedReclaim + } else if req.force && req.requestedReclaim > 0 { + manualTarget = req.requestedReclaim } else if state.manualHold != nil { manualTarget = state.manualHold.reclaimBytes } diff --git a/lib/guestmemory/controller_test.go b/lib/guestmemory/controller_test.go index 4aaa2659..a65e0bee 100644 --- a/lib/guestmemory/controller_test.go +++ b/lib/guestmemory/controller_test.go @@ -144,3 +144,37 @@ func TestTriggerReclaimRespectsProtectedFloor(t *testing.T) { assert.Equal(t, int64(768*mib), resp.Actions[0].TargetGuestMemoryBytes) assert.Equal(t, int64(256*mib), resp.AppliedReclaimBytes) } + +func TestTriggerReclaimWithoutHoldAppliesRequestedReclaim(t *testing.T) { + const mib = int64(1024 * 1024) + src := &stubSource{ + vms: []BalloonVM{ + {ID: "a", Name: "a", HypervisorType: hypervisor.TypeCloudHypervisor, SocketPath: "a", AssignedMemoryBytes: 1024 * mib}, + }, + } + hv := &stubHypervisor{target: 1024 
* mib, capabilities: hypervisor.Capabilities{SupportsBalloonControl: true}} + c := NewController(Policy{Enabled: true, ReclaimEnabled: true}, ActiveBallooningConfig{ + Enabled: true, + ProtectedFloorPercent: 50, + ProtectedFloorMinBytes: 0, + MinAdjustmentBytes: 1, + PerVMMaxStepBytes: 4096 * mib, + PerVMCooldown: time.Second, + }, src, slog.New(slog.NewTextHandler(io.Discard, nil))).(*controller) + c.sampler = &stubSampler{sample: HostPressureSample{TotalBytes: 1024 * mib, AvailableBytes: 1024 * mib, AvailablePercent: 100}} + c.reconcileMu.newClient = func(t hypervisor.Type, socket string) (hypervisor.Hypervisor, error) { + return hv, nil + } + + resp, err := c.TriggerReclaim(context.Background(), ManualReclaimRequest{ReclaimBytes: 256 * mib, HoldFor: 0}) + require.NoError(t, err) + require.Len(t, resp.Actions, 1) + assert.Equal(t, int64(768*mib), resp.Actions[0].TargetGuestMemoryBytes) + assert.Equal(t, int64(256*mib), resp.AppliedReclaimBytes) + assert.Nil(t, resp.HoldUntil) + + followup, err := c.TriggerReclaim(context.Background(), ManualReclaimRequest{}) + require.NoError(t, err) + assert.Equal(t, int64(0), followup.AppliedReclaimBytes) + assert.Equal(t, int64(1024*mib), hv.target) +} diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go index a3deef6b..0627a0f8 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -228,10 +228,7 @@ func (c *CloudHypervisor) SetTargetGuestMemoryBytes(ctx context.Context, bytes i return hypervisor.ErrNotSupported } - assigned := info.JSON200.Config.Memory.Size - if info.JSON200.MemoryActualSize != nil { - assigned = *info.JSON200.MemoryActualSize + info.JSON200.Config.Balloon.Size - } + assigned := assignedGuestMemoryBytes(info.JSON200) if bytes < 0 || bytes > assigned { return fmt.Errorf("target guest memory %d is outside valid range [0,%d]", bytes, assigned) } @@ -264,6 +261,14 @@ func (c 
*CloudHypervisor) GetTargetGuestMemoryBytes(ctx context.Context) (int64, return value, nil } } - assigned := info.JSON200.Config.Memory.Size + assigned := assignedGuestMemoryBytes(info.JSON200) return assigned - info.JSON200.Config.Balloon.Size, nil } + +func assignedGuestMemoryBytes(info *vmm.VmInfo) int64 { + assigned := info.Config.Memory.Size + if info.MemoryActualSize != nil && info.Config.Balloon != nil { + assigned = *info.MemoryActualSize + info.Config.Balloon.Size + } + return assigned +} diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go new file mode 100644 index 00000000..07dd2a39 --- /dev/null +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go @@ -0,0 +1,34 @@ +package cloudhypervisor + +import ( + "testing" + + "github.com/kernel/hypeman/lib/vmm" + "github.com/stretchr/testify/assert" +) + +func TestAssignedGuestMemoryBytes(t *testing.T) { + t.Run("uses configured memory without hotplug state", func(t *testing.T) { + info := &vmm.VmInfo{ + Config: vmm.VmConfig{ + Memory: &vmm.MemoryConfig{Size: 512}, + Balloon: &vmm.BalloonConfig{Size: 64}, + }, + } + + assert.Equal(t, int64(512), assignedGuestMemoryBytes(info)) + }) + + t.Run("includes hotplugged memory via actual-plus-balloon", func(t *testing.T) { + actual := int64(640) + info := &vmm.VmInfo{ + Config: vmm.VmConfig{ + Memory: &vmm.MemoryConfig{Size: 512}, + Balloon: &vmm.BalloonConfig{Size: 128}, + }, + MemoryActualSize: &actual, + } + + assert.Equal(t, int64(768), assignedGuestMemoryBytes(info)) + }) +} diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index e73c0dda..9af53ae2 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -201,7 +201,7 @@ func (q *QEMU) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { } } - bytes, err := q.client.QueryAssignedMemory() + bytes, err := q.client.QueryBalloon() if err != nil { Remove(q.socketPath) return 0, 
fmt.Errorf("query balloon target: %w", err) From 032e00c5b7bef48e2820da70bf0f39eb708d82c9 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 11:37:20 -0400 Subject: [PATCH 10/22] fix: harden linux guest memory validation --- Makefile | 13 ++- lib/hypervisor/socket_pid_linux.go | 127 ++++++++++++++++++++++++ lib/hypervisor/socket_pid_linux_test.go | 25 +++++ lib/hypervisor/socket_pid_other.go | 11 ++ lib/instances/create.go | 14 +++ lib/instances/guestmemory_linux_test.go | 125 ++++++++++++++++++----- lib/instances/query.go | 56 +++++++++++ lib/instances/restore.go | 1 + 8 files changed, 345 insertions(+), 27 deletions(-) create mode 100644 lib/hypervisor/socket_pid_linux.go create mode 100644 lib/hypervisor/socket_pid_linux_test.go create mode 100644 lib/hypervisor/socket_pid_other.go diff --git a/Makefile b/Makefile index 7264de7f..30fa33f2 100644 --- a/Makefile +++ b/Makefile @@ -217,7 +217,9 @@ lib/system/init/init: lib/system/init/*.go @echo "Building init binary for Linux..." cd lib/system/init && CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o init . 
-build-embedded: lib/system/guest_agent/guest-agent lib/system/init/init +build-embedded: + @$(MAKE) -B lib/system/guest_agent/guest-agent + @$(MAKE) -B lib/system/init/init # Build the binary build: @@ -305,15 +307,18 @@ test-guestmemory-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-ca @TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$$PATH"; \ GUESTMEM_TIMEOUT="$${GUESTMEMORY_TEST_TIMEOUT:-15m}"; \ echo "Running manual guest memory integration tests (CloudHypervisor, QEMU, Firecracker)"; \ - sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \ - go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicy(CloudHypervisor|QEMU|Firecracker)$$' -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances + for TEST_NAME in TestGuestMemoryPolicyCloudHypervisor TestGuestMemoryPolicyQEMU TestGuestMemoryPolicyFirecracker; do \ + echo "Running $$TEST_NAME"; \ + sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \ + go test -count=1 -tags containers_image_openpgp -run="^$$TEST_NAME$$" -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances || exit $$?; \ + done # Manual-only guest memory policy integration test (macOS VZ). 
test-guestmemory-vz: build-embedded sign-vz-shim @echo "Running manual guest memory integration test (VZ)"; \ PATH="/opt/homebrew/opt/e2fsprogs/sbin:$(PATH)" \ HYPEMAN_RUN_GUESTMEMORY_TESTS=1 \ - go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances + go test -count=1 -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances # Generate JWT token for testing # Usage: make gen-jwt [USER_ID=test-user] diff --git a/lib/hypervisor/socket_pid_linux.go b/lib/hypervisor/socket_pid_linux.go new file mode 100644 index 00000000..496a5dbc --- /dev/null +++ b/lib/hypervisor/socket_pid_linux.go @@ -0,0 +1,127 @@ +//go:build linux + +package hypervisor + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" +) + +// ResolveProcessPID finds the process currently holding the listening Unix +// socket for the given hypervisor control path. +func ResolveProcessPID(socketPath string) (int, error) { + socketRef, err := socketRefForPath(socketPath) + if err == nil { + if pid, err := pidBySocketRef(socketRef); err == nil { + return pid, nil + } + } + + if pid, err := pidByCmdline(socketPath); err == nil { + return pid, nil + } + + if err != nil { + return 0, err + } + + return 0, fmt.Errorf("resolve process pid for socket %s: no owning process found", socketPath) +} + +func pidBySocketRef(socketRef string) (int, error) { + procEntries, err := os.ReadDir("/proc") + if err != nil { + return 0, fmt.Errorf("read /proc: %w", err) + } + + for _, entry := range procEntries { + if !entry.IsDir() { + continue + } + + pid, err := strconv.Atoi(entry.Name()) + if err != nil { + continue + } + + fdEntries, err := os.ReadDir(filepath.Join("/proc", entry.Name(), "fd")) + if err != nil { + continue + } + for _, fdEntry := range fdEntries { + target, err := os.Readlink(filepath.Join("/proc", entry.Name(), "fd", fdEntry.Name())) + if err != nil { + continue + } + if 
strings.TrimSpace(target) == socketRef { + return pid, nil + } + } + } + + return 0, fmt.Errorf("resolve process pid for %s: no owning process found", socketRef) +} + +func pidByCmdline(socketPath string) (int, error) { + procEntries, err := os.ReadDir("/proc") + if err != nil { + return 0, fmt.Errorf("read /proc: %w", err) + } + + for _, entry := range procEntries { + if !entry.IsDir() { + continue + } + + pid, err := strconv.Atoi(entry.Name()) + if err != nil { + continue + } + + cmdline, err := os.ReadFile(filepath.Join("/proc", entry.Name(), "cmdline")) + if err != nil || len(cmdline) == 0 { + continue + } + if strings.Contains(string(cmdline), socketPath) { + return pid, nil + } + } + + return 0, fmt.Errorf("resolve process pid for socket %s: no matching command line found", socketPath) +} + +func socketRefForPath(socketPath string) (string, error) { + file, err := os.Open("/proc/net/unix") + if err != nil { + return "", fmt.Errorf("open /proc/net/unix: %w", err) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + fields := strings.Fields(scanner.Text()) + if len(fields) < 7 { + continue + } + if fields[0] == "Num" { + continue + } + path := fields[len(fields)-1] + if path != socketPath { + continue + } + inode := fields[6] + if inode == "" { + break + } + return fmt.Sprintf("socket:[%s]", inode), nil + } + if err := scanner.Err(); err != nil { + return "", fmt.Errorf("scan /proc/net/unix: %w", err) + } + return "", fmt.Errorf("resolve process pid for socket %s: socket inode not found", socketPath) +} diff --git a/lib/hypervisor/socket_pid_linux_test.go b/lib/hypervisor/socket_pid_linux_test.go new file mode 100644 index 00000000..27052453 --- /dev/null +++ b/lib/hypervisor/socket_pid_linux_test.go @@ -0,0 +1,25 @@ +//go:build linux + +package hypervisor + +import ( + "net" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestResolveProcessPID(t *testing.T) { + tmpDir := t.TempDir() + 
socketPath := filepath.Join(tmpDir, "test.sock") + + listener, err := net.Listen("unix", socketPath) + require.NoError(t, err) + defer listener.Close() + + pid, err := ResolveProcessPID(socketPath) + require.NoError(t, err) + require.Equal(t, os.Getpid(), pid) +} diff --git a/lib/hypervisor/socket_pid_other.go b/lib/hypervisor/socket_pid_other.go new file mode 100644 index 00000000..75db657e --- /dev/null +++ b/lib/hypervisor/socket_pid_other.go @@ -0,0 +1,11 @@ +//go:build !linux + +package hypervisor + +import "fmt" + +// ResolveProcessPID is only implemented on Linux, where the project relies on +// /proc socket metadata for runtime PID discovery. +func ResolveProcessPID(socketPath string) (int, error) { + return 0, fmt.Errorf("resolve process pid for socket %s: not supported on this platform", socketPath) +} diff --git a/lib/instances/create.go b/lib/instances/create.go index d423dc0e..9c9e85d3 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -3,6 +3,7 @@ package instances import ( "context" "fmt" + "log/slog" "path/filepath" "strings" "time" @@ -641,6 +642,7 @@ func (m *manager) startAndBootVM( if err != nil { return fmt.Errorf("start vm: %w", err) } + pid = resolveRuntimeHypervisorPID(log, stored.SocketPath, pid) // Store the PID for later cleanup stored.HypervisorPID = &pid @@ -659,6 +661,18 @@ func (m *manager) startAndBootVM( return nil } +func resolveRuntimeHypervisorPID(log *slog.Logger, socketPath string, fallbackPID int) int { + if processExists(fallbackPID) { + return fallbackPID + } + pid, err := hypervisor.ResolveProcessPID(socketPath) + if err != nil { + log.Debug("using fallback hypervisor pid", "socket_path", socketPath, "pid", fallbackPID, "error", err) + return fallbackPID + } + return pid +} + // buildHypervisorConfig creates a hypervisor-agnostic VM configuration func (m *manager) buildHypervisorConfig(ctx context.Context, inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) (hypervisor.VMConfig, 
error) { // Get system file paths diff --git a/lib/instances/guestmemory_linux_test.go b/lib/instances/guestmemory_linux_test.go index 7d8d8a03..a696e209 100644 --- a/lib/instances/guestmemory_linux_test.go +++ b/lib/instances/guestmemory_linux_test.go @@ -45,7 +45,10 @@ func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { Cmd: []string{guestMemoryIdleScript()}, }) require.NoError(t, err) - t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + t.Cleanup(func() { + logInstanceArtifactsOnFailure(t, mgr, inst.Id) + _ = mgr.DeleteInstance(ctx, inst.Id) + }) require.NoError(t, waitForVMReady(ctx, inst.SocketPath, 10*time.Second)) @@ -64,8 +67,7 @@ func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { assert.True(t, infoResp.JSON200.Config.Balloon.DeflateOnOom != nil && *infoResp.JSON200.Config.Balloon.DeflateOnOom) assert.True(t, infoResp.JSON200.Config.Balloon.FreePageReporting != nil && *infoResp.JSON200.Config.Balloon.FreePageReporting) - pid := requireHypervisorPID(t, ctx, mgr, inst.Id) - assertLowIdleHostMemoryFootprint(t, "cloud-hypervisor", pid, 512*1024) + assertLowIdleHostMemoryFootprint(t, ctx, mgr, inst.Id, "cloud-hypervisor", 512*1024) assertActiveBallooningLifecycle(t, ctx, inst) } @@ -92,7 +94,10 @@ func TestGuestMemoryPolicyQEMU(t *testing.T) { Cmd: []string{guestMemoryIdleScript()}, }) require.NoError(t, err) - t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + t.Cleanup(func() { + logInstanceArtifactsOnFailure(t, mgr, inst.Id) + _ = mgr.DeleteInstance(ctx, inst.Id) + }) require.NoError(t, waitForQEMUReady(ctx, inst.SocketPath, 10*time.Second)) @@ -104,7 +109,7 @@ func TestGuestMemoryPolicyQEMU(t *testing.T) { assert.Contains(t, joined, "init_on_free=0") assert.Contains(t, joined, "virtio-balloon-pci", "qemu cmdline should include virtio balloon device") - assertLowIdleHostMemoryFootprint(t, "qemu", pid, 640*1024) + assertLowIdleHostMemoryFootprint(t, ctx, mgr, inst.Id, "qemu", 640*1024) assertActiveBallooningLifecycle(t, ctx, 
inst) } @@ -131,7 +136,10 @@ func TestGuestMemoryPolicyFirecracker(t *testing.T) { Cmd: []string{guestMemoryIdleScript()}, }) require.NoError(t, err) - t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + t.Cleanup(func() { + logInstanceArtifactsOnFailure(t, mgr, inst.Id) + _ = mgr.DeleteInstance(ctx, inst.Id) + }) vmCfg, err := getFirecrackerVMConfig(inst.SocketPath) require.NoError(t, err) @@ -141,8 +149,7 @@ func TestGuestMemoryPolicyFirecracker(t *testing.T) { assert.True(t, vmCfg.Balloon.FreePageHinting) assert.True(t, vmCfg.Balloon.FreePageReporting) - pid := requireHypervisorPID(t, ctx, mgr, inst.Id) - assertLowIdleHostMemoryFootprint(t, "firecracker", pid, 512*1024) + assertLowIdleHostMemoryFootprint(t, ctx, mgr, inst.Id, "firecracker", 512*1024) assertActiveBallooningLifecycle(t, ctx, inst) } @@ -150,6 +157,25 @@ func guestMemoryIdleScript() string { return "set -e; sleep 180" } +func logInstanceArtifactsOnFailure(t *testing.T, mgr *manager, instanceID string) { + t.Helper() + if !t.Failed() { + return + } + + for _, path := range []string{ + mgr.paths.InstanceVMMLog(instanceID), + mgr.paths.InstanceAppLog(instanceID), + } { + data, err := os.ReadFile(path) + if err != nil { + t.Logf("failed to read %s: %v", path, err) + continue + } + t.Logf("%s:\n%s", path, string(data)) + } +} + func forceEnableGuestMemoryPolicyForTest(mgr *manager) { mgr.guestMemoryPolicy = guestmemory.Policy{ Enabled: true, @@ -185,11 +211,17 @@ func requireHypervisorPID(t *testing.T, ctx context.Context, mgr *manager, insta t.Helper() inst, err := mgr.GetInstance(ctx, instanceID) require.NoError(t, err) + if inst.HypervisorPID != nil && processExists(*inst.HypervisorPID) { + return *inst.HypervisorPID + } + if pid, err := hypervisor.ResolveProcessPID(inst.SocketPath); err == nil { + return pid + } require.NotNil(t, inst.HypervisorPID) return *inst.HypervisorPID } -func assertLowIdleHostMemoryFootprint(t *testing.T, hypervisorName string, pid int, maxPSSKB int64) { +func 
assertLowIdleHostMemoryFootprint(t *testing.T, ctx context.Context, mgr *manager, instanceID string, hypervisorName string, maxPSSKB int64) { t.Helper() // Give the guest a short settle window, then sample host memory. @@ -197,8 +229,14 @@ func assertLowIdleHostMemoryFootprint(t *testing.T, hypervisorName string, pid i var pssSamplesKB []int64 var rssSamplesKB []int64 for i := 0; i < 6; i++ { - pssSamplesKB = append(pssSamplesKB, mustReadPSSKB(t, pid)) - rssSamplesKB = append(rssSamplesKB, mustReadRSSBytes(t, pid)/1024) + pid := requireHypervisorPID(t, ctx, mgr, instanceID) + pssKB, rssKB, ok := readMemorySampleKB(t, ctx, mgr, instanceID, pid) + if !ok { + t.Logf("skipping host memory footprint assertion for %s: unable to read live PSS sample", hypervisorName) + return + } + pssSamplesKB = append(pssSamplesKB, pssKB) + rssSamplesKB = append(rssSamplesKB, rssKB) time.Sleep(1 * time.Second) } @@ -221,42 +259,83 @@ func assertLowIdleHostMemoryFootprint(t *testing.T, hypervisorName string, pid i ) } +func readMemorySampleKB(t *testing.T, ctx context.Context, mgr *manager, instanceID string, initialPID int) (int64, int64, bool) { + t.Helper() + + pid := initialPID + for attempt := 0; attempt < 3; attempt++ { + pssKB, err := readPSSKB(pid) + if err == nil { + rssBytes, err := readRSSBytes(pid) + if err == nil { + return pssKB, rssBytes / 1024, true + } + } + time.Sleep(100 * time.Millisecond) + pid = requireHypervisorPID(t, ctx, mgr, instanceID) + } + + return 0, 0, false +} + func mustReadRSSBytes(t *testing.T, pid int) int64 { t.Helper() + rssBytes, err := readRSSBytes(pid) + require.NoError(t, err) + return rssBytes +} + +func readRSSBytes(pid int) (int64, error) { statusPath := fmt.Sprintf("/proc/%d/status", pid) data, err := os.ReadFile(statusPath) - require.NoError(t, err) + if err != nil { + return 0, err + } for _, line := range strings.Split(string(data), "\n") { if strings.HasPrefix(line, "VmRSS:") { fields := strings.Fields(line) - require.GreaterOrEqual(t, 
len(fields), 2) + if len(fields) < 2 { + return 0, fmt.Errorf("VmRSS line malformed in %s", statusPath) + } kb, err := strconv.ParseInt(fields[1], 10, 64) - require.NoError(t, err) - return kb * 1024 + if err != nil { + return 0, err + } + return kb * 1024, nil } } - t.Fatalf("VmRSS not found in %s", statusPath) - return 0 + return 0, fmt.Errorf("VmRSS not found in %s", statusPath) } func mustReadPSSKB(t *testing.T, pid int) int64 { t.Helper() + pssKB, err := readPSSKB(pid) + require.NoError(t, err) + return pssKB +} + +func readPSSKB(pid int) (int64, error) { smapsRollupPath := fmt.Sprintf("/proc/%d/smaps_rollup", pid) data, err := os.ReadFile(smapsRollupPath) - require.NoError(t, err) + if err != nil { + return 0, err + } for _, line := range strings.Split(string(data), "\n") { if strings.HasPrefix(line, "Pss:") { fields := strings.Fields(line) - require.GreaterOrEqual(t, len(fields), 2) + if len(fields) < 2 { + return 0, fmt.Errorf("Pss line malformed in %s", smapsRollupPath) + } kb, err := strconv.ParseInt(fields[1], 10, 64) - require.NoError(t, err) - return kb + if err != nil { + return 0, err + } + return kb, nil } } - t.Fatalf("Pss not found in %s", smapsRollupPath) - return 0 + return 0, fmt.Errorf("Pss not found in %s", smapsRollupPath) } type firecrackerVMConfig struct { diff --git a/lib/instances/query.go b/lib/instances/query.go index ed59f5e1..96d85f5f 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -7,9 +7,11 @@ import ( "io" "os" "path/filepath" + "runtime" "slices" "strconv" "strings" + "syscall" "time" "github.com/kernel/hypeman/lib/hypervisor" @@ -337,6 +339,7 @@ func (m *manager) toInstanceWithStateDerivation(ctx context.Context, meta *metad HasSnapshot: m.hasSnapshot(meta.StoredMetadata.DataDir), BootMarkersHydrated: result.BootMarkersHydrated, } + refreshHypervisorPID(&inst.StoredMetadata, result.State) // If VM is stopped and exit info isn't persisted yet, populate in-memory // from the serial console log. 
This is read-only -- no metadata writes. @@ -351,6 +354,59 @@ func (m *manager) toInstanceWithStateDerivation(ctx context.Context, meta *metad return inst } +func refreshHypervisorPID(stored *StoredMetadata, state State) { + if !state.RequiresVMM() && state != StateUnknown { + return + } + if stored.HypervisorPID != nil && processExists(*stored.HypervisorPID) { + return + } + if stored.SocketPath == "" { + return + } + if pid, err := hypervisor.ResolveProcessPID(stored.SocketPath); err == nil { + stored.HypervisorPID = &pid + return + } +} + +func processExists(pid int) bool { + if pid <= 0 { + return false + } + err := syscall.Kill(pid, 0) + if err != nil && err != syscall.EPERM { + return false + } + if runtime.GOOS != "linux" { + return true + } + state, err := readLinuxProcessState(pid) + if err != nil { + return true + } + return state != "Z" +} + +func readLinuxProcessState(pid int) (string, error) { + statusPath := filepath.Join("/proc", strconv.Itoa(pid), "status") + data, err := os.ReadFile(statusPath) + if err != nil { + return "", err + } + for _, line := range strings.Split(string(data), "\n") { + if !strings.HasPrefix(line, "State:") { + continue + } + fields := strings.Fields(line) + if len(fields) < 2 { + return "", fmt.Errorf("malformed process state in %s", statusPath) + } + return fields[1], nil + } + return "", fmt.Errorf("process state missing from %s", statusPath) +} + // parseExitSentinel reads the last lines of the serial console log to find the // HYPEMAN-EXIT sentinel written by init before shutdown. // Returns the exit code, message, and whether a sentinel was found. 
diff --git a/lib/instances/restore.go b/lib/instances/restore.go index 369525fe..bd6ef5aa 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -316,6 +316,7 @@ func (m *manager) restoreFromSnapshot( if err != nil { return 0, nil, fmt.Errorf("restore vm: %w", err) } + pid = resolveRuntimeHypervisorPID(log, stored.SocketPath, pid) log.DebugContext(ctx, "VM restored from snapshot successfully", "instance_id", stored.Id, "pid", pid) return pid, hv, nil From a25a3e0ec0faff077c514f70ccc519da802d51bf Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 11:41:06 -0400 Subject: [PATCH 11/22] fix: address remaining balloon review issues --- .../cloudhypervisor/cloudhypervisor.go | 13 +++++--- lib/hypervisor/firecracker/firecracker.go | 14 ++++++++- .../firecracker/firecracker_test.go | 8 +++++ lib/hypervisor/qemu/qemu.go | 13 +++++--- lib/hypervisor/socket_cache_key.go | 20 +++++++++++++ lib/hypervisor/socket_cache_key_test.go | 30 +++++++++++++++++++ 6 files changed, 89 insertions(+), 9 deletions(-) create mode 100644 lib/hypervisor/socket_cache_key.go create mode 100644 lib/hypervisor/socket_cache_key_test.go diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go index 0627a0f8..88817340 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -20,6 +20,11 @@ type CloudHypervisor struct { var balloonTargetCache sync.Map +func clearBalloonTargetCache(socketPath string) { + balloonTargetCache.Delete(socketPath) + balloonTargetCache.Delete(hypervisor.SocketCacheKey(socketPath)) +} + // New creates a new Cloud Hypervisor client for an existing VMM socket. 
func New(socketPath string) (*CloudHypervisor, error) { client, err := vmm.NewVMM(socketPath) @@ -63,7 +68,7 @@ func (c *CloudHypervisor) DeleteVM(ctx context.Context) error { if resp.StatusCode() != 204 { return fmt.Errorf("delete vm failed with status %d: %s", resp.StatusCode(), string(resp.Body)) } - balloonTargetCache.Delete(c.socketPath) + clearBalloonTargetCache(c.socketPath) return nil } @@ -77,7 +82,7 @@ func (c *CloudHypervisor) Shutdown(ctx context.Context) error { if resp.StatusCode() != 204 { return fmt.Errorf("shutdown vmm failed with status %d", resp.StatusCode()) } - balloonTargetCache.Delete(c.socketPath) + clearBalloonTargetCache(c.socketPath) return nil } @@ -241,7 +246,7 @@ func (c *CloudHypervisor) SetTargetGuestMemoryBytes(ctx context.Context, bytes i if resp.StatusCode() != 204 { return fmt.Errorf("set balloon target failed with status %d", resp.StatusCode()) } - balloonTargetCache.Store(c.socketPath, bytes) + balloonTargetCache.Store(hypervisor.SocketCacheKey(c.socketPath), bytes) return nil } @@ -256,7 +261,7 @@ func (c *CloudHypervisor) GetTargetGuestMemoryBytes(ctx context.Context) (int64, if info.JSON200.Config.Balloon == nil { return 0, hypervisor.ErrNotSupported } - if target, ok := balloonTargetCache.Load(c.socketPath); ok { + if target, ok := balloonTargetCache.Load(hypervisor.SocketCacheKey(c.socketPath)); ok { if value, ok := target.(int64); ok { return value, nil } diff --git a/lib/hypervisor/firecracker/firecracker.go b/lib/hypervisor/firecracker/firecracker.go index 5fd6ce93..ad7f5185 100644 --- a/lib/hypervisor/firecracker/firecracker.go +++ b/lib/hypervisor/firecracker/firecracker.go @@ -131,7 +131,7 @@ func (f *Firecracker) SetTargetGuestMemoryBytes(ctx context.Context, bytes int64 if err != nil { return err } - desiredBalloonMiB := cfg.MachineConfig.MemSizeMiB - bytesToMiB(bytes) + desiredBalloonMiB := cfg.MachineConfig.MemSizeMiB - guestTargetBytesToMiB(bytes) if desiredBalloonMiB < 0 { return fmt.Errorf("target guest memory 
%d exceeds configured memory %d MiB", bytes, cfg.MachineConfig.MemSizeMiB) } @@ -260,6 +260,18 @@ func (f *Firecracker) getVMConfig(ctx context.Context) (*firecrackerVMConfig, er return &cfg, nil } +func guestTargetBytesToMiB(bytes int64) int64 { + if bytes <= 0 { + return 0 + } + const mib = 1024 * 1024 + out := bytes / mib + if bytes%mib != 0 { + out++ + } + return out +} + func (f *Firecracker) do(ctx context.Context, method, path string, reqBody any, expectedStatus ...int) ([]byte, error) { var bodyReader io.Reader if reqBody != nil { diff --git a/lib/hypervisor/firecracker/firecracker_test.go b/lib/hypervisor/firecracker/firecracker_test.go index 8f79cb32..920b255c 100644 --- a/lib/hypervisor/firecracker/firecracker_test.go +++ b/lib/hypervisor/firecracker/firecracker_test.go @@ -24,3 +24,11 @@ func TestMapVMState(t *testing.T) { _, err = mapVMState("Shutdown") require.Error(t, err) } + +func TestGuestTargetBytesToMiB(t *testing.T) { + assert.Equal(t, int64(0), guestTargetBytesToMiB(0)) + assert.Equal(t, int64(0), guestTargetBytesToMiB(-1)) + assert.Equal(t, int64(1), guestTargetBytesToMiB(1)) + assert.Equal(t, int64(1), guestTargetBytesToMiB(1024*1024)) + assert.Equal(t, int64(2), guestTargetBytesToMiB(1024*1024+1)) +} diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index 9af53ae2..cd0aaafa 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -20,6 +20,11 @@ type QEMU struct { var balloonTargetCache sync.Map +func clearBalloonTargetCache(socketPath string) { + balloonTargetCache.Delete(socketPath) + balloonTargetCache.Delete(hypervisor.SocketCacheKey(socketPath)) +} + // New returns a QEMU client for the given socket path. // Uses a connection pool to ensure only one connection per socket exists. 
func New(socketPath string) (*QEMU, error) { @@ -64,7 +69,7 @@ func (q *QEMU) DeleteVM(ctx context.Context) error { Remove(q.socketPath) return err } - balloonTargetCache.Delete(q.socketPath) + clearBalloonTargetCache(q.socketPath) return nil } @@ -76,7 +81,7 @@ func (q *QEMU) Shutdown(ctx context.Context) error { } // Connection is gone after quit, remove from pool Remove(q.socketPath) - balloonTargetCache.Delete(q.socketPath) + clearBalloonTargetCache(q.socketPath) return nil } @@ -190,12 +195,12 @@ func (q *QEMU) SetTargetGuestMemoryBytes(ctx context.Context, bytes int64) error Remove(q.socketPath) return fmt.Errorf("set balloon target: %w", err) } - balloonTargetCache.Store(q.socketPath, bytes) + balloonTargetCache.Store(hypervisor.SocketCacheKey(q.socketPath), bytes) return nil } func (q *QEMU) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { - if target, ok := balloonTargetCache.Load(q.socketPath); ok { + if target, ok := balloonTargetCache.Load(hypervisor.SocketCacheKey(q.socketPath)); ok { if value, ok := target.(int64); ok { return value, nil } diff --git a/lib/hypervisor/socket_cache_key.go b/lib/hypervisor/socket_cache_key.go new file mode 100644 index 00000000..efd31cd0 --- /dev/null +++ b/lib/hypervisor/socket_cache_key.go @@ -0,0 +1,20 @@ +package hypervisor + +import ( + "fmt" + "os" + "syscall" +) + +// SocketCacheKey returns a cache key that changes when a Unix socket path is +// recreated, preventing stale state from being reused across VM restarts. 
+func SocketCacheKey(socketPath string) string { + info, err := os.Stat(socketPath) + if err != nil { + return socketPath + } + if stat, ok := info.Sys().(*syscall.Stat_t); ok { + return fmt.Sprintf("%s:%d:%d", socketPath, stat.Dev, stat.Ino) + } + return socketPath +} diff --git a/lib/hypervisor/socket_cache_key_test.go b/lib/hypervisor/socket_cache_key_test.go new file mode 100644 index 00000000..8a7b0eed --- /dev/null +++ b/lib/hypervisor/socket_cache_key_test.go @@ -0,0 +1,30 @@ +package hypervisor + +import ( + "fmt" + "net" + "os" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestSocketCacheKeyChangesWhenSocketIsRecreated(t *testing.T) { + socketPath := fmt.Sprintf("/tmp/hypeman-socket-key-%d.sock", time.Now().UnixNano()) + t.Cleanup(func() { _ = os.Remove(socketPath) }) + + listener, err := net.Listen("unix", socketPath) + require.NoError(t, err) + firstKey := SocketCacheKey(socketPath) + require.NotEmpty(t, firstKey) + require.NoError(t, listener.Close()) + + listener, err = net.Listen("unix", socketPath) + require.NoError(t, err) + defer listener.Close() + secondKey := SocketCacheKey(socketPath) + require.NotEmpty(t, secondKey) + + require.NotEqual(t, firstKey, secondKey) +} From 1525c468779553f3362fb94b276df30f0b7c6169 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 12:38:55 -0400 Subject: [PATCH 12/22] fix: tighten active ballooning threshold handling --- cmd/api/config/config.go | 22 ++++++++++++++-------- cmd/api/config/config_test.go | 19 +++++++++++++++++++ lib/guestmemory/controller_test.go | 2 ++ lib/guestmemory/planner.go | 8 ++++++-- lib/providers/providers.go | 12 ++++++++++-- 5 files changed, 51 insertions(+), 12 deletions(-) diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index f9e25b1d..9a3eb863 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -475,15 +475,11 @@ func (c *Config) Validate() error { if c.Hypervisor.Memory.KernelPageInitMode != "performance" 
&& c.Hypervisor.Memory.KernelPageInitMode != "hardened" { return fmt.Errorf("hypervisor.memory.kernel_page_init_mode must be one of {performance,hardened}, got %q", c.Hypervisor.Memory.KernelPageInitMode) } - if c.Hypervisor.Memory.ActiveBallooning.PollInterval != "" { - if _, err := time.ParseDuration(c.Hypervisor.Memory.ActiveBallooning.PollInterval); err != nil { - return fmt.Errorf("hypervisor.memory.active_ballooning.poll_interval must be a valid duration, got %q: %w", c.Hypervisor.Memory.ActiveBallooning.PollInterval, err) - } + if err := validateDuration("hypervisor.memory.active_ballooning.poll_interval", c.Hypervisor.Memory.ActiveBallooning.PollInterval); err != nil { + return err } - if c.Hypervisor.Memory.ActiveBallooning.PerVmCooldown != "" { - if _, err := time.ParseDuration(c.Hypervisor.Memory.ActiveBallooning.PerVmCooldown); err != nil { - return fmt.Errorf("hypervisor.memory.active_ballooning.per_vm_cooldown must be a valid duration, got %q: %w", c.Hypervisor.Memory.ActiveBallooning.PerVmCooldown, err) - } + if err := validateDuration("hypervisor.memory.active_ballooning.per_vm_cooldown", c.Hypervisor.Memory.ActiveBallooning.PerVmCooldown); err != nil { + return err } if err := validateByteSize("hypervisor.memory.active_ballooning.protected_floor_min_bytes", c.Hypervisor.Memory.ActiveBallooning.ProtectedFloorMinBytes); err != nil { return err @@ -520,3 +516,13 @@ func validateByteSize(field string, value string) error { } return nil } + +func validateDuration(field string, value string) error { + if strings.TrimSpace(value) == "" { + return fmt.Errorf("%s must not be empty", field) + } + if _, err := time.ParseDuration(value); err != nil { + return fmt.Errorf("%s must be a valid duration, got %q: %w", field, value, err) + } + return nil +} diff --git a/cmd/api/config/config_test.go b/cmd/api/config/config_test.go index 97828db5..8d32f8d7 100644 --- a/cmd/api/config/config_test.go +++ b/cmd/api/config/config_test.go @@ -3,6 +3,7 @@ package config 
import ( "os" "path/filepath" + "strings" "testing" ) @@ -83,3 +84,21 @@ func TestValidateRejectsInvalidVMLabelBudget(t *testing.T) { t.Fatalf("expected validation error for invalid vm label budget") } } + +func TestValidateRejectsEmptyActiveBallooningDurations(t *testing.T) { + cfg := defaultConfig() + cfg.Hypervisor.Memory.ActiveBallooning.PollInterval = " " + + err := cfg.Validate() + if err == nil || !strings.Contains(err.Error(), "poll_interval must not be empty") { + t.Fatalf("expected poll_interval empty validation error, got %v", err) + } + + cfg = defaultConfig() + cfg.Hypervisor.Memory.ActiveBallooning.PerVmCooldown = "" + + err = cfg.Validate() + if err == nil || !strings.Contains(err.Error(), "per_vm_cooldown must not be empty") { + t.Fatalf("expected per_vm_cooldown empty validation error, got %v", err) + } +} diff --git a/lib/guestmemory/controller_test.go b/lib/guestmemory/controller_test.go index a65e0bee..d8d1ed0b 100644 --- a/lib/guestmemory/controller_test.go +++ b/lib/guestmemory/controller_test.go @@ -113,7 +113,9 @@ func TestPressureStateUsesHysteresis(t *testing.T) { cfg.PressureLowWatermarkAvailablePercent = 15 assert.Equal(t, HostPressureStatePressure, nextPressureState(HostPressureStateHealthy, cfg, HostPressureSample{AvailablePercent: 9})) + assert.Equal(t, HostPressureStateHealthy, nextPressureState(HostPressureStateHealthy, cfg, HostPressureSample{AvailablePercent: 10.9})) assert.Equal(t, HostPressureStatePressure, nextPressureState(HostPressureStatePressure, cfg, HostPressureSample{AvailablePercent: 12})) + assert.Equal(t, HostPressureStatePressure, nextPressureState(HostPressureStatePressure, cfg, HostPressureSample{AvailablePercent: 14.9})) assert.Equal(t, HostPressureStateHealthy, nextPressureState(HostPressureStatePressure, cfg, HostPressureSample{AvailablePercent: 16})) } diff --git a/lib/guestmemory/planner.go b/lib/guestmemory/planner.go index 2a351536..14c8c1c7 100644 --- a/lib/guestmemory/planner.go +++ 
b/lib/guestmemory/planner.go @@ -63,14 +63,18 @@ func protectedFloorBytes(cfg ActiveBallooningConfig, assigned int64) int64 { } func nextPressureState(current HostPressureState, cfg ActiveBallooningConfig, sample HostPressureSample) HostPressureState { + availablePercent := sample.AvailablePercent + highWatermark := float64(cfg.PressureHighWatermarkAvailablePercent) + lowWatermark := float64(cfg.PressureLowWatermarkAvailablePercent) + switch current { case HostPressureStatePressure: - if int(sample.AvailablePercent) >= cfg.PressureLowWatermarkAvailablePercent && !sample.Stressed { + if availablePercent >= lowWatermark && !sample.Stressed { return HostPressureStateHealthy } return HostPressureStatePressure default: - if int(sample.AvailablePercent) <= cfg.PressureHighWatermarkAvailablePercent || sample.Stressed { + if availablePercent <= highWatermark || sample.Stressed { return HostPressureStatePressure } return HostPressureStateHealthy diff --git a/lib/providers/providers.go b/lib/providers/providers.go index d0931dd9..501f738d 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -137,11 +137,11 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima // ProvideGuestMemoryController provides the active ballooning controller. 
func ProvideGuestMemoryController(instanceManager instances.Manager, cfg *config.Config, log *slog.Logger) (guestmemory.Controller, error) { - pollInterval, err := time.ParseDuration(cfg.Hypervisor.Memory.ActiveBallooning.PollInterval) + pollInterval, err := parseRequiredDuration(cfg.Hypervisor.Memory.ActiveBallooning.PollInterval) if err != nil { return nil, fmt.Errorf("parse active ballooning poll interval: %w", err) } - perVMCooldown, err := time.ParseDuration(cfg.Hypervisor.Memory.ActiveBallooning.PerVmCooldown) + perVMCooldown, err := parseRequiredDuration(cfg.Hypervisor.Memory.ActiveBallooning.PerVmCooldown) if err != nil { return nil, fmt.Errorf("parse active ballooning per-vm cooldown: %w", err) } @@ -196,6 +196,14 @@ func ProvideVolumeManager(p *paths.Paths, cfg *config.Config) (volumes.Manager, return volumes.NewManager(p, maxTotalVolumeStorage, meter), nil } +func parseRequiredDuration(value string) (time.Duration, error) { + value = strings.TrimSpace(value) + if value == "" { + return 0, fmt.Errorf("must not be empty") + } + return time.ParseDuration(value) +} + // ProvideRegistry provides the OCI registry for image push func ProvideRegistry(p *paths.Paths, imageManager images.Manager) (*registry.Registry, error) { return registry.New(p, imageManager) From a3996fb7a3c700cf3769efaf2934de0774d04faa Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 12:50:07 -0400 Subject: [PATCH 13/22] fix: align balloon defaults and qemu qmp helpers --- cmd/api/config/config.go | 6 +++--- cmd/api/config/config_test.go | 28 ++++++++++++++++++++++++++++ lib/hypervisor/qemu/qmp.go | 14 -------------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 9a3eb863..2179d97e 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -352,9 +352,9 @@ func defaultConfig() *Config { PressureHighWatermarkAvailablePercent: 10, PressureLowWatermarkAvailablePercent: 15, 
ProtectedFloorPercent: 50, - ProtectedFloorMinBytes: "512MB", - MinAdjustmentBytes: "64MB", - PerVmMaxStepBytes: "256MB", + ProtectedFloorMinBytes: "536870912", + MinAdjustmentBytes: "67108864", + PerVmMaxStepBytes: "268435456", PerVmCooldown: "5s", }, }, diff --git a/cmd/api/config/config_test.go b/cmd/api/config/config_test.go index 8d32f8d7..0429ac49 100644 --- a/cmd/api/config/config_test.go +++ b/cmd/api/config/config_test.go @@ -5,6 +5,9 @@ import ( "path/filepath" "strings" "testing" + + "github.com/c2h5oh/datasize" + "github.com/kernel/hypeman/lib/guestmemory" ) func TestDefaultConfigIncludesMetricsSettings(t *testing.T) { @@ -102,3 +105,28 @@ func TestValidateRejectsEmptyActiveBallooningDurations(t *testing.T) { t.Fatalf("expected per_vm_cooldown empty validation error, got %v", err) } } + +func TestDefaultConfigActiveBallooningMatchesGoDefaults(t *testing.T) { + cfg := defaultConfig() + want := guestmemory.DefaultActiveBallooningConfig() + + parse := func(value string) int64 { + t.Helper() + + var size datasize.ByteSize + if err := size.UnmarshalText([]byte(value)); err != nil { + t.Fatalf("parse default byte size %q: %v", value, err) + } + return int64(size) + } + + if got := parse(cfg.Hypervisor.Memory.ActiveBallooning.ProtectedFloorMinBytes); got != want.ProtectedFloorMinBytes { + t.Fatalf("protected floor default mismatch: got %d want %d", got, want.ProtectedFloorMinBytes) + } + if got := parse(cfg.Hypervisor.Memory.ActiveBallooning.MinAdjustmentBytes); got != want.MinAdjustmentBytes { + t.Fatalf("min adjustment default mismatch: got %d want %d", got, want.MinAdjustmentBytes) + } + if got := parse(cfg.Hypervisor.Memory.ActiveBallooning.PerVmMaxStepBytes); got != want.PerVMMaxStepBytes { + t.Fatalf("per-vm max step default mismatch: got %d want %d", got, want.PerVMMaxStepBytes) + } +} diff --git a/lib/hypervisor/qemu/qmp.go b/lib/hypervisor/qemu/qmp.go index 1be742fb..29f9eb40 100644 --- a/lib/hypervisor/qemu/qmp.go +++ b/lib/hypervisor/qemu/qmp.go @@ 
-119,20 +119,6 @@ func (c *Client) QueryBalloon() (int64, error) { return info.Actual, nil } -// QueryAssignedMemory returns the currently configured RAM size for the guest. -func (c *Client) QueryAssignedMemory() (int64, error) { - info, err := c.raw.QueryMemorySizeSummary() - if err != nil { - return 0, err - } - - total := int64(info.BaseMemory) - if info.PluggedMemory != nil { - total += int64(*info.PluggedMemory) - } - return total, nil -} - // Migrate initiates a migration to the given URI (typically "file:///path"). // This is used for saving VM state to a file for snapshot/standby. func (c *Client) Migrate(uri string) error { From 08b5e11459f20afa90cfc208b07c1fb0b69406d4 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 12:55:58 -0400 Subject: [PATCH 14/22] test: relax concurrent exec timing in ci --- lib/instances/exec_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/instances/exec_test.go b/lib/instances/exec_test.go index 982c9371..06a07e22 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -261,6 +261,12 @@ func TestExecConcurrent(t *testing.T) { // If concurrent, should complete in ~2-4s; if serialized would be ~10s maxExpected := time.Duration(streamDuration+2) * time.Second + if os.Getenv("CI") == "true" { + // GitHub runners can add a bit of scheduling jitter here even when the + // streams are overlapping correctly, but serialized execution is still far + // above this threshold. 
+ maxExpected += time.Second + } require.Less(t, streamElapsed, maxExpected, "streams appear serialized - took %v, expected < %v", streamElapsed, maxExpected) From f4e572814dd29fc8a7e03aa63f4edcde5ae7311d Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 13:16:09 -0400 Subject: [PATCH 15/22] fix: address bugbot balloon findings --- lib/guestmemory/pressure_parse.go | 4 +++- lib/guestmemory/pressure_parse_test.go | 13 ++++++++++++- .../cloudhypervisor/cloudhypervisor.go | 11 ++++++----- .../cloudhypervisor/cloudhypervisor_test.go | 19 +++++++++++++++++++ 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/lib/guestmemory/pressure_parse.go b/lib/guestmemory/pressure_parse.go index c0869896..f2ac4e4f 100644 --- a/lib/guestmemory/pressure_parse.go +++ b/lib/guestmemory/pressure_parse.go @@ -7,6 +7,8 @@ import ( "strings" ) +const linuxPSIStressAvg10Threshold = 0.1 + func parseLinuxMeminfo(data string) (int64, int64, error) { var total, available int64 var sawTotal, sawAvailable bool @@ -62,7 +64,7 @@ func parseLinuxPSI(data string) (bool, error) { if err != nil { return false, fmt.Errorf("parse psi avg10: %w", err) } - return value > 0, nil + return value >= linuxPSIStressAvg10Threshold, nil } } diff --git a/lib/guestmemory/pressure_parse_test.go b/lib/guestmemory/pressure_parse_test.go index e64d9eab..76b777b7 100644 --- a/lib/guestmemory/pressure_parse_test.go +++ b/lib/guestmemory/pressure_parse_test.go @@ -29,7 +29,7 @@ func TestParseLinuxMeminfoRequiresTotalAndAvailable(t *testing.T) { assert.Contains(t, err.Error(), "missing memory totals") } -func TestParseLinuxPSI(t *testing.T) { +func TestParseLinuxPSIAboveThresholdIsStressed(t *testing.T) { t.Parallel() stressed, err := parseLinuxPSI(` @@ -40,6 +40,17 @@ full avg10=0.00 avg60=0.00 avg300=0.00 total=0 assert.True(t, stressed) } +func TestParseLinuxPSIBelowThresholdIsHealthy(t *testing.T) { + t.Parallel() + + stressed, err := parseLinuxPSI(` +some avg10=0.09 avg60=0.01 avg300=0.10 
total=12345 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 +`) + require.NoError(t, err) + assert.False(t, stressed) +} + func TestParseLinuxPSIZeroAvg10IsHealthy(t *testing.T) { t.Parallel() diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go index 88817340..5290a49d 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -251,6 +251,12 @@ func (c *CloudHypervisor) SetTargetGuestMemoryBytes(ctx context.Context, bytes i } func (c *CloudHypervisor) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { + if target, ok := balloonTargetCache.Load(hypervisor.SocketCacheKey(c.socketPath)); ok { + if value, ok := target.(int64); ok { + return value, nil + } + } + info, err := c.client.GetVmInfoWithResponse(ctx) if err != nil { return 0, fmt.Errorf("get vm info for balloon read: %w", err) @@ -261,11 +267,6 @@ func (c *CloudHypervisor) GetTargetGuestMemoryBytes(ctx context.Context) (int64, if info.JSON200.Config.Balloon == nil { return 0, hypervisor.ErrNotSupported } - if target, ok := balloonTargetCache.Load(hypervisor.SocketCacheKey(c.socketPath)); ok { - if value, ok := target.(int64); ok { - return value, nil - } - } assigned := assignedGuestMemoryBytes(info.JSON200) return assigned - info.JSON200.Config.Balloon.Size, nil } diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go index 07dd2a39..cb8d9ae2 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go @@ -3,8 +3,10 @@ package cloudhypervisor import ( "testing" + "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/vmm" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestAssignedGuestMemoryBytes(t *testing.T) { @@ -32,3 +34,20 @@ func TestAssignedGuestMemoryBytes(t *testing.T) { 
assert.Equal(t, int64(768), assignedGuestMemoryBytes(info)) }) } + +func TestGetTargetGuestMemoryBytesUsesWarmCacheBeforeVMInfo(t *testing.T) { + t.Parallel() + + socketPath := t.TempDir() + "/cloud-hypervisor.sock" + cacheKey := hypervisor.SocketCacheKey(socketPath) + balloonTargetCache.Store(cacheKey, int64(384)) + t.Cleanup(func() { + clearBalloonTargetCache(socketPath) + }) + + hv := &CloudHypervisor{socketPath: socketPath} + + target, err := hv.GetTargetGuestMemoryBytes(t.Context()) + require.NoError(t, err) + assert.Equal(t, int64(384), target) +} From 9662fe28d6ea78ce191822ff2b57dfc672c1e5e5 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 20 Mar 2026 13:25:30 -0400 Subject: [PATCH 16/22] fix: remove balloon plan zero sentinel --- lib/guestmemory/controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/guestmemory/controller.go b/lib/guestmemory/controller.go index c2024bc1..b867e607 100644 --- a/lib/guestmemory/controller.go +++ b/lib/guestmemory/controller.go @@ -237,8 +237,8 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua applyCtx, applySpan := c.startChildSpan(ctx, "guestmemory.apply_balloon_targets") for _, candidate := range candidates { - plannedTarget := plannedTargets[candidate.vm.ID] - if plannedTarget == 0 { + plannedTarget, ok := plannedTargets[candidate.vm.ID] + if !ok { plannedTarget = candidate.vm.AssignedMemoryBytes } From ce84f1bafdb68d71a739aa5a65c0b07efefaccd4 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Sat, 21 Mar 2026 00:27:46 +0000 Subject: [PATCH 17/22] fix: use MiB in example configs and remove dead TrimSuffix - Change example YAML byte-size values from MB (decimal SI, 10^6) to MiB (binary, 2^20) so they match the Go default config which uses raw binary byte counts (e.g. 536870912 = 512 MiB). 
- Remove dead strings.TrimSuffix call in parseDarwinVMStatOutput; the " bytes)" suffix is never present after strings.Fields splits on whitespace. Addresses remaining Cursor Bugbot review findings on PR #160. Co-Authored-By: Claude Opus 4.6 --- config.example.darwin.yaml | 6 +++--- config.example.yaml | 6 +++--- lib/guestmemory/pressure_parse.go | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index 886469cc..35d1e8e1 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -45,9 +45,9 @@ hypervisor: pressure_high_watermark_available_percent: 10 pressure_low_watermark_available_percent: 15 protected_floor_percent: 50 - protected_floor_min_bytes: 512MB - min_adjustment_bytes: 64MB - per_vm_max_step_bytes: 256MB + protected_floor_min_bytes: 512MiB + min_adjustment_bytes: 64MiB + per_vm_max_step_bytes: 256MiB per_vm_cooldown: 5s # ============================================================================= diff --git a/config.example.yaml b/config.example.yaml index 32ca583d..7af5fd5a 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -39,9 +39,9 @@ data_dir: /var/lib/hypeman # pressure_high_watermark_available_percent: 10 # pressure_low_watermark_available_percent: 15 # protected_floor_percent: 50 -# protected_floor_min_bytes: 512MB -# min_adjustment_bytes: 64MB -# per_vm_max_step_bytes: 256MB +# protected_floor_min_bytes: 512MiB +# min_adjustment_bytes: 64MiB +# per_vm_max_step_bytes: 256MiB # per_vm_cooldown: 5s # ============================================================================= diff --git a/lib/guestmemory/pressure_parse.go b/lib/guestmemory/pressure_parse.go index f2ac4e4f..f781a3da 100644 --- a/lib/guestmemory/pressure_parse.go +++ b/lib/guestmemory/pressure_parse.go @@ -82,7 +82,7 @@ func parseDarwinVMStatOutput(vmStatOut, memsizeOut string) (int64, int64, error) parts := strings.Fields(line) for i := 0; i < len(parts); i++ { if parts[i] == 
"of" && i+1 < len(parts) { - n, err := strconv.ParseInt(strings.TrimSuffix(parts[i+1], " bytes)"), 10, 64) + n, err := strconv.ParseInt(parts[i+1], 10, 64) if err == nil && n > 0 { pageSize = n } From 89b0320f0379c4b1d8e4d511e6cf38453fb09ac9 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Sat, 21 Mar 2026 00:32:41 +0000 Subject: [PATCH 18/22] fix: use raw byte values in example configs to match Go defaults The c2h5oh/datasize library does not support MiB (binary IEC) suffixes. Use raw byte counts (e.g. 536870912 = 512*1024*1024) to match the Go default config and avoid the ~5% discrepancy from using MB (decimal SI). Co-Authored-By: Claude Opus 4.6 --- config.example.darwin.yaml | 6 +++--- config.example.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index 35d1e8e1..b3ee8813 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -45,9 +45,9 @@ hypervisor: pressure_high_watermark_available_percent: 10 pressure_low_watermark_available_percent: 15 protected_floor_percent: 50 - protected_floor_min_bytes: 512MiB - min_adjustment_bytes: 64MiB - per_vm_max_step_bytes: 256MiB + protected_floor_min_bytes: 536870912 + min_adjustment_bytes: 67108864 + per_vm_max_step_bytes: 268435456 per_vm_cooldown: 5s # ============================================================================= diff --git a/config.example.yaml b/config.example.yaml index 7af5fd5a..bfd30a27 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -39,9 +39,9 @@ data_dir: /var/lib/hypeman # pressure_high_watermark_available_percent: 10 # pressure_low_watermark_available_percent: 15 # protected_floor_percent: 50 -# protected_floor_min_bytes: 512MiB -# min_adjustment_bytes: 64MiB -# per_vm_max_step_bytes: 256MiB +# protected_floor_min_bytes: 536870912 +# min_adjustment_bytes: 67108864 +# per_vm_max_step_bytes: 268435456 # per_vm_cooldown: 5s # 
============================================================================= From 010cec5a61f5bbeb2aa42779d55ef4e032ceeac8 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Mon, 23 Mar 2026 15:07:26 +0000 Subject: [PATCH 19/22] Address review feedback from masnwilliams - Clamp Low watermark to High+1 instead of cascading reset to defaults - Collapse duplicate hold-clearing branches in reconcile - Add lastApplied pruning for VMs no longer in candidate list - Wrap sampler error with context for better caller diagnostics - Fix err shadowing in ResolveProcessPID (socket_pid_linux.go) - Use exact arg match in pidByCmdline to avoid substring false positives - Extract assertActiveBallooningLifecycle to shared test helpers Co-Authored-By: Claude Opus 4.6 --- lib/guestmemory/active_ballooning.go | 5 ++-- lib/guestmemory/controller.go | 26 +++++++++-------- lib/hypervisor/socket_pid_linux.go | 14 ++++------ ...ory_active_ballooning_test_helpers_test.go | 28 +++++++++++++++++++ lib/instances/guestmemory_darwin_test.go | 28 +------------------ lib/instances/guestmemory_linux_test.go | 26 ----------------- 6 files changed, 52 insertions(+), 75 deletions(-) diff --git a/lib/guestmemory/active_ballooning.go b/lib/guestmemory/active_ballooning.go index 3d0dfcff..c9d1c8b5 100644 --- a/lib/guestmemory/active_ballooning.go +++ b/lib/guestmemory/active_ballooning.go @@ -62,9 +62,10 @@ func (c ActiveBallooningConfig) Normalize() ActiveBallooningConfig { c.PressureLowWatermarkAvailablePercent = d.PressureLowWatermarkAvailablePercent } if c.PressureLowWatermarkAvailablePercent <= c.PressureHighWatermarkAvailablePercent { - c.PressureLowWatermarkAvailablePercent = d.PressureLowWatermarkAvailablePercent - if c.PressureLowWatermarkAvailablePercent <= c.PressureHighWatermarkAvailablePercent { + c.PressureLowWatermarkAvailablePercent = c.PressureHighWatermarkAvailablePercent + 1 + if c.PressureLowWatermarkAvailablePercent >= 100 { 
c.PressureHighWatermarkAvailablePercent = d.PressureHighWatermarkAvailablePercent + c.PressureLowWatermarkAvailablePercent = d.PressureLowWatermarkAvailablePercent } } if c.ProtectedFloorPercent <= 0 || c.ProtectedFloorPercent >= 100 { diff --git a/lib/guestmemory/controller.go b/lib/guestmemory/controller.go index b867e607..40cb4e34 100644 --- a/lib/guestmemory/controller.go +++ b/lib/guestmemory/controller.go @@ -80,7 +80,7 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua sampleSpan.SetStatus(codes.Error, err.Error()) sampleSpan.End() c.recordReconcileError(ctx, trigger, start, span, err) - return ManualReclaimResponse{}, err + return ManualReclaimResponse{}, fmt.Errorf("sample host pressure: %w", err) } sampleSpan.SetAttributes( attribute.Int64("host_available_bytes", sample.AvailableBytes), @@ -104,16 +104,7 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua } if req.force && !req.dryRun { - switch { - case req.requestedReclaim <= 0 && req.holdFor <= 0: - if state.manualHold != nil { - logFromContext(ctx, c.log).InfoContext(ctx, - "guest memory manual reclaim hold cleared", - "operation", "manual_reclaim", - ) - } - state.manualHold = nil - case req.requestedReclaim > 0 && req.holdFor <= 0: + if req.holdFor <= 0 { if state.manualHold != nil { logFromContext(ctx, c.log).InfoContext(ctx, "guest memory manual reclaim hold cleared", @@ -121,7 +112,7 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua ) } state.manualHold = nil - default: + } else { state.manualHold = &manualHold{ reclaimBytes: req.requestedReclaim, until: now.Add(req.holdFor), @@ -319,6 +310,17 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua summary.unchangedCount++ } } + // Prune lastApplied entries for VMs no longer in the candidate list. 
+ activeVMs := make(map[string]struct{}, len(candidates)) + for _, candidate := range candidates { + activeVMs[candidate.vm.ID] = struct{}{} + } + for vmID := range state.lastApplied { + if _, ok := activeVMs[vmID]; !ok { + delete(state.lastApplied, vmID) + } + } + applySpan.SetAttributes( attribute.Int("eligible_vms", summary.eligibleVMs), attribute.Int("applied_vms", summary.appliedCount), diff --git a/lib/hypervisor/socket_pid_linux.go b/lib/hypervisor/socket_pid_linux.go index 496a5dbc..7f46ebfa 100644 --- a/lib/hypervisor/socket_pid_linux.go +++ b/lib/hypervisor/socket_pid_linux.go @@ -16,19 +16,15 @@ import ( func ResolveProcessPID(socketPath string) (int, error) { socketRef, err := socketRefForPath(socketPath) if err == nil { - if pid, err := pidBySocketRef(socketRef); err == nil { + if pid, refErr := pidBySocketRef(socketRef); refErr == nil { return pid, nil } } - if pid, err := pidByCmdline(socketPath); err == nil { + if pid, cmdErr := pidByCmdline(socketPath); cmdErr == nil { return pid, nil } - if err != nil { - return 0, err - } - return 0, fmt.Errorf("resolve process pid for socket %s: no owning process found", socketPath) } @@ -86,8 +82,10 @@ func pidByCmdline(socketPath string) (int, error) { if err != nil || len(cmdline) == 0 { continue } - if strings.Contains(string(cmdline), socketPath) { - return pid, nil + for _, arg := range strings.Split(string(cmdline), "\x00") { + if arg == socketPath { + return pid, nil + } } } diff --git a/lib/instances/guestmemory_active_ballooning_test_helpers_test.go b/lib/instances/guestmemory_active_ballooning_test_helpers_test.go index f2916fd1..4abec94f 100644 --- a/lib/instances/guestmemory_active_ballooning_test_helpers_test.go +++ b/lib/instances/guestmemory_active_ballooning_test_helpers_test.go @@ -9,6 +9,7 @@ import ( "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -129,3 +130,30 @@ func 
requireManualReclaimCleared(t *testing.T, ctx context.Context, controller g requireRuntimeGuestMemoryTargetEventually(t, ctx, inst, inst.Size+inst.HotplugSize) return resp } + +func assertActiveBallooningLifecycle(t *testing.T, ctx context.Context, inst *Instance) { + t.Helper() + + assigned := inst.Size + inst.HotplugSize + initialTarget := requireRuntimeGuestMemoryTarget(t, ctx, inst) + assert.Equal(t, assigned, initialTarget, "runtime balloon target should start at full assigned memory") + + controller := newActiveBallooningTestController(t, inst) + + reclaimResp := requireManualReclaimApplied(t, ctx, controller, inst, 1*1024*1024*1024, 5*time.Minute) + require.Len(t, reclaimResp.Actions, 1) + assert.NotNil(t, reclaimResp.HoldUntil) + assert.Equal(t, int64(1*1024*1024*1024), reclaimResp.Actions[0].AppliedReclaimBytes) + assert.Equal(t, assigned-int64(1*1024*1024*1024), reclaimResp.Actions[0].TargetGuestMemoryBytes) + + clearResp := requireManualReclaimCleared(t, ctx, controller, inst) + assert.Nil(t, clearResp.HoldUntil) + + floorResp := requireManualReclaimApplied(t, ctx, controller, inst, assigned, 5*time.Minute) + require.Len(t, floorResp.Actions, 1) + expectedFloor := assigned / 2 + assert.Equal(t, expectedFloor, floorResp.Actions[0].TargetGuestMemoryBytes) + assert.Equal(t, assigned-expectedFloor, floorResp.Actions[0].AppliedReclaimBytes) + + requireManualReclaimCleared(t, ctx, controller, inst) +} diff --git a/lib/instances/guestmemory_darwin_test.go b/lib/instances/guestmemory_darwin_test.go index c944034a..08b2868f 100644 --- a/lib/instances/guestmemory_darwin_test.go +++ b/lib/instances/guestmemory_darwin_test.go @@ -69,7 +69,7 @@ func TestGuestMemoryPolicyVZ(t *testing.T) { require.NoError(t, err) require.NotNil(t, instMeta.HypervisorPID) assertLowIdleVZHostMemoryFootprint(t, *instMeta.HypervisorPID, 192*1024) - assertActiveBallooningLifecycleVZ(t, ctx, inst) + assertActiveBallooningLifecycle(t, ctx, inst) } func 
forceEnableGuestMemoryPolicyForVZTest(mgr *manager) { @@ -177,29 +177,3 @@ func mustReadDarwinRSSBytes(t *testing.T, pid int) int64 { return kb * 1024 } -func assertActiveBallooningLifecycleVZ(t *testing.T, ctx context.Context, inst *Instance) { - t.Helper() - - assigned := inst.Size + inst.HotplugSize - initialTarget := requireRuntimeGuestMemoryTarget(t, ctx, inst) - assert.Equal(t, assigned, initialTarget, "runtime balloon target should start at full assigned memory") - - controller := newActiveBallooningTestController(t, inst) - - reclaimResp := requireManualReclaimApplied(t, ctx, controller, inst, 1*1024*1024*1024, 5*time.Minute) - require.Len(t, reclaimResp.Actions, 1) - assert.NotNil(t, reclaimResp.HoldUntil) - assert.Equal(t, int64(1*1024*1024*1024), reclaimResp.Actions[0].AppliedReclaimBytes) - assert.Equal(t, assigned-int64(1*1024*1024*1024), reclaimResp.Actions[0].TargetGuestMemoryBytes) - - clearResp := requireManualReclaimCleared(t, ctx, controller, inst) - assert.Nil(t, clearResp.HoldUntil) - - floorResp := requireManualReclaimApplied(t, ctx, controller, inst, assigned, 5*time.Minute) - require.Len(t, floorResp.Actions, 1) - expectedFloor := assigned / 2 - assert.Equal(t, expectedFloor, floorResp.Actions[0].TargetGuestMemoryBytes) - assert.Equal(t, assigned-expectedFloor, floorResp.Actions[0].AppliedReclaimBytes) - - requireManualReclaimCleared(t, ctx, controller, inst) -} diff --git a/lib/instances/guestmemory_linux_test.go b/lib/instances/guestmemory_linux_test.go index a696e209..75451fca 100644 --- a/lib/instances/guestmemory_linux_test.go +++ b/lib/instances/guestmemory_linux_test.go @@ -379,29 +379,3 @@ func getFirecrackerVMConfig(socketPath string) (*firecrackerVMConfig, error) { return &cfg, nil } -func assertActiveBallooningLifecycle(t *testing.T, ctx context.Context, inst *Instance) { - t.Helper() - - assigned := inst.Size + inst.HotplugSize - initialTarget := requireRuntimeGuestMemoryTarget(t, ctx, inst) - assert.Equal(t, assigned, 
initialTarget, "runtime balloon target should start at full assigned memory") - - controller := newActiveBallooningTestController(t, inst) - - reclaimResp := requireManualReclaimApplied(t, ctx, controller, inst, 1*1024*1024*1024, 5*time.Minute) - require.Len(t, reclaimResp.Actions, 1) - assert.NotNil(t, reclaimResp.HoldUntil) - assert.Equal(t, int64(1*1024*1024*1024), reclaimResp.Actions[0].AppliedReclaimBytes) - assert.Equal(t, assigned-int64(1*1024*1024*1024), reclaimResp.Actions[0].TargetGuestMemoryBytes) - - clearResp := requireManualReclaimCleared(t, ctx, controller, inst) - assert.Nil(t, clearResp.HoldUntil) - - floorResp := requireManualReclaimApplied(t, ctx, controller, inst, assigned, 5*time.Minute) - require.Len(t, floorResp.Actions, 1) - expectedFloor := assigned / 2 - assert.Equal(t, expectedFloor, floorResp.Actions[0].TargetGuestMemoryBytes) - assert.Equal(t, assigned-expectedFloor, floorResp.Actions[0].AppliedReclaimBytes) - - requireManualReclaimCleared(t, ctx, controller, inst) -} From 29cac923f6aa22303168d885ae87f7c39b107941 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Mon, 23 Mar 2026 15:10:42 +0000 Subject: [PATCH 20/22] Fix gofmt: remove trailing blank lines Co-Authored-By: Claude Opus 4.6 --- lib/instances/guestmemory_darwin_test.go | 1 - lib/instances/guestmemory_linux_test.go | 1 - 2 files changed, 2 deletions(-) diff --git a/lib/instances/guestmemory_darwin_test.go b/lib/instances/guestmemory_darwin_test.go index 08b2868f..a7ef8f0f 100644 --- a/lib/instances/guestmemory_darwin_test.go +++ b/lib/instances/guestmemory_darwin_test.go @@ -176,4 +176,3 @@ func mustReadDarwinRSSBytes(t *testing.T, pid int) int64 { require.NoError(t, err) return kb * 1024 } - diff --git a/lib/instances/guestmemory_linux_test.go b/lib/instances/guestmemory_linux_test.go index 75451fca..c206dc19 100644 --- a/lib/instances/guestmemory_linux_test.go +++ b/lib/instances/guestmemory_linux_test.go @@ -378,4 +378,3 @@ func 
getFirecrackerVMConfig(socketPath string) (*firecrackerVMConfig, error) { } return &cfg, nil } - From 66518592f16b9ae3b47bc2b57a77d298c293b939 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Mon, 23 Mar 2026 15:17:54 +0000 Subject: [PATCH 21/22] Add resource:write scope for POST /resources/memory/reclaim The reclaim endpoint is a mutating POST but was mapped to ResourceRead. Add ResourceWrite scope and use it for the reclaim route, consistent with every other POST endpoint using a write scope. Co-Authored-By: Claude Opus 4.6 --- lib/scopes/scopes.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/scopes/scopes.go b/lib/scopes/scopes.go index 648b8642..be2a7fed 100644 --- a/lib/scopes/scopes.go +++ b/lib/scopes/scopes.go @@ -53,8 +53,9 @@ const ( IngressWrite Scope = "ingress:write" IngressDelete Scope = "ingress:delete" - // Resource/health scopes (read-only) - ResourceRead Scope = "resource:read" + // Resource/health scopes + ResourceRead Scope = "resource:read" + ResourceWrite Scope = "resource:write" // Wildcard scope — grants all permissions All Scope = "*" @@ -69,7 +70,7 @@ var allScopes = []Scope{ BuildRead, BuildWrite, BuildDelete, DeviceRead, DeviceWrite, DeviceDelete, IngressRead, IngressWrite, IngressDelete, - ResourceRead, + ResourceRead, ResourceWrite, } // AllScopes returns the complete list of valid scopes (excluding wildcard). 
@@ -211,7 +212,7 @@ var RouteScopes = map[string]Scope{ // Health & Resources "GET /health": ResourceRead, "GET /resources": ResourceRead, - "POST /resources/memory/reclaim": ResourceRead, + "POST /resources/memory/reclaim": ResourceWrite, // Images "GET /images": ImageRead, From 5f806d8aaff7b3e73679d1914d6aadf7a9205aac Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 23 Mar 2026 11:43:50 -0400 Subject: [PATCH 22/22] fix: address active ballooning review feedback --- lib/guestmemory/controller.go | 12 ++- lib/guestmemory/controller_test.go | 60 +++++++++++++ lib/guestmemory/planner.go | 2 +- lib/hypervisor/balloon_target_cache.go | 90 +++++++++++++++++++ lib/hypervisor/balloon_target_cache_test.go | 63 +++++++++++++ .../cloudhypervisor/cloudhypervisor.go | 12 +-- .../cloudhypervisor/cloudhypervisor_test.go | 4 +- lib/hypervisor/qemu/qemu.go | 21 ++--- lib/hypervisor/qemu/qemu_test.go | 21 +++++ 9 files changed, 258 insertions(+), 27 deletions(-) create mode 100644 lib/hypervisor/balloon_target_cache.go create mode 100644 lib/hypervisor/balloon_target_cache_test.go create mode 100644 lib/hypervisor/qemu/qemu_test.go diff --git a/lib/guestmemory/controller.go b/lib/guestmemory/controller.go index 40cb4e34..f685697c 100644 --- a/lib/guestmemory/controller.go +++ b/lib/guestmemory/controller.go @@ -68,7 +68,13 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua defer span.End() state := &c.reconcileMu - <-state.mu + select { + case <-ctx.Done(): + err := ctx.Err() + c.recordReconcileError(ctx, trigger, start, span, err) + return ManualReclaimResponse{}, err + case <-state.mu: + } defer func() { state.mu <- struct{}{} }() now := time.Now() @@ -234,8 +240,7 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua } appliedTarget := plannedTarget - delta := plannedTarget - candidate.currentTargetGuestBytes - if absInt64(delta) < c.config.MinAdjustmentBytes { + if 
absInt64(appliedTarget-candidate.currentTargetGuestBytes) < c.config.MinAdjustmentBytes { appliedTarget = candidate.currentTargetGuestBytes } if !req.force { @@ -243,6 +248,7 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua appliedTarget = candidate.currentTargetGuestBytes } } + delta := appliedTarget - candidate.currentTargetGuestBytes if appliedTarget != candidate.currentTargetGuestBytes { if delta > 0 { appliedTarget = candidate.currentTargetGuestBytes + minInt64(delta, c.config.PerVMMaxStepBytes) diff --git a/lib/guestmemory/controller_test.go b/lib/guestmemory/controller_test.go index d8d1ed0b..2b20427f 100644 --- a/lib/guestmemory/controller_test.go +++ b/lib/guestmemory/controller_test.go @@ -113,12 +113,72 @@ func TestPressureStateUsesHysteresis(t *testing.T) { cfg.PressureLowWatermarkAvailablePercent = 15 assert.Equal(t, HostPressureStatePressure, nextPressureState(HostPressureStateHealthy, cfg, HostPressureSample{AvailablePercent: 9})) + assert.Equal(t, HostPressureStateHealthy, nextPressureState(HostPressureStateHealthy, cfg, HostPressureSample{AvailablePercent: 10})) assert.Equal(t, HostPressureStateHealthy, nextPressureState(HostPressureStateHealthy, cfg, HostPressureSample{AvailablePercent: 10.9})) assert.Equal(t, HostPressureStatePressure, nextPressureState(HostPressureStatePressure, cfg, HostPressureSample{AvailablePercent: 12})) assert.Equal(t, HostPressureStatePressure, nextPressureState(HostPressureStatePressure, cfg, HostPressureSample{AvailablePercent: 14.9})) assert.Equal(t, HostPressureStateHealthy, nextPressureState(HostPressureStatePressure, cfg, HostPressureSample{AvailablePercent: 16})) } +func TestTriggerReclaimReturnsWhenContextIsCanceledWhileWaitingForLock(t *testing.T) { + const mib = int64(1024 * 1024) + + src := &stubSource{ + vms: []BalloonVM{ + {ID: "a", Name: "a", HypervisorType: hypervisor.TypeCloudHypervisor, SocketPath: "a", AssignedMemoryBytes: 1024 * mib}, + }, + } + + c := 
NewController(Policy{Enabled: true, ReclaimEnabled: true}, ActiveBallooningConfig{ + Enabled: true, + ProtectedFloorPercent: 50, + ProtectedFloorMinBytes: 0, + MinAdjustmentBytes: 1, + PerVMMaxStepBytes: 4096 * mib, + PerVMCooldown: time.Second, + }, src, slog.New(slog.NewTextHandler(io.Discard, nil))).(*controller) + c.sampler = &stubSampler{sample: HostPressureSample{TotalBytes: 1024 * mib, AvailableBytes: 1024 * mib, AvailablePercent: 100}} + + <-c.reconcileMu.mu + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + _, err := c.TriggerReclaim(ctx, ManualReclaimRequest{ReclaimBytes: 128 * mib}) + require.ErrorIs(t, err, context.Canceled) + + c.reconcileMu.mu <- struct{}{} +} + +func TestTriggerReclaimMinAdjustmentKeepsCurrentTarget(t *testing.T) { + const mib = int64(1024 * 1024) + + src := &stubSource{ + vms: []BalloonVM{ + {ID: "a", Name: "a", HypervisorType: hypervisor.TypeCloudHypervisor, SocketPath: "a", AssignedMemoryBytes: 1024 * mib}, + }, + } + hv := &stubHypervisor{target: 1024 * mib, capabilities: hypervisor.Capabilities{SupportsBalloonControl: true}} + + c := NewController(Policy{Enabled: true, ReclaimEnabled: true}, ActiveBallooningConfig{ + Enabled: true, + ProtectedFloorPercent: 50, + ProtectedFloorMinBytes: 0, + MinAdjustmentBytes: 64 * mib, + PerVMMaxStepBytes: 64 * mib, + PerVMCooldown: time.Minute, + }, src, slog.New(slog.NewTextHandler(io.Discard, nil))).(*controller) + c.sampler = &stubSampler{sample: HostPressureSample{TotalBytes: 1024 * mib, AvailableBytes: 1024 * mib, AvailablePercent: 100}} + c.reconcileMu.newClient = func(t hypervisor.Type, socket string) (hypervisor.Hypervisor, error) { + return hv, nil + } + + resp, err := c.TriggerReclaim(context.Background(), ManualReclaimRequest{ReclaimBytes: 32 * mib}) + require.NoError(t, err) + require.Len(t, resp.Actions, 1) + assert.Equal(t, "unchanged", resp.Actions[0].Status) + assert.Equal(t, int64(1024*mib), resp.Actions[0].TargetGuestMemoryBytes) +} + func 
TestTriggerReclaimRespectsProtectedFloor(t *testing.T) { const mib = int64(1024 * 1024) src := &stubSource{ diff --git a/lib/guestmemory/planner.go b/lib/guestmemory/planner.go index 14c8c1c7..55550a09 100644 --- a/lib/guestmemory/planner.go +++ b/lib/guestmemory/planner.go @@ -74,7 +74,7 @@ func nextPressureState(current HostPressureState, cfg ActiveBallooningConfig, sa } return HostPressureStatePressure default: - if availablePercent <= highWatermark || sample.Stressed { + if availablePercent < highWatermark || sample.Stressed { return HostPressureStatePressure } return HostPressureStateHealthy diff --git a/lib/hypervisor/balloon_target_cache.go b/lib/hypervisor/balloon_target_cache.go new file mode 100644 index 00000000..ccfd43a6 --- /dev/null +++ b/lib/hypervisor/balloon_target_cache.go @@ -0,0 +1,90 @@ +package hypervisor + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "sync" +) + +// BalloonTargetCache keeps the last requested guest-memory target warm across +// reconnects and Hypeman restarts. 
+type BalloonTargetCache struct { + targets sync.Map + keys sync.Map +} + +func (c *BalloonTargetCache) Store(socketPath string, bytes int64) { + key := SocketCacheKey(socketPath) + c.targets.Store(key, bytes) + c.keys.Store(socketPath, key) + _ = os.WriteFile(balloonTargetStatePath(socketPath), []byte(fmt.Sprintf("%s\n%d\n", key, bytes)), 0o600) +} + +func (c *BalloonTargetCache) Load(socketPath string) (int64, bool) { + key := SocketCacheKey(socketPath) + if value, ok := c.loadKey(key); ok { + c.keys.Store(socketPath, key) + return value, true + } + + if indexedKey, ok := c.keys.Load(socketPath); ok { + if keyString, ok := indexedKey.(string); ok { + if value, ok := c.loadKey(keyString); ok { + return value, true + } + } + } + + value, ok := loadBalloonTargetState(socketPath, key) + if !ok { + return 0, false + } + c.targets.Store(key, value) + c.keys.Store(socketPath, key) + return value, true +} + +func (c *BalloonTargetCache) Delete(socketPath string) { + if indexedKey, ok := c.keys.LoadAndDelete(socketPath); ok { + if keyString, ok := indexedKey.(string); ok { + c.targets.Delete(keyString) + } + } + c.targets.Delete(SocketCacheKey(socketPath)) + _ = os.Remove(balloonTargetStatePath(socketPath)) +} + +func (c *BalloonTargetCache) loadKey(key string) (int64, bool) { + target, ok := c.targets.Load(key) + if !ok { + return 0, false + } + value, ok := target.(int64) + return value, ok +} + +func balloonTargetStatePath(socketPath string) string { + base := filepath.Base(socketPath) + return filepath.Join(filepath.Dir(socketPath), "."+base+".balloon-target") +} + +func loadBalloonTargetState(socketPath, expectedKey string) (int64, bool) { + data, err := os.ReadFile(balloonTargetStatePath(socketPath)) + if err != nil { + return 0, false + } + + lines := strings.Split(strings.TrimSpace(string(data)), "\n") + if len(lines) != 2 || lines[0] != expectedKey { + return 0, false + } + + value, err := strconv.ParseInt(lines[1], 10, 64) + if err != nil { + return 0, false + } 
+ return value, true +} diff --git a/lib/hypervisor/balloon_target_cache_test.go b/lib/hypervisor/balloon_target_cache_test.go new file mode 100644 index 00000000..7ade8e37 --- /dev/null +++ b/lib/hypervisor/balloon_target_cache_test.go @@ -0,0 +1,63 @@ +package hypervisor + +import ( + "net" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBalloonTargetCachePersistsAcrossProcessRestarts(t *testing.T) { + t.Parallel() + + socketPath := testSocketPath(t) + listener, err := net.Listen("unix", socketPath) + require.NoError(t, err) + defer listener.Close() + + var cache BalloonTargetCache + cache.Store(socketPath, 384) + + var restarted BalloonTargetCache + value, ok := restarted.Load(socketPath) + require.True(t, ok) + assert.Equal(t, int64(384), value) +} + +func TestBalloonTargetCacheDeleteClearsIndexedKeyAfterSocketRemoval(t *testing.T) { + t.Parallel() + + socketPath := testSocketPath(t) + listener, err := net.Listen("unix", socketPath) + require.NoError(t, err) + + var cache BalloonTargetCache + cache.Store(socketPath, 512) + + require.NoError(t, listener.Close()) + if err := os.Remove(socketPath); err != nil && !os.IsNotExist(err) { + require.NoError(t, err) + } + + cache.Delete(socketPath) + _, ok := cache.Load(socketPath) + assert.False(t, ok) +} + +func testSocketPath(t *testing.T) string { + t.Helper() + + file, err := os.CreateTemp("", "btc-*.sock") + require.NoError(t, err) + path := file.Name() + require.NoError(t, file.Close()) + require.NoError(t, os.Remove(path)) + t.Cleanup(func() { + _ = os.Remove(path) + _ = os.Remove(balloonTargetStatePath(path)) + }) + return filepath.Clean(path) +} diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go index 5290a49d..95e0792a 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -5,7 +5,6 @@ package 
cloudhypervisor import ( "context" "fmt" - "sync" "time" "github.com/kernel/hypeman/lib/hypervisor" @@ -18,11 +17,10 @@ type CloudHypervisor struct { socketPath string } -var balloonTargetCache sync.Map +var balloonTargetCache hypervisor.BalloonTargetCache func clearBalloonTargetCache(socketPath string) { balloonTargetCache.Delete(socketPath) - balloonTargetCache.Delete(hypervisor.SocketCacheKey(socketPath)) } // New creates a new Cloud Hypervisor client for an existing VMM socket. @@ -246,15 +244,13 @@ func (c *CloudHypervisor) SetTargetGuestMemoryBytes(ctx context.Context, bytes i if resp.StatusCode() != 204 { return fmt.Errorf("set balloon target failed with status %d", resp.StatusCode()) } - balloonTargetCache.Store(hypervisor.SocketCacheKey(c.socketPath), bytes) + balloonTargetCache.Store(c.socketPath, bytes) return nil } func (c *CloudHypervisor) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { - if target, ok := balloonTargetCache.Load(hypervisor.SocketCacheKey(c.socketPath)); ok { - if value, ok := target.(int64); ok { - return value, nil - } + if target, ok := balloonTargetCache.Load(c.socketPath); ok { + return target, nil } info, err := c.client.GetVmInfoWithResponse(ctx) diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go index cb8d9ae2..2e2596fb 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor_test.go @@ -3,7 +3,6 @@ package cloudhypervisor import ( "testing" - "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/vmm" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -39,8 +38,7 @@ func TestGetTargetGuestMemoryBytesUsesWarmCacheBeforeVMInfo(t *testing.T) { t.Parallel() socketPath := t.TempDir() + "/cloud-hypervisor.sock" - cacheKey := hypervisor.SocketCacheKey(socketPath) - balloonTargetCache.Store(cacheKey, int64(384)) + 
balloonTargetCache.Store(socketPath, int64(384)) t.Cleanup(func() { clearBalloonTargetCache(socketPath) }) diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index cd0aaafa..1da70965 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -5,7 +5,6 @@ import ( "fmt" "os" "path/filepath" - "sync" "time" "github.com/digitalocean/go-qemu/qemu" @@ -18,11 +17,10 @@ type QEMU struct { socketPath string // for self-removal from pool on error } -var balloonTargetCache sync.Map +var balloonTargetCache hypervisor.BalloonTargetCache func clearBalloonTargetCache(socketPath string) { balloonTargetCache.Delete(socketPath) - balloonTargetCache.Delete(hypervisor.SocketCacheKey(socketPath)) } // New returns a QEMU client for the given socket path. @@ -195,21 +193,20 @@ func (q *QEMU) SetTargetGuestMemoryBytes(ctx context.Context, bytes int64) error Remove(q.socketPath) return fmt.Errorf("set balloon target: %w", err) } - balloonTargetCache.Store(hypervisor.SocketCacheKey(q.socketPath), bytes) + balloonTargetCache.Store(q.socketPath, bytes) return nil } func (q *QEMU) GetTargetGuestMemoryBytes(ctx context.Context) (int64, error) { - if target, ok := balloonTargetCache.Load(hypervisor.SocketCacheKey(q.socketPath)); ok { - if value, ok := target.(int64); ok { - return value, nil - } + _ = ctx + + if target, ok := balloonTargetCache.Load(q.socketPath); ok { + return target, nil } - bytes, err := q.client.QueryBalloon() + config, err := loadVMConfig(filepath.Dir(q.socketPath)) if err != nil { - Remove(q.socketPath) - return 0, fmt.Errorf("query balloon target: %w", err) + return 0, fmt.Errorf("read qemu guest memory target: %w", err) } - return bytes, nil + return config.MemoryBytes, nil } diff --git a/lib/hypervisor/qemu/qemu_test.go b/lib/hypervisor/qemu/qemu_test.go new file mode 100644 index 00000000..e010a44a --- /dev/null +++ b/lib/hypervisor/qemu/qemu_test.go @@ -0,0 +1,21 @@ +package qemu + +import ( + "testing" + + 
"github.com/kernel/hypeman/lib/hypervisor" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGetTargetGuestMemoryBytesUsesSavedConfigOnColdStart(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + require.NoError(t, saveVMConfig(dir, hypervisor.VMConfig{MemoryBytes: 768})) + + hv := &QEMU{socketPath: dir + "/qemu.sock"} + target, err := hv.GetTargetGuestMemoryBytes(t.Context()) + require.NoError(t, err) + assert.Equal(t, int64(768), target) +}