Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ Common settings:
| `metrics.listen_address` | Bind address for `/metrics` endpoint | `127.0.0.1` |
| `metrics.port` | Port for `/metrics` endpoint | `9464` |
| `metrics.vm_label_budget` | Warning threshold for observed per-VM metric labels | `200` |
| `metrics.resource_refresh_interval` | Refresh interval for cached resource capacity metrics | `120s` |
| `limits.max_concurrent_builds` | Max concurrent image builds | `1` |
| `limits.max_overlay_size` | Max overlay filesystem size | `100GB` |
| `acme.email` | Email for ACME certificate registration | _(empty)_ |
Expand Down
6 changes: 3 additions & 3 deletions cmd/api/api/registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ func TestRegistryLayerCaching(t *testing.T) {
// that share layers reuses the cached shared layers.
func TestRegistrySharedLayerCaching(t *testing.T) {
t.Parallel()
_, serverHost := setupRegistryTest(t)
svc, serverHost := setupRegistryTest(t)

// Pull alpine image (this will be our base)
t.Log("Pulling alpine:latest...")
Expand Down Expand Up @@ -297,6 +297,7 @@ func TestRegistrySharedLayerCaching(t *testing.T) {
err = remote.Write(dstRef, alpineImg, remote.WithTransport(transport1))
require.NoError(t, err)
t.Logf("First push (alpine): %d blob uploads", firstPushBlobUploads)
waitForImageReady(t, svc, "shared/alpine@"+alpineDigest.String(), 60*time.Second)

// Now pull a different alpine-based image (e.g., alpine:3.18)
// which should share the base layer with alpine:latest
Expand Down Expand Up @@ -339,8 +340,7 @@ func TestRegistrySharedLayerCaching(t *testing.T) {
assert.LessOrEqual(t, secondPushBlobUploads, firstPushBlobUploads,
"Second push should upload same or fewer blobs due to layer sharing")

// Wait for async conversion
time.Sleep(2 * time.Second)
waitForImageReady(t, svc, "shared/alpine318@"+alpine318Digest.String(), 60*time.Second)
}

// TestRegistryTagPush verifies that pushing with a tag reference (not digest)
Expand Down
24 changes: 18 additions & 6 deletions cmd/api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,10 @@ type APIConfig struct {

// MetricsConfig holds metrics endpoint settings.
//
// ListenAddress and Port select where the /metrics endpoint binds.
// VMLabelBudget is the warning threshold for observed per-VM metric labels.
// ResourceRefreshInterval is the refresh interval for cached resource
// capacity metrics; it is kept as a string and parsed with
// time.ParseDuration, and validation requires it to be non-empty and positive.
//
// NOTE(review): this span comes from a diff view, so the three-field
// pre-change layout and the four-field post-change layout both appear below.
type MetricsConfig struct {
ListenAddress string `koanf:"listen_address"`
Port int `koanf:"port"`
VMLabelBudget int `koanf:"vm_label_budget"`
ListenAddress string `koanf:"listen_address"`
Port int `koanf:"port"`
VMLabelBudget int `koanf:"vm_label_budget"`
ResourceRefreshInterval string `koanf:"resource_refresh_interval"`
}

// OtelConfig holds OpenTelemetry settings.
Expand Down Expand Up @@ -294,9 +295,10 @@ func defaultConfig() *Config {
},

Metrics: MetricsConfig{
ListenAddress: "127.0.0.1",
Port: 9464,
VMLabelBudget: 200,
ListenAddress: "127.0.0.1",
Port: 9464,
VMLabelBudget: 200,
ResourceRefreshInterval: "120s",
},

Otel: OtelConfig{
Expand Down Expand Up @@ -462,6 +464,16 @@ func (c *Config) Validate() error {
if c.Metrics.VMLabelBudget <= 0 {
return fmt.Errorf("metrics.vm_label_budget must be positive, got %d", c.Metrics.VMLabelBudget)
}
if strings.TrimSpace(c.Metrics.ResourceRefreshInterval) == "" {
return fmt.Errorf("metrics.resource_refresh_interval must not be empty")
}
interval, err := time.ParseDuration(c.Metrics.ResourceRefreshInterval)
if err != nil {
return fmt.Errorf("metrics.resource_refresh_interval must be a valid duration, got %q: %w", c.Metrics.ResourceRefreshInterval, err)
}
if interval <= 0 {
return fmt.Errorf("metrics.resource_refresh_interval must be positive, got %q", c.Metrics.ResourceRefreshInterval)
}
if c.Otel.MetricExportInterval != "" {
if _, err := time.ParseDuration(c.Otel.MetricExportInterval); err != nil {
return fmt.Errorf("otel.metric_export_interval must be a valid duration, got %q: %w", c.Otel.MetricExportInterval, err)
Expand Down
33 changes: 33 additions & 0 deletions cmd/api/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) {
if cfg.Metrics.VMLabelBudget != 200 {
t.Fatalf("expected default metrics.vm_label_budget to be 200, got %d", cfg.Metrics.VMLabelBudget)
}
if cfg.Metrics.ResourceRefreshInterval != "120s" {
t.Fatalf("expected default metrics.resource_refresh_interval to be 120s, got %q", cfg.Metrics.ResourceRefreshInterval)
}
if cfg.Otel.MetricExportInterval != "60s" {
t.Fatalf("expected default otel.metric_export_interval to be 60s, got %q", cfg.Otel.MetricExportInterval)
}
Expand All @@ -31,6 +34,7 @@ func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
t.Setenv("METRICS__LISTEN_ADDRESS", "0.0.0.0")
t.Setenv("METRICS__PORT", "9999")
t.Setenv("METRICS__VM_LABEL_BUDGET", "350")
t.Setenv("METRICS__RESOURCE_REFRESH_INTERVAL", "30s")
t.Setenv("OTEL__METRIC_EXPORT_INTERVAL", "15s")

tmp := t.TempDir()
Expand All @@ -53,6 +57,9 @@ func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
if cfg.Metrics.VMLabelBudget != 350 {
t.Fatalf("expected metrics.vm_label_budget override, got %d", cfg.Metrics.VMLabelBudget)
}
if cfg.Metrics.ResourceRefreshInterval != "30s" {
t.Fatalf("expected metrics.resource_refresh_interval override, got %q", cfg.Metrics.ResourceRefreshInterval)
}
if cfg.Otel.MetricExportInterval != "15s" {
t.Fatalf("expected otel.metric_export_interval override, got %q", cfg.Otel.MetricExportInterval)
}
Expand Down Expand Up @@ -88,6 +95,32 @@ func TestValidateRejectsInvalidVMLabelBudget(t *testing.T) {
}
}

// TestValidateRejectsInvalidResourceRefreshInterval checks that Validate
// returns an error when metrics.resource_refresh_interval is empty, is not a
// parseable duration, or is zero.
func TestValidateRejectsInvalidResourceRefreshInterval(t *testing.T) {
	rejected := []struct {
		interval string
		failure  string
	}{
		{"", "expected validation error for empty resource refresh interval"},
		{"not-a-duration", "expected validation error for invalid resource refresh interval"},
		{"0s", "expected validation error for non-positive resource refresh interval"},
	}

	for _, tc := range rejected {
		// Each case starts from a fresh default config and overrides only
		// the refresh interval.
		candidate := defaultConfig()
		candidate.Metrics.ResourceRefreshInterval = tc.interval

		if candidate.Validate() == nil {
			t.Fatal(tc.failure)
		}
	}
}

func TestValidateRejectsEmptyActiveBallooningDurations(t *testing.T) {
cfg := defaultConfig()
cfg.Hypervisor.Memory.ActiveBallooning.PollInterval = " "
Expand Down
8 changes: 8 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,14 @@ func run() error {

logger := app.Logger

resourceRefreshInterval, err := time.ParseDuration(app.Config.Metrics.ResourceRefreshInterval)
if err != nil {
return fmt.Errorf("invalid metrics resource refresh interval %q: %w", app.Config.Metrics.ResourceRefreshInterval, err)
}
if err := app.ResourceManager.StartMonitoring(ctx, otelProvider.Meter, resourceRefreshInterval); err != nil {
return fmt.Errorf("start resource monitoring: %w", err)
}

// Log OTel status
if cfg.Otel.Enabled {
logger.Info("OpenTelemetry push enabled", "endpoint", cfg.Otel.Endpoint, "service", cfg.Otel.ServiceName, "metric_export_interval", cfg.Otel.MetricExportInterval)
Expand Down
1 change: 1 addition & 0 deletions config.example.darwin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ limits:
# listen_address: 127.0.0.1
# port: 9464
# vm_label_budget: 200
# resource_refresh_interval: 120s
#
# otel:
# enabled: false
Expand Down
1 change: 1 addition & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ data_dir: /var/lib/hypeman
# listen_address: 127.0.0.1
# port: 9464
# vm_label_budget: 200
# resource_refresh_interval: 120s
#
# otel:
# enabled: false
Expand Down
12 changes: 10 additions & 2 deletions lib/instances/compression_integration_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ type compressionIntegrationHarness struct {
waitHypervisorUp func(ctx context.Context, inst *Instance) error
}

// compressionGuestExecTimeout is the base duration handed to
// integrationTestTimeout when the guest marker helpers in this file derive a
// per-exec context deadline.
const compressionGuestExecTimeout = 20 * time.Second

func TestCloudHypervisorStandbyRestoreCompressionScenarios(t *testing.T) {
t.Parallel()

Expand Down Expand Up @@ -261,14 +263,20 @@ func waitForRunningAndExecReady(t *testing.T, ctx context.Context, mgr *manager,

// writeGuestMarker runs `sh -c` inside the guest via execCommand to printf
// value into path and then sync. It fails the test immediately if the exec
// returns an error or a non-zero exit code (the command output is attached to
// the exit-code failure).
//
// NOTE(review): this span comes from a diff view, so it shows both the
// pre-change call that passes ctx directly and the post-change call that
// passes execCtx, a child context bounded by
// integrationTestTimeout(compressionGuestExecTimeout).
func writeGuestMarker(t *testing.T, ctx context.Context, inst *Instance, path string, value string) {
t.Helper()
output, exitCode, err := execCommand(ctx, inst, "sh", "-c", fmt.Sprintf("printf %q > %s && sync", value, path))
execCtx, cancel := context.WithTimeout(ctx, integrationTestTimeout(compressionGuestExecTimeout))
defer cancel()

output, exitCode, err := execCommand(execCtx, inst, "sh", "-c", fmt.Sprintf("printf %q > %s && sync", value, path))
require.NoError(t, err)
require.Equal(t, 0, exitCode, output)
}

func assertGuestMarker(t *testing.T, ctx context.Context, inst *Instance, path string, expected string) {
t.Helper()
output, exitCode, err := execCommand(ctx, inst, "cat", path)
execCtx, cancel := context.WithTimeout(ctx, integrationTestTimeout(compressionGuestExecTimeout))
defer cancel()

output, exitCode, err := execCommand(execCtx, inst, "cat", path)
require.NoError(t, err)
require.Equal(t, 0, exitCode, output)
assert.Equal(t, expected, output)
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1117,7 +1117,7 @@ func TestEntrypointEnvVars(t *testing.T) {
require.NoError(t, err)
require.NotNil(t, inst)
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 60*time.Second)
require.NoError(t, err)
t.Logf("Instance created: %s", inst.Id)

Expand Down
2 changes: 1 addition & 1 deletion lib/instances/qemu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ func TestQEMUEntrypointEnvVars(t *testing.T) {
require.NoError(t, err)
require.NotNil(t, inst)
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 60*time.Second)
require.NoError(t, err)
assert.Equal(t, hypervisor.TypeQEMU, inst.HypervisorType, "Instance should use QEMU hypervisor")
t.Logf("Instance created: %s", inst.Id)
Expand Down
Loading
Loading