Skip to content

Commit 4d3545d

Browse files
committed
Add resource capacity monitoring metrics
1 parent 23ebbed commit 4d3545d

9 files changed

Lines changed: 591 additions & 12 deletions

File tree

DEVELOPMENT.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ Common settings:
119119
| `metrics.listen_address` | Bind address for `/metrics` endpoint | `127.0.0.1` |
120120
| `metrics.port` | Port for `/metrics` endpoint | `9464` |
121121
| `metrics.vm_label_budget` | Warning threshold for observed per-VM metric labels | `200` |
122+
| `metrics.resource_refresh_interval` | Refresh interval for cached resource capacity metrics | `120s` |
122123
| `limits.max_concurrent_builds` | Max concurrent image builds | `1` |
123124
| `limits.max_overlay_size` | Max overlay filesystem size | `100GB` |
124125
| `acme.email` | Email for ACME certificate registration | _(empty)_ |

cmd/api/config/config.go

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,10 @@ type APIConfig struct {
9898

9999
// MetricsConfig holds metrics endpoint settings.
100100
type MetricsConfig struct {
101-
ListenAddress string `koanf:"listen_address"`
102-
Port int `koanf:"port"`
103-
VMLabelBudget int `koanf:"vm_label_budget"`
101+
ListenAddress string `koanf:"listen_address"`
102+
Port int `koanf:"port"`
103+
VMLabelBudget int `koanf:"vm_label_budget"`
104+
ResourceRefreshInterval string `koanf:"resource_refresh_interval"`
104105
}
105106

106107
// OtelConfig holds OpenTelemetry settings.
@@ -294,9 +295,10 @@ func defaultConfig() *Config {
294295
},
295296

296297
Metrics: MetricsConfig{
297-
ListenAddress: "127.0.0.1",
298-
Port: 9464,
299-
VMLabelBudget: 200,
298+
ListenAddress: "127.0.0.1",
299+
Port: 9464,
300+
VMLabelBudget: 200,
301+
ResourceRefreshInterval: "120s",
300302
},
301303

302304
Otel: OtelConfig{
@@ -462,6 +464,15 @@ func (c *Config) Validate() error {
462464
if c.Metrics.VMLabelBudget <= 0 {
463465
return fmt.Errorf("metrics.vm_label_budget must be positive, got %d", c.Metrics.VMLabelBudget)
464466
}
467+
if c.Metrics.ResourceRefreshInterval != "" {
468+
interval, err := time.ParseDuration(c.Metrics.ResourceRefreshInterval)
469+
if err != nil {
470+
return fmt.Errorf("metrics.resource_refresh_interval must be a valid duration, got %q: %w", c.Metrics.ResourceRefreshInterval, err)
471+
}
472+
if interval <= 0 {
473+
return fmt.Errorf("metrics.resource_refresh_interval must be positive, got %q", c.Metrics.ResourceRefreshInterval)
474+
}
475+
}
465476
if c.Otel.MetricExportInterval != "" {
466477
if _, err := time.ParseDuration(c.Otel.MetricExportInterval); err != nil {
467478
return fmt.Errorf("otel.metric_export_interval must be a valid duration, got %q: %w", c.Otel.MetricExportInterval, err)

cmd/api/config/config_test.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) {
2222
if cfg.Metrics.VMLabelBudget != 200 {
2323
t.Fatalf("expected default metrics.vm_label_budget to be 200, got %d", cfg.Metrics.VMLabelBudget)
2424
}
25+
if cfg.Metrics.ResourceRefreshInterval != "120s" {
26+
t.Fatalf("expected default metrics.resource_refresh_interval to be 120s, got %q", cfg.Metrics.ResourceRefreshInterval)
27+
}
2528
if cfg.Otel.MetricExportInterval != "60s" {
2629
t.Fatalf("expected default otel.metric_export_interval to be 60s, got %q", cfg.Otel.MetricExportInterval)
2730
}
@@ -31,6 +34,7 @@ func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
3134
t.Setenv("METRICS__LISTEN_ADDRESS", "0.0.0.0")
3235
t.Setenv("METRICS__PORT", "9999")
3336
t.Setenv("METRICS__VM_LABEL_BUDGET", "350")
37+
t.Setenv("METRICS__RESOURCE_REFRESH_INTERVAL", "30s")
3438
t.Setenv("OTEL__METRIC_EXPORT_INTERVAL", "15s")
3539

3640
tmp := t.TempDir()
@@ -53,6 +57,9 @@ func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
5357
if cfg.Metrics.VMLabelBudget != 350 {
5458
t.Fatalf("expected metrics.vm_label_budget override, got %d", cfg.Metrics.VMLabelBudget)
5559
}
60+
if cfg.Metrics.ResourceRefreshInterval != "30s" {
61+
t.Fatalf("expected metrics.resource_refresh_interval override, got %q", cfg.Metrics.ResourceRefreshInterval)
62+
}
5663
if cfg.Otel.MetricExportInterval != "15s" {
5764
t.Fatalf("expected otel.metric_export_interval override, got %q", cfg.Otel.MetricExportInterval)
5865
}
@@ -88,6 +95,24 @@ func TestValidateRejectsInvalidVMLabelBudget(t *testing.T) {
8895
}
8996
}
9097

98+
func TestValidateRejectsInvalidResourceRefreshInterval(t *testing.T) {
99+
cfg := defaultConfig()
100+
cfg.Metrics.ResourceRefreshInterval = "not-a-duration"
101+
102+
err := cfg.Validate()
103+
if err == nil {
104+
t.Fatalf("expected validation error for invalid resource refresh interval")
105+
}
106+
107+
cfg = defaultConfig()
108+
cfg.Metrics.ResourceRefreshInterval = "0s"
109+
110+
err = cfg.Validate()
111+
if err == nil {
112+
t.Fatalf("expected validation error for non-positive resource refresh interval")
113+
}
114+
}
115+
91116
func TestValidateRejectsEmptyActiveBallooningDurations(t *testing.T) {
92117
cfg := defaultConfig()
93118
cfg.Hypervisor.Memory.ActiveBallooning.PollInterval = " "

cmd/api/main.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,14 @@ func run() error {
139139

140140
logger := app.Logger
141141

142+
resourceRefreshInterval, err := time.ParseDuration(app.Config.Metrics.ResourceRefreshInterval)
143+
if err != nil {
144+
return fmt.Errorf("invalid metrics resource refresh interval %q: %w", app.Config.Metrics.ResourceRefreshInterval, err)
145+
}
146+
if err := app.ResourceManager.StartMonitoring(ctx, otelProvider.Meter, resourceRefreshInterval); err != nil {
147+
return fmt.Errorf("start resource monitoring: %w", err)
148+
}
149+
142150
// Log OTel status
143151
if cfg.Otel.Enabled {
144152
logger.Info("OpenTelemetry push enabled", "endpoint", cfg.Otel.Endpoint, "service", cfg.Otel.ServiceName, "metric_export_interval", cfg.Otel.MetricExportInterval)

config.example.darwin.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ limits:
111111
# listen_address: 127.0.0.1
112112
# port: 9464
113113
# vm_label_budget: 200
114+
# resource_refresh_interval: 120s
114115
#
115116
# otel:
116117
# enabled: false

config.example.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ data_dir: /var/lib/hypeman
105105
# listen_address: 127.0.0.1
106106
# port: 9464
107107
# vm_label_budget: 200
108+
# resource_refresh_interval: 120s
108109
#
109110
# otel:
110111
# enabled: false

lib/resources/monitoring.go

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
package resources
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"sync"
7+
"time"
8+
9+
"go.opentelemetry.io/otel/attribute"
10+
"go.opentelemetry.io/otel/metric"
11+
)
12+
13+
type monitoringState struct {
14+
mu sync.RWMutex
15+
started bool
16+
metricsRegistered bool
17+
snapshot monitoringSnapshot
18+
hasSnapshot bool
19+
}
20+
21+
type monitoringSnapshot struct {
22+
capturedAt time.Time
23+
status FullResourceStatus
24+
imageStorageCurrent int64
25+
imageStorageMax int64
26+
}
27+
28+
type monitoringMetrics struct {
29+
capacity metric.Int64ObservableGauge
30+
effectiveLimit metric.Int64ObservableGauge
31+
allocated metric.Int64ObservableGauge
32+
oversubRatio metric.Float64ObservableGauge
33+
diskBreakdown metric.Int64ObservableGauge
34+
imageStorage metric.Int64ObservableGauge
35+
gpuSlots metric.Int64ObservableGauge
36+
gpuProfileSlots metric.Int64ObservableGauge
37+
}
38+
39+
func (m *Manager) StartMonitoring(ctx context.Context, meter metric.Meter, refreshInterval time.Duration) error {
40+
if meter == nil {
41+
return nil
42+
}
43+
if refreshInterval <= 0 {
44+
return fmt.Errorf("resource monitoring refresh interval must be positive, got %s", refreshInterval)
45+
}
46+
47+
m.monitoring.mu.Lock()
48+
if m.monitoring.started {
49+
m.monitoring.mu.Unlock()
50+
return nil
51+
}
52+
if !m.monitoring.metricsRegistered {
53+
metrics, err := newMonitoringMetrics(meter, m)
54+
if err != nil {
55+
m.monitoring.mu.Unlock()
56+
return err
57+
}
58+
m.monitoring.metricsRegistered = true
59+
_ = metrics
60+
}
61+
m.monitoring.mu.Unlock()
62+
63+
if err := m.refreshMonitoringSnapshot(ctx); err != nil {
64+
return err
65+
}
66+
67+
m.monitoring.mu.Lock()
68+
if m.monitoring.started {
69+
m.monitoring.mu.Unlock()
70+
return nil
71+
}
72+
m.monitoring.started = true
73+
m.monitoring.mu.Unlock()
74+
75+
go func() {
76+
ticker := time.NewTicker(refreshInterval)
77+
defer ticker.Stop()
78+
79+
for {
80+
select {
81+
case <-ctx.Done():
82+
return
83+
case <-ticker.C:
84+
_ = m.refreshMonitoringSnapshot(ctx)
85+
}
86+
}
87+
}()
88+
89+
return nil
90+
}
91+
92+
func (m *Manager) refreshMonitoringSnapshot(ctx context.Context) error {
93+
status, err := m.GetFullStatus(ctx)
94+
if err != nil {
95+
return err
96+
}
97+
98+
snapshot := monitoringSnapshot{
99+
capturedAt: time.Now(),
100+
status: *status,
101+
}
102+
if status.DiskDetail != nil {
103+
snapshot.imageStorageCurrent = status.DiskDetail.Images + status.DiskDetail.OCICache
104+
}
105+
snapshot.imageStorageMax = m.MaxImageStorageBytes()
106+
107+
m.monitoring.mu.Lock()
108+
m.monitoring.snapshot = snapshot
109+
m.monitoring.hasSnapshot = true
110+
m.monitoring.mu.Unlock()
111+
112+
return nil
113+
}
114+
115+
func (m *Manager) currentMonitoringSnapshot() (monitoringSnapshot, bool) {
116+
m.monitoring.mu.RLock()
117+
defer m.monitoring.mu.RUnlock()
118+
119+
if !m.monitoring.hasSnapshot {
120+
return monitoringSnapshot{}, false
121+
}
122+
123+
return m.monitoring.snapshot, true
124+
}
125+
126+
func newMonitoringMetrics(meter metric.Meter, mgr *Manager) (*monitoringMetrics, error) {
127+
capacity, err := meter.Int64ObservableGauge(
128+
"hypeman_resources_capacity",
129+
metric.WithDescription("Raw host capacity by resource type"),
130+
)
131+
if err != nil {
132+
return nil, err
133+
}
134+
135+
effectiveLimit, err := meter.Int64ObservableGauge(
136+
"hypeman_resources_effective_limit",
137+
metric.WithDescription("Effective allocatable limit by resource type after oversubscription"),
138+
)
139+
if err != nil {
140+
return nil, err
141+
}
142+
143+
allocated, err := meter.Int64ObservableGauge(
144+
"hypeman_resources_allocated",
145+
metric.WithDescription("Current allocated amount by resource type"),
146+
)
147+
if err != nil {
148+
return nil, err
149+
}
150+
151+
oversubRatio, err := meter.Float64ObservableGauge(
152+
"hypeman_resources_oversub_ratio",
153+
metric.WithDescription("Oversubscription ratio by resource type"),
154+
)
155+
if err != nil {
156+
return nil, err
157+
}
158+
159+
diskBreakdown, err := meter.Int64ObservableGauge(
160+
"hypeman_resources_disk_breakdown_bytes",
161+
metric.WithDescription("Disk usage broken down by component"),
162+
metric.WithUnit("By"),
163+
)
164+
if err != nil {
165+
return nil, err
166+
}
167+
168+
imageStorage, err := meter.Int64ObservableGauge(
169+
"hypeman_resources_image_storage_bytes",
170+
metric.WithDescription("Current and maximum image storage bytes"),
171+
metric.WithUnit("By"),
172+
)
173+
if err != nil {
174+
return nil, err
175+
}
176+
177+
gpuSlots, err := meter.Int64ObservableGauge(
178+
"hypeman_resources_gpu_slots",
179+
metric.WithDescription("Total and used GPU slots"),
180+
)
181+
if err != nil {
182+
return nil, err
183+
}
184+
185+
gpuProfileSlots, err := meter.Int64ObservableGauge(
186+
"hypeman_resources_gpu_profile_slots",
187+
metric.WithDescription("Available GPU slots by vGPU profile"),
188+
)
189+
if err != nil {
190+
return nil, err
191+
}
192+
193+
if _, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
194+
snapshot, ok := mgr.currentMonitoringSnapshot()
195+
if !ok {
196+
return nil
197+
}
198+
199+
resourceStatuses := []ResourceStatus{
200+
snapshot.status.CPU,
201+
snapshot.status.Memory,
202+
snapshot.status.Disk,
203+
snapshot.status.Network,
204+
snapshot.status.DiskIO,
205+
}
206+
for _, status := range resourceStatuses {
207+
attrs := metric.WithAttributes(attribute.String("resource", string(status.Type)))
208+
o.ObserveInt64(capacity, status.Capacity, attrs)
209+
o.ObserveInt64(effectiveLimit, status.EffectiveLimit, attrs)
210+
o.ObserveInt64(allocated, status.Allocated, attrs)
211+
o.ObserveFloat64(oversubRatio, status.OversubRatio, attrs)
212+
}
213+
214+
if snapshot.status.DiskDetail != nil {
215+
o.ObserveInt64(diskBreakdown, snapshot.status.DiskDetail.Images, metric.WithAttributes(attribute.String("component", "images")))
216+
o.ObserveInt64(diskBreakdown, snapshot.status.DiskDetail.OCICache, metric.WithAttributes(attribute.String("component", "oci_cache")))
217+
o.ObserveInt64(diskBreakdown, snapshot.status.DiskDetail.Volumes, metric.WithAttributes(attribute.String("component", "volumes")))
218+
o.ObserveInt64(diskBreakdown, snapshot.status.DiskDetail.Overlays, metric.WithAttributes(attribute.String("component", "overlays")))
219+
o.ObserveInt64(imageStorage, snapshot.imageStorageCurrent, metric.WithAttributes(attribute.String("kind", "current")))
220+
}
221+
o.ObserveInt64(imageStorage, snapshot.imageStorageMax, metric.WithAttributes(attribute.String("kind", "max")))
222+
223+
if snapshot.status.GPU != nil {
224+
o.ObserveInt64(gpuSlots, int64(snapshot.status.GPU.UsedSlots), metric.WithAttributes(attribute.String("kind", "used")))
225+
o.ObserveInt64(gpuSlots, int64(snapshot.status.GPU.TotalSlots), metric.WithAttributes(attribute.String("kind", "total")))
226+
for _, profile := range snapshot.status.GPU.Profiles {
227+
o.ObserveInt64(gpuProfileSlots, int64(profile.Available),
228+
metric.WithAttributes(
229+
attribute.String("profile", profile.Name),
230+
attribute.String("kind", "available"),
231+
),
232+
)
233+
}
234+
}
235+
236+
return nil
237+
}, capacity, effectiveLimit, allocated, oversubRatio, diskBreakdown, imageStorage, gpuSlots, gpuProfileSlots); err != nil {
238+
return nil, err
239+
}
240+
241+
return &monitoringMetrics{
242+
capacity: capacity,
243+
effectiveLimit: effectiveLimit,
244+
allocated: allocated,
245+
oversubRatio: oversubRatio,
246+
diskBreakdown: diskBreakdown,
247+
imageStorage: imageStorage,
248+
gpuSlots: gpuSlots,
249+
gpuProfileSlots: gpuProfileSlots,
250+
}, nil
251+
}

0 commit comments

Comments
 (0)