Skip to content

Commit 3af9562

Browse files
committed
Reduce tracing volume for read-heavy paths
1 parent 4a24278 commit 3af9562

13 files changed

Lines changed: 553 additions & 69 deletions

File tree

cmd/api/config/config.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,13 @@ type MetricsConfig struct {
106106

107107
// OtelConfig holds OpenTelemetry settings.
108108
type OtelConfig struct {
109-
Enabled bool `koanf:"enabled"`
110-
Endpoint string `koanf:"endpoint"`
111-
ServiceName string `koanf:"service_name"`
112-
ServiceInstanceID string `koanf:"service_instance_id"`
113-
Insecure bool `koanf:"insecure"`
114-
MetricExportInterval string `koanf:"metric_export_interval"`
109+
Enabled bool `koanf:"enabled"`
110+
Endpoint string `koanf:"endpoint"`
111+
ServiceName string `koanf:"service_name"`
112+
ServiceInstanceID string `koanf:"service_instance_id"`
113+
Insecure bool `koanf:"insecure"`
114+
MetricExportInterval string `koanf:"metric_export_interval"`
115+
SuccessfulGetSampleRatio float64 `koanf:"successful_get_sample_ratio"`
115116
}
116117

117118
// LoggingConfig holds log rotation and level settings.
@@ -302,12 +303,13 @@ func defaultConfig() *Config {
302303
},
303304

304305
Otel: OtelConfig{
305-
Enabled: false,
306-
Endpoint: "127.0.0.1:4317",
307-
ServiceName: "hypeman",
308-
ServiceInstanceID: getHostname(),
309-
Insecure: true,
310-
MetricExportInterval: "60s",
306+
Enabled: false,
307+
Endpoint: "127.0.0.1:4317",
308+
ServiceName: "hypeman",
309+
ServiceInstanceID: getHostname(),
310+
Insecure: true,
311+
MetricExportInterval: "60s",
312+
SuccessfulGetSampleRatio: 0.1,
311313
},
312314

313315
Logging: LoggingConfig{
@@ -479,6 +481,9 @@ func (c *Config) Validate() error {
479481
return fmt.Errorf("otel.metric_export_interval must be a valid duration, got %q: %w", c.Otel.MetricExportInterval, err)
480482
}
481483
}
484+
if c.Otel.SuccessfulGetSampleRatio < 0 || c.Otel.SuccessfulGetSampleRatio > 1 {
485+
return fmt.Errorf("otel.successful_get_sample_ratio must be between 0 and 1, got %v", c.Otel.SuccessfulGetSampleRatio)
486+
}
482487
if c.Oversubscription.CPU <= 0 {
483488
return fmt.Errorf("oversubscription.cpu must be positive, got %v", c.Oversubscription.CPU)
484489
}

cmd/api/config/config_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) {
2828
if cfg.Otel.MetricExportInterval != "60s" {
2929
t.Fatalf("expected default otel.metric_export_interval to be 60s, got %q", cfg.Otel.MetricExportInterval)
3030
}
31+
if cfg.Otel.SuccessfulGetSampleRatio != 0.1 {
32+
t.Fatalf("expected default otel.successful_get_sample_ratio to be 0.1, got %v", cfg.Otel.SuccessfulGetSampleRatio)
33+
}
3134
}
3235

3336
func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
@@ -36,6 +39,7 @@ func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
3639
t.Setenv("METRICS__VM_LABEL_BUDGET", "350")
3740
t.Setenv("METRICS__RESOURCE_REFRESH_INTERVAL", "30s")
3841
t.Setenv("OTEL__METRIC_EXPORT_INTERVAL", "15s")
42+
t.Setenv("OTEL__SUCCESSFUL_GET_SAMPLE_RATIO", "0.25")
3943

4044
tmp := t.TempDir()
4145
cfgPath := filepath.Join(tmp, "config.yaml")
@@ -63,6 +67,9 @@ func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
6367
if cfg.Otel.MetricExportInterval != "15s" {
6468
t.Fatalf("expected otel.metric_export_interval override, got %q", cfg.Otel.MetricExportInterval)
6569
}
70+
if cfg.Otel.SuccessfulGetSampleRatio != 0.25 {
71+
t.Fatalf("expected otel.successful_get_sample_ratio override, got %v", cfg.Otel.SuccessfulGetSampleRatio)
72+
}
6673
}
6774

6875
func TestValidateRejectsInvalidMetricsPort(t *testing.T) {
@@ -85,6 +92,16 @@ func TestValidateRejectsInvalidMetricExportInterval(t *testing.T) {
8592
}
8693
}
8794

95+
func TestValidateRejectsInvalidSuccessfulGetSampleRatio(t *testing.T) {
96+
cfg := defaultConfig()
97+
cfg.Otel.SuccessfulGetSampleRatio = 1.1
98+
99+
err := cfg.Validate()
100+
if err == nil {
101+
t.Fatalf("expected validation error for invalid successful get sample ratio")
102+
}
103+
}
104+
88105
func TestValidateRejectsInvalidVMLabelBudget(t *testing.T) {
89106
cfg := defaultConfig()
90107
cfg.Metrics.VMLabelBudget = 0

cmd/api/main.go

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,15 @@ func run() error {
7878

7979
// Initialize OpenTelemetry (before wire initialization)
8080
otelCfg := otel.Config{
81-
Enabled: cfg.Otel.Enabled,
82-
Endpoint: cfg.Otel.Endpoint,
83-
ServiceName: cfg.Otel.ServiceName,
84-
ServiceInstanceID: cfg.Otel.ServiceInstanceID,
85-
Insecure: cfg.Otel.Insecure,
86-
MetricExportInterval: cfg.Otel.MetricExportInterval,
87-
Version: cfg.Version,
88-
Env: cfg.Env,
81+
Enabled: cfg.Otel.Enabled,
82+
Endpoint: cfg.Otel.Endpoint,
83+
ServiceName: cfg.Otel.ServiceName,
84+
ServiceInstanceID: cfg.Otel.ServiceInstanceID,
85+
Insecure: cfg.Otel.Insecure,
86+
MetricExportInterval: cfg.Otel.MetricExportInterval,
87+
SuccessfulGetSampleRatio: cfg.Otel.SuccessfulGetSampleRatio,
88+
Version: cfg.Version,
89+
Env: cfg.Env,
8990
}
9091

9192
otelProvider, otelShutdown, err := otel.Init(context.Background(), otelCfg)
@@ -149,7 +150,7 @@ func run() error {
149150

150151
// Log OTel status
151152
if cfg.Otel.Enabled {
152-
logger.Info("OpenTelemetry push enabled", "endpoint", cfg.Otel.Endpoint, "service", cfg.Otel.ServiceName, "metric_export_interval", cfg.Otel.MetricExportInterval)
153+
logger.Info("OpenTelemetry push enabled", "endpoint", cfg.Otel.Endpoint, "service", cfg.Otel.ServiceName, "metric_export_interval", cfg.Otel.MetricExportInterval, "successful_get_sample_ratio", cfg.Otel.SuccessfulGetSampleRatio)
153154
} else {
154155
logger.Info("OpenTelemetry push disabled; Prometheus pull metrics remain available")
155156
}
@@ -323,6 +324,7 @@ func run() error {
323324
r.Use(middleware.Recoverer)
324325
if cfg.Otel.Enabled {
325326
r.Use(otelchi.Middleware(cfg.Otel.ServiceName, otelchi.WithChiRoutes(r)))
327+
r.Use(otel.NewSuccessfulGETErrorTraceMiddleware(cfg.Otel.ServiceName))
326328
}
327329
r.Use(mw.InjectLogger(logger))
328330
r.Use(mw.AccessLogger(accessLogger))
@@ -345,6 +347,7 @@ func run() error {
345347
// OpenTelemetry tracing middleware FIRST (creates span context)
346348
if cfg.Otel.Enabled {
347349
r.Use(otelchi.Middleware(cfg.Otel.ServiceName, otelchi.WithChiRoutes(r)))
350+
r.Use(otel.NewSuccessfulGETErrorTraceMiddleware(cfg.Otel.ServiceName))
348351
}
349352

350353
// Inject logger into request context for handlers to use

lib/hypervisor/firecracker/firecracker.go

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -283,24 +283,38 @@ func (f *Firecracker) do(ctx context.Context, method, path string, reqBody any,
283283
attribute.String("http.method", method),
284284
attribute.String("http.route", path),
285285
)
286-
ctx, span := otel.Tracer("hypeman/hypervisor/firecracker").Start(ctx, "hypervisor.http "+method+" "+path, trace.WithAttributes(attrs...))
287-
defer span.End()
286+
tracer := otel.Tracer("hypeman/hypervisor/firecracker")
287+
spanName := "hypervisor.http " + method + " " + path
288+
shouldTrace := hypervisor.ShouldTraceHypervisorHTTPSpan(method, path)
289+
290+
var span trace.Span
291+
if shouldTrace {
292+
var spanCtx context.Context
293+
spanCtx, span = tracer.Start(ctx, spanName, trace.WithAttributes(attrs...))
294+
ctx = spanCtx
295+
defer span.End()
296+
}
297+
298+
recordError := func(err error) {
299+
if shouldTrace {
300+
span.RecordError(err)
301+
span.SetStatus(codes.Error, err.Error())
302+
}
303+
}
288304

289305
var bodyReader io.Reader
290306
if reqBody != nil {
291307
data, err := json.Marshal(reqBody)
292308
if err != nil {
293-
span.RecordError(err)
294-
span.SetStatus(codes.Error, err.Error())
309+
recordError(err)
295310
return nil, fmt.Errorf("marshal request body: %w", err)
296311
}
297312
bodyReader = bytes.NewReader(data)
298313
}
299314

300315
req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
301316
if err != nil {
302-
span.RecordError(err)
303-
span.SetStatus(codes.Error, err.Error())
317+
recordError(err)
304318
return nil, fmt.Errorf("create request: %w", err)
305319
}
306320
req.Header.Set("Accept", "application/json")
@@ -310,35 +324,43 @@ func (f *Firecracker) do(ctx context.Context, method, path string, reqBody any,
310324

311325
resp, err := f.client.Do(req)
312326
if err != nil {
313-
span.RecordError(err)
314-
span.SetStatus(codes.Error, err.Error())
327+
recordError(err)
315328
return nil, fmt.Errorf("request %s %s: %w", method, path, err)
316329
}
317330
defer resp.Body.Close()
318-
span.SetAttributes(attribute.Int("http.status_code", resp.StatusCode))
331+
if shouldTrace {
332+
span.SetAttributes(attribute.Int("http.status_code", resp.StatusCode))
333+
}
319334

320335
data, err := io.ReadAll(resp.Body)
321336
if err != nil {
322-
span.RecordError(err)
323-
span.SetStatus(codes.Error, err.Error())
337+
recordError(err)
324338
return nil, fmt.Errorf("read response body: %w", err)
325339
}
326340

327341
for _, status := range expectedStatus {
328342
if resp.StatusCode == status {
329-
span.SetStatus(codes.Ok, "")
343+
if shouldTrace {
344+
span.SetStatus(codes.Ok, "")
345+
}
330346
return data, nil
331347
}
332348
}
333349

334350
if len(data) > 0 {
335351
var apiErr apiError
336352
if err := json.Unmarshal(data, &apiErr); err == nil && apiErr.FaultMessage != "" {
337-
span.SetStatus(codes.Error, apiErr.FaultMessage)
353+
if shouldTrace {
354+
span.SetAttributes(attribute.Int("http.status_code", resp.StatusCode))
355+
span.SetStatus(codes.Error, apiErr.FaultMessage)
356+
}
338357
return nil, fmt.Errorf("status %d: %s", resp.StatusCode, apiErr.FaultMessage)
339358
}
340359
}
341-
span.SetStatus(codes.Error, resp.Status)
360+
if shouldTrace {
361+
span.SetAttributes(attribute.Int("http.status_code", resp.StatusCode))
362+
span.SetStatus(codes.Error, resp.Status)
363+
}
342364
return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(data))
343365
}
344366

lib/hypervisor/tracing.go

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package hypervisor
22

33
import (
44
"context"
5+
"net/http"
56
"time"
67

78
"github.com/kernel/hypeman/lib/paths"
@@ -52,6 +53,18 @@ func TraceAttributesFromContext(ctx context.Context) []attribute.KeyValue {
5253
return out
5354
}
5455

56+
func ShouldTraceHypervisorHTTPSpan(method, path string) bool {
57+
if method != http.MethodGet {
58+
return true
59+
}
60+
switch path {
61+
case "/", "/api/v1/vm.info":
62+
return false
63+
default:
64+
return true
65+
}
66+
}
67+
5568
func WrapHypervisor(hvType Type, hv Hypervisor) Hypervisor {
5669
if hv == nil {
5770
return nil
@@ -115,6 +128,21 @@ func FinishTraceSpan(span trace.Span, err error) {
115128
finishTraceSpan(span, err)
116129
}
117130

131+
func StartDetachedTraceSpan(ctx context.Context, tracer trace.Tracer, name string, attrs ...attribute.KeyValue) (context.Context, trace.Span) {
132+
allAttrs := TraceAttributesFromContext(ctx)
133+
if len(attrs) > 0 {
134+
allAttrs = append(allAttrs, attrs...)
135+
}
136+
137+
spanOpts := []trace.SpanStartOption{
138+
trace.WithNewRoot(),
139+
}
140+
if len(allAttrs) > 0 {
141+
spanOpts = append(spanOpts, trace.WithAttributes(allAttrs...))
142+
}
143+
return tracer.Start(context.Background(), name, spanOpts...)
144+
}
145+
118146
func startTraceSpan(ctx context.Context, tracer trace.Tracer, name string, attrs ...attribute.KeyValue) (context.Context, trace.Span) {
119147
allAttrs := TraceAttributesFromContext(ctx)
120148
if len(attrs) > 0 {
@@ -164,9 +192,17 @@ func (h *tracingHypervisor) Shutdown(ctx context.Context) (err error) {
164192
}
165193

166194
func (h *tracingHypervisor) GetVMInfo(ctx context.Context) (_ *VMInfo, err error) {
167-
ctx, span := startTraceSpan(ctx, h.tracer, "hypervisor.get_vm_info", h.spanAttrs(attribute.String("operation", "get_vm_info"))...)
168-
defer func() { finishTraceSpan(span, err) }()
169-
return h.next.GetVMInfo(ctx)
195+
info, err := h.next.GetVMInfo(ctx)
196+
if err != nil {
197+
_, span := StartDetachedTraceSpan(ctx, h.tracer, "hypervisor.get_vm_info",
198+
h.spanAttrs(
199+
attribute.String("operation", "get_vm_info"),
200+
attribute.String("sampled_from", "error_only"),
201+
)...,
202+
)
203+
finishTraceSpan(span, err)
204+
}
205+
return info, err
170206
}
171207

172208
func (h *tracingHypervisor) Pause(ctx context.Context) (err error) {

lib/hypervisor/tracing_test.go

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package hypervisor
22

33
import (
44
"context"
5+
"errors"
56
"testing"
67
"time"
78

@@ -16,6 +17,7 @@ import (
1617
)
1718

1819
type fakeHypervisor struct{}
20+
type fakeHypervisorGetVMInfoError struct{}
1921

2022
func (fakeHypervisor) DeleteVM(context.Context) error { return nil }
2123
func (fakeHypervisor) Shutdown(context.Context) error { return nil }
@@ -33,7 +35,28 @@ func (fakeHypervisor) SetTargetGuestMemoryBytes(context.Context, int64) error {
3335
func (fakeHypervisor) GetTargetGuestMemoryBytes(context.Context) (int64, error) {
3436
return 0, nil
3537
}
36-
func (fakeHypervisor) Capabilities() Capabilities { return Capabilities{} }
38+
func (fakeHypervisor) Capabilities() Capabilities { return Capabilities{} }
39+
func (fakeHypervisorGetVMInfoError) DeleteVM(context.Context) error { return nil }
40+
func (fakeHypervisorGetVMInfoError) Shutdown(context.Context) error { return nil }
41+
func (fakeHypervisorGetVMInfoError) GetVMInfo(context.Context) (*VMInfo, error) {
42+
return nil, errors.New("vm info failed")
43+
}
44+
func (fakeHypervisorGetVMInfoError) Pause(context.Context) error { return nil }
45+
func (fakeHypervisorGetVMInfoError) Resume(context.Context) error { return nil }
46+
func (fakeHypervisorGetVMInfoError) Snapshot(context.Context, string) error { return nil }
47+
func (fakeHypervisorGetVMInfoError) ResizeMemory(context.Context, int64) error {
48+
return nil
49+
}
50+
func (fakeHypervisorGetVMInfoError) ResizeMemoryAndWait(context.Context, int64, time.Duration) error {
51+
return nil
52+
}
53+
func (fakeHypervisorGetVMInfoError) SetTargetGuestMemoryBytes(context.Context, int64) error {
54+
return nil
55+
}
56+
func (fakeHypervisorGetVMInfoError) GetTargetGuestMemoryBytes(context.Context) (int64, error) {
57+
return 0, nil
58+
}
59+
func (fakeHypervisorGetVMInfoError) Capabilities() Capabilities { return Capabilities{} }
3760

3861
type fakeStarter struct {
3962
returned Hypervisor
@@ -100,6 +123,37 @@ func TestWrapVMStarterWrapsReturnedHypervisor(t *testing.T) {
100123
assert.Equal(t, string(TypeCloudHypervisor), attrs["hypervisor"])
101124
}
102125

126+
func TestWrapHypervisorSkipsGetVMInfoTraceByDefault(t *testing.T) {
127+
recorder, _ := newTestTracerProvider(t)
128+
129+
hv := WrapHypervisor(TypeQEMU, fakeHypervisor{})
130+
_, err := hv.GetVMInfo(context.Background())
131+
require.NoError(t, err)
132+
133+
for _, span := range recorder.Ended() {
134+
if span.Name() == "hypervisor.get_vm_info" {
135+
t.Fatalf("expected get vm info to be skipped by default")
136+
}
137+
}
138+
}
139+
140+
func TestWrapHypervisorCreatesDetachedErrorSpanForGetVMInfoFailures(t *testing.T) {
141+
recorder, _ := newTestTracerProvider(t)
142+
143+
ctx := WithTraceAttributes(context.Background(), attribute.String("instance_id", "inst_999"))
144+
hv := WrapHypervisor(TypeQEMU, fakeHypervisorGetVMInfoError{})
145+
_, err := hv.GetVMInfo(ctx)
146+
require.Error(t, err)
147+
148+
span := findSpanByName(t, recorder.Ended(), "hypervisor.get_vm_info")
149+
require.False(t, span.Parent().IsValid())
150+
151+
attrs := attrsToMap(span.Attributes())
152+
assert.Equal(t, "inst_999", attrs["instance_id"])
153+
assert.Equal(t, string(TypeQEMU), attrs["hypervisor"])
154+
assert.Equal(t, "error_only", attrs["sampled_from"])
155+
}
156+
103157
func newTestTracerProvider(t *testing.T) (*tracetest.SpanRecorder, *sdktrace.TracerProvider) {
104158
t.Helper()
105159

0 commit comments

Comments
 (0)