Skip to content

Commit e40c5c6

Browse files
sjmiller609claude
andauthored
Add scheduled instance snapshots with retention cleanup (#139)
* feat: add scheduled snapshots with retention cleanup * fix: aggregate scheduled snapshot runner errors * perf: scan schedule index and reduce lock contention * fix: validate scheduled snapshot name prefix length * refactor: move scheduled snapshot domain logic to lib package * fix: ignore missing snapshots during retention cleanup * chore: remove dead wrapper and align retention schema * fix: tighten schedule run due-check under write lock * feat: auto-select scheduled snapshot kind and retain post-delete schedules * fix: return 404 when scheduling a missing instance * fix: retire converged deleted-instance schedules * chore: remove test-only snapshot schedule aliases * fix: preserve schedule runtime history on updates * fix: tighten schedule retention request handling * fix: honor cancellation in schedule runner loop * fix: preserve explicit zero retention settings * fix: evaluate schedule timing per-instance * fix: make schedule interval step arithmetic explicit * simplify snapshot scheduling: extract error helper, inline thin wrappers, remove dead code - Extract failScheduleRun helper to deduplicate the "record error + save schedule" pattern repeated 4 times in runSnapshotScheduleForInstanceLocked - Inline countScheduledSnapshots (single-use len() wrapper) at call site - Inline validateSetSnapshotScheduleRequest (single-use wrapper) at call site - Merge snapshot_schedule_types.go contents into snapshot_schedule.go - Remove dead truncation/fallback code in BuildSnapshotName that could never execute because ValidateSetRequest already rejects invalid prefixes upstream Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * deslop: remove redundant nil guards and unnecessary make - Remove redundant nil-check-before-assign for pointer fields in snapshotScheduleToOAPI (assigning nil to a nil field is a no-op) - Use var declaration instead of make([]error, 0) for error accumulator that is only appended to (not serialized to JSON) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * add snapshot-schedule routes to scope mappings The scopes package (added on main) requires all routes to have scope mappings. Map the three snapshot-schedule endpoints to existing snapshot scopes: read, write, and delete. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix gofmt alignment in scope mappings Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Harden snapshot schedule persistence and timing * Fix manager formatting for CI * Stabilize scheduled snapshot cleanup tests * Add snapshot schedule routes to Stainless config --------- Co-authored-by: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 23ebbed commit e40c5c6

19 files changed

Lines changed: 3023 additions & 235 deletions

cmd/api/api/snapshots.go

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package api
33
import (
44
"context"
55
"errors"
6+
"time"
67

78
"github.com/kernel/hypeman/lib/hypervisor"
89
"github.com/kernel/hypeman/lib/instances"
@@ -203,6 +204,124 @@ func (s *ApiService) ForkSnapshot(ctx context.Context, request oapi.ForkSnapshot
203204
return oapi.ForkSnapshot201JSONResponse(instanceToOAPI(*result)), nil
204205
}
205206

207+
// GetInstanceSnapshotSchedule gets a snapshot schedule for the resolved instance.
208+
func (s *ApiService) GetInstanceSnapshotSchedule(ctx context.Context, request oapi.GetInstanceSnapshotScheduleRequestObject) (oapi.GetInstanceSnapshotScheduleResponseObject, error) {
209+
inst := mw.GetResolvedInstance[instances.Instance](ctx)
210+
if inst == nil {
211+
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
212+
}
213+
214+
scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
215+
if !ok {
216+
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
217+
}
218+
219+
schedule, err := scheduleManager.GetSnapshotSchedule(ctx, inst.Id)
220+
if err != nil {
221+
log := logger.FromContext(ctx)
222+
switch {
223+
case errors.Is(err, instances.ErrSnapshotScheduleNotFound):
224+
return oapi.GetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "snapshot schedule not found"}, nil
225+
case errors.Is(err, instances.ErrNotFound):
226+
return oapi.GetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
227+
default:
228+
log.ErrorContext(ctx, "failed to get snapshot schedule", "error", err)
229+
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to get snapshot schedule"}, nil
230+
}
231+
}
232+
233+
return oapi.GetInstanceSnapshotSchedule200JSONResponse(snapshotScheduleToOAPI(*schedule)), nil
234+
}
235+
236+
// SetInstanceSnapshotSchedule creates or updates a snapshot schedule for the resolved instance.
237+
func (s *ApiService) SetInstanceSnapshotSchedule(ctx context.Context, request oapi.SetInstanceSnapshotScheduleRequestObject) (oapi.SetInstanceSnapshotScheduleResponseObject, error) {
238+
inst := mw.GetResolvedInstance[instances.Instance](ctx)
239+
if inst == nil {
240+
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
241+
}
242+
if request.Body == nil {
243+
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "request body is required"}, nil
244+
}
245+
246+
scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
247+
if !ok {
248+
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
249+
}
250+
251+
interval, err := time.ParseDuration(request.Body.Interval)
252+
if err != nil {
253+
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "interval must be a valid duration"}, nil
254+
}
255+
if request.Body.Retention.MaxCount == nil && request.Body.Retention.MaxAge == nil {
256+
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "retention must include max_count or max_age"}, nil
257+
}
258+
259+
retention := instances.SnapshotScheduleRetention{}
260+
if request.Body.Retention.MaxCount != nil {
261+
retention.MaxCount = *request.Body.Retention.MaxCount
262+
}
263+
if request.Body.Retention.MaxAge != nil {
264+
maxAge, parseErr := time.ParseDuration(*request.Body.Retention.MaxAge)
265+
if parseErr != nil {
266+
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "retention.max_age must be a valid duration"}, nil
267+
}
268+
retention.MaxAge = maxAge
269+
}
270+
req := instances.SetSnapshotScheduleRequest{
271+
Interval: interval,
272+
Metadata: toMapTags(request.Body.Metadata),
273+
Retention: retention,
274+
}
275+
if request.Body.NamePrefix != nil {
276+
req.NamePrefix = *request.Body.NamePrefix
277+
}
278+
279+
schedule, err := scheduleManager.SetSnapshotSchedule(ctx, inst.Id, req)
280+
if err != nil {
281+
log := logger.FromContext(ctx)
282+
switch {
283+
case errors.Is(err, instances.ErrInvalidRequest):
284+
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: err.Error()}, nil
285+
case errors.Is(err, instances.ErrNotFound):
286+
return oapi.SetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
287+
default:
288+
log.ErrorContext(ctx, "failed to set snapshot schedule", "error", err)
289+
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to set snapshot schedule"}, nil
290+
}
291+
}
292+
293+
return oapi.SetInstanceSnapshotSchedule200JSONResponse(snapshotScheduleToOAPI(*schedule)), nil
294+
}
295+
296+
// DeleteInstanceSnapshotSchedule deletes a snapshot schedule for the resolved instance.
297+
func (s *ApiService) DeleteInstanceSnapshotSchedule(ctx context.Context, request oapi.DeleteInstanceSnapshotScheduleRequestObject) (oapi.DeleteInstanceSnapshotScheduleResponseObject, error) {
298+
inst := mw.GetResolvedInstance[instances.Instance](ctx)
299+
if inst == nil {
300+
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
301+
}
302+
303+
scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
304+
if !ok {
305+
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
306+
}
307+
308+
err := scheduleManager.DeleteSnapshotSchedule(ctx, inst.Id)
309+
if err != nil {
310+
log := logger.FromContext(ctx)
311+
switch {
312+
case errors.Is(err, instances.ErrSnapshotScheduleNotFound):
313+
return oapi.DeleteInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "snapshot schedule not found"}, nil
314+
case errors.Is(err, instances.ErrNotFound):
315+
return oapi.DeleteInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
316+
default:
317+
log.ErrorContext(ctx, "failed to delete snapshot schedule", "error", err)
318+
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to delete snapshot schedule"}, nil
319+
}
320+
}
321+
322+
return oapi.DeleteInstanceSnapshotSchedule204Response{}, nil
323+
}
324+
206325
func snapshotToOAPI(snapshot instances.Snapshot) oapi.Snapshot {
207326
kind := oapi.SnapshotKind(snapshot.Kind)
208327
sourceHypervisor := oapi.SnapshotSourceHypervisor(snapshot.SourceHypervisor)
@@ -239,3 +358,30 @@ func snapshotToOAPI(snapshot instances.Snapshot) oapi.Snapshot {
239358
}
240359
return out
241360
}
361+
362+
func snapshotScheduleToOAPI(schedule instances.SnapshotSchedule) oapi.SnapshotSchedule {
363+
retention := oapi.SnapshotScheduleRetention{
364+
MaxCount: lo.ToPtr(schedule.Retention.MaxCount),
365+
}
366+
if schedule.Retention.MaxAge > 0 {
367+
maxAge := schedule.Retention.MaxAge.String()
368+
retention.MaxAge = &maxAge
369+
}
370+
371+
out := oapi.SnapshotSchedule{
372+
InstanceId: schedule.InstanceID,
373+
Interval: schedule.Interval.String(),
374+
Metadata: toOAPITags(schedule.Metadata),
375+
Retention: retention,
376+
NextRunAt: schedule.NextRunAt,
377+
CreatedAt: schedule.CreatedAt,
378+
UpdatedAt: schedule.UpdatedAt,
379+
}
380+
if schedule.NamePrefix != "" {
381+
out.NamePrefix = lo.ToPtr(schedule.NamePrefix)
382+
}
383+
out.LastRunAt = schedule.LastRunAt
384+
out.LastSnapshotId = schedule.LastSnapshotID
385+
out.LastError = schedule.LastError
386+
return out
387+
}

cmd/api/api/snapshots_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package api
2+
3+
import (
4+
"testing"
5+
"time"
6+
7+
"github.com/kernel/hypeman/lib/instances"
8+
"github.com/stretchr/testify/assert"
9+
"github.com/stretchr/testify/require"
10+
)
11+
12+
func TestSnapshotScheduleToOAPIPreservesZeroMaxCount(t *testing.T) {
13+
t.Parallel()
14+
15+
schedule := instances.SnapshotSchedule{
16+
InstanceID: "inst-1",
17+
Interval: time.Hour,
18+
Retention: instances.SnapshotScheduleRetention{
19+
MaxCount: 0,
20+
MaxAge: 24 * time.Hour,
21+
},
22+
NextRunAt: time.Now().UTC().Add(time.Hour),
23+
CreatedAt: time.Now().UTC(),
24+
UpdatedAt: time.Now().UTC(),
25+
}
26+
27+
out := snapshotScheduleToOAPI(schedule)
28+
require.NotNil(t, out.Retention.MaxCount)
29+
assert.Equal(t, 0, *out.Retention.MaxCount)
30+
require.NotNil(t, out.Retention.MaxAge)
31+
assert.Equal(t, "24h0m0s", *out.Retention.MaxAge)
32+
}

cmd/api/main.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,29 @@ func run() error {
510510
}
511511
})
512512

513+
// Snapshot schedule scheduler
514+
if scheduleManager, ok := app.InstanceManager.(instances.SnapshotScheduleManager); ok {
515+
const snapshotSchedulePollInterval = time.Minute
516+
grp.Go(func() error {
517+
ticker := time.NewTicker(snapshotSchedulePollInterval)
518+
defer ticker.Stop()
519+
520+
logger.Info("snapshot schedule scheduler started", "interval", snapshotSchedulePollInterval)
521+
for {
522+
select {
523+
case <-gctx.Done():
524+
return nil
525+
case <-ticker.C:
526+
if err := scheduleManager.RunSnapshotSchedules(gctx); err != nil {
527+
logger.Error("snapshot schedule run completed with errors", "error", err)
528+
}
529+
}
530+
}
531+
})
532+
} else {
533+
logger.Warn("snapshot schedule manager unavailable; scheduled snapshots disabled")
534+
}
535+
513536
err = grp.Wait()
514537
slog.Info("all goroutines finished")
515538
return err

lib/instances/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,28 @@ Any State → Stopped
108108
- Don't prefault pages (lazy loading)
109109
- Parallel with TAP device setup
110110

111+
## Scheduled Snapshot Behavior
112+
113+
- Schedules are configured per instance and persisted in the server data store (outside snapshot payloads).
114+
- A background scheduler evaluates due schedules every minute.
115+
- Each due run chooses snapshot behavior from current source state:
116+
- `Running`/`Standby` sources use `Standby` snapshots.
117+
- `Stopped` sources use `Stopped` snapshots.
118+
- `Standby` runs from `Running` sources perform a brief pause/resume cycle during capture.
119+
- The minimum interval is `1m`, but larger intervals are recommended for heavier or latency-sensitive workloads because running captures pause/resume the guest.
120+
- Scheduled snapshot `name_prefix` is optional and capped at 47 chars so generated names stay within the 63-char snapshot name limit.
121+
- New schedules establish cadence at `now + interval + deterministic jitter` derived from the instance ID.
122+
- Updating only retention, metadata, or `name_prefix` preserves `next_run_at`; changing `interval` establishes a new cadence.
123+
- Schedule runs advance to the next future interval (no backfill flood after downtime).
124+
- Each schedule stores operational status:
125+
- `next_run_at`
126+
- `last_run_at`
127+
- `last_snapshot_id`
128+
- `last_error`
129+
- Retention cleanup runs after successful scheduled snapshot creation and only affects scheduled snapshots for that instance.
130+
- If an instance is deleted, its schedule is retained so retention can continue cleaning existing scheduled snapshots.
131+
- Once the deleted instance has no scheduled snapshots left, the scheduler removes that schedule automatically.
132+
111133
## Reference Handling
112134

113135
Instances use OCI image references directly:

lib/instances/errors.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,7 @@ var (
2929

3030
// ErrSnapshotNotFound is returned when a snapshot is not found.
3131
ErrSnapshotNotFound = errors.New("snapshot not found")
32+
33+
// ErrSnapshotScheduleNotFound is returned when a snapshot schedule is not found.
34+
ErrSnapshotScheduleNotFound = errors.New("snapshot schedule not found")
3235
)

lib/instances/manager.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package instances
33
import (
44
"context"
55
"fmt"
6+
"os"
67
"sync"
78
"time"
89

@@ -88,6 +89,8 @@ type manager struct {
8889
meter metric.Meter
8990
tracer trace.Tracer
9091
now func() time.Time
92+
writeFile func(string, []byte, os.FileMode) error
93+
deleteSnapshotFn func(context.Context, string) error
9194
egressProxy *egressproxy.Service
9295
egressProxyServiceOptions egressproxy.ServiceOptions
9396
egressProxyMu sync.Mutex
@@ -141,13 +144,15 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste
141144
vmStarters: vmStarters,
142145
defaultHypervisor: defaultHypervisor,
143146
now: time.Now,
147+
writeFile: os.WriteFile,
144148
meter: meter,
145149
tracer: tracer,
146150
guestMemoryPolicy: policy,
147151
snapshotDefaults: snapshotDefaults,
148152
compressionJobs: make(map[string]*compressionJob),
149153
nativeCodecPaths: make(map[string]string),
150154
}
155+
m.deleteSnapshotFn = m.deleteSnapshot
151156

152157
// Initialize metrics if meter is provided
153158
if meter != nil {

0 commit comments

Comments
 (0)