From 782fce8a978f8bd3b4b547ad8759813a49a4a5f0 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 23 Mar 2026 14:30:07 -0400 Subject: [PATCH 1/2] Fix image surfaces and dry-run reclaim semantics --- cmd/api/api/images_test.go | 84 ++++++++++++++++++++++++++++++ lib/guestmemory/controller.go | 4 +- lib/guestmemory/controller_test.go | 37 +++++++++++++ lib/images/manager.go | 30 ++++++----- lib/images/manager_test.go | 76 +++++++++++++++++++++++++++ lib/images/metrics.go | 2 +- lib/images/storage.go | 71 +++++++++++++++++++------ 7 files changed, 275 insertions(+), 29 deletions(-) diff --git a/cmd/api/api/images_test.go b/cmd/api/api/images_test.go index 7b8b1062..8295ffac 100644 --- a/cmd/api/api/images_test.go +++ b/cmd/api/api/images_test.go @@ -1,12 +1,15 @@ package api import ( + "encoding/json" "fmt" + "os" "testing" "time" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/oapi" + "github.com/kernel/hypeman/lib/paths" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -23,6 +26,38 @@ func TestListImages_Empty(t *testing.T) { assert.Empty(t, list) } +func TestListImages_FilterByTagsIncludesDigestOnlyImages(t *testing.T) { + t.Parallel() + svc := newTestService(t) + + const digestRef = "docker.io/library/alpine@sha256:029a752048e32e843bd6defe3841186fb8d19a28dae8ec287f433bb9d6d1ad85" + seedReadyDigestOnlyImage(t, svc, digestRef, map[string]string{ + "qa": "pr43-qa-20260323134902", + "surface": "image", + }) + + getResp, err := svc.GetImage(ctxWithImage(svc, digestRef), oapi.GetImageRequestObject{Name: digestRef}) + require.NoError(t, err) + + gotImage, ok := getResp.(oapi.GetImage200JSONResponse) + require.True(t, ok, "expected 200 response") + require.NotNil(t, gotImage.Tags) + require.Equal(t, "pr43-qa-20260323134902", (*gotImage.Tags)["qa"]) + + filter := oapi.Tags{ + "qa": "pr43-qa-20260323134902", + } + listResp, err := svc.ListImages(ctx(), oapi.ListImagesRequestObject{ + Params: oapi.ListImagesParams{Tags: &filter}, + }) + require.NoError(t, err) + + list, ok := listResp.(oapi.ListImages200JSONResponse) + require.True(t, ok, "expected 200 response") + require.Len(t, list, 1, "digest-only images with matching tags should be listed") + require.Equal(t, digestRef, list[0].Name) +} + func TestGetImage_NotFound(t *testing.T) { t.Parallel() svc := newTestService(t) @@ -33,6 +68,22 @@ func TestGetImage_NotFound(t *testing.T) { require.Error(t, err) } +func TestDeleteImage_DigestOnlyImageDoesNotInternalError(t *testing.T) { + t.Parallel() + svc := newTestService(t) + + const digestRef = "docker.io/library/alpine@sha256:029a752048e32e843bd6defe3841186fb8d19a28dae8ec287f433bb9d6d1ad85" + seedReadyDigestOnlyImage(t, svc, digestRef, map[string]string{ + "qa": "pr43-delete-20260323134902", + }) + + resp, err := svc.DeleteImage(ctxWithImage(svc, digestRef), oapi.DeleteImageRequestObject{Name: digestRef}) + require.NoError(t, err) + + _, ok := resp.(oapi.DeleteImage204Response) + require.True(t, ok, "expected deleting an existing digest-only image not to return internal_error") +} + func TestCreateImage_Async(t *testing.T) { t.Parallel() svc := newTestService(t) @@ -313,3 +364,36 @@ func formatQueuePos(pos *int) string { } return fmt.Sprintf("%d", *pos) } + +func seedReadyDigestOnlyImage(t *testing.T, svc *ApiService, imageRef string, imageTags map[string]string) { + t.Helper() + + ref, err := images.ParseNormalizedRef(imageRef) + require.NoError(t, err) + require.True(t, ref.IsDigest(), "test helper expects a digest reference") + + p := 
paths.New(svc.Config.DataDir) + digestDir := p.ImageDigestDir(ref.Repository(), ref.DigestHex()) + require.NoError(t, os.MkdirAll(digestDir, 0o755)) + require.NoError(t, os.WriteFile(p.ImageDigestPath(ref.Repository(), ref.DigestHex()), []byte("rootfs"), 0o644)) + + meta := struct { + Name string `json:"name"` + Digest string `json:"digest"` + Status string `json:"status"` + SizeBytes int64 `json:"size_bytes"` + Tags map[string]string `json:"tags,omitempty"` + CreatedAt time.Time `json:"created_at"` + }{ + Name: imageRef, + Digest: "sha256:" + ref.DigestHex(), + Status: "ready", + SizeBytes: int64(len("rootfs")), + Tags: imageTags, + CreatedAt: time.Now().UTC(), + } + + data, err := json.Marshal(meta) + require.NoError(t, err) + require.NoError(t, os.WriteFile(p.ImageMetadata(ref.Repository(), ref.DigestHex()), data, 0o644)) +} diff --git a/lib/guestmemory/controller.go b/lib/guestmemory/controller.go index f685697c..4be91fd0 100644 --- a/lib/guestmemory/controller.go +++ b/lib/guestmemory/controller.go @@ -299,7 +299,9 @@ func (c *controller) reconcile(ctx context.Context, req reconcileRequest) (Manua action.Status = "planned" action.TargetGuestMemoryBytes = appliedTarget } - action.AppliedReclaimBytes = candidate.vm.AssignedMemoryBytes - action.TargetGuestMemoryBytes + if !req.dryRun { + action.AppliedReclaimBytes = candidate.vm.AssignedMemoryBytes - action.TargetGuestMemoryBytes + } resp.AppliedReclaimBytes += action.AppliedReclaimBytes resp.Actions = append(resp.Actions, action) diff --git a/lib/guestmemory/controller_test.go b/lib/guestmemory/controller_test.go index 2b20427f..4720ec7a 100644 --- a/lib/guestmemory/controller_test.go +++ b/lib/guestmemory/controller_test.go @@ -240,3 +240,40 @@ func TestTriggerReclaimWithoutHoldAppliesRequestedReclaim(t *testing.T) { assert.Equal(t, int64(0), followup.AppliedReclaimBytes) assert.Equal(t, int64(1024*mib), hv.target) } + +func TestTriggerReclaimDryRunDoesNotReportAppliedReclaim(t *testing.T) { + const mib = int64(1024 * 1024) + + src := &stubSource{ + vms: []BalloonVM{ + {ID: "a", Name: "a", HypervisorType: hypervisor.TypeCloudHypervisor, SocketPath: "a", AssignedMemoryBytes: 1024 * mib}, + }, + } + hv := &stubHypervisor{target: 1024 * mib, capabilities: hypervisor.Capabilities{SupportsBalloonControl: true}} + + c := NewController(Policy{Enabled: true, ReclaimEnabled: true}, ActiveBallooningConfig{ + Enabled: true, + ProtectedFloorPercent: 50, + ProtectedFloorMinBytes: 0, + MinAdjustmentBytes: 1, + PerVMMaxStepBytes: 4096 * mib, + PerVMCooldown: time.Second, + }, src, slog.New(slog.NewTextHandler(io.Discard, nil))).(*controller) + c.sampler = &stubSampler{sample: HostPressureSample{TotalBytes: 1024 * mib, AvailableBytes: 1024 * mib, AvailablePercent: 100}} + c.reconcileMu.newClient = func(t hypervisor.Type, socket string) (hypervisor.Hypervisor, error) { + return hv, nil + } + + resp, err := c.TriggerReclaim(context.Background(), ManualReclaimRequest{ + ReclaimBytes: 256 * mib, + DryRun: true, + HoldFor: 30 * time.Second, + }) + require.NoError(t, err) + require.Len(t, resp.Actions, 1) + assert.Equal(t, int64(256*mib), resp.PlannedReclaimBytes) + assert.Equal(t, int64(0), resp.AppliedReclaimBytes, "dry-run should not report applied reclaim") + assert.Equal(t, "planned", resp.Actions[0].Status) + assert.Equal(t, int64(0), resp.Actions[0].AppliedReclaimBytes, "dry-run actions should not report applied reclaim") + assert.Equal(t, int64(1024*mib), hv.target, "dry-run should not mutate the hypervisor target") +} diff --git 
a/lib/images/manager.go b/lib/images/manager.go index 44959d39..27dd7ff5 100644 --- a/lib/images/manager.go +++ b/lib/images/manager.go @@ -91,9 +91,9 @@ func NewManager(p *paths.Paths, maxConcurrentBuilds int, meter metric.Meter) (Ma } func (m *manager) ListImages(ctx context.Context) ([]Image, error) { - metas, err := listAllTags(m.paths) + metas, err := listAllMetadata(m.paths) if err != nil { - return nil, fmt.Errorf("list tags: %w", err) + return nil, fmt.Errorf("list images: %w", err) } images := make([]Image, 0, len(metas)) @@ -349,7 +349,7 @@ func (m *manager) updateStatusByDigest(ref *ResolvedRef, status string, err erro } func (m *manager) RecoverInterruptedBuilds() { - metas, err := listAllTags(m.paths) + metas, err := listAllMetadata(m.paths) if err != nil { return // Best effort } @@ -422,12 +422,24 @@ func (m *manager) DeleteImage(ctx context.Context, name string) error { return fmt.Errorf("%w: %s", ErrInvalidName, err.Error()) } - // Only allow deleting by tag, not by digest + repository := ref.Repository() + + // Hold createMu during delete so tag resolution, tag removal, and orphan checks + // stay consistent with concurrent creates for the same repository digest. + m.createMu.Lock() + defer m.createMu.Unlock() + if ref.IsDigest() { - return fmt.Errorf("cannot delete by digest, use tag name instead") + digestHex := ref.DigestHex() + if _, err := readMetadata(m.paths, repository, digestHex); err != nil { + return err + } + if err := deleteTagsForDigest(m.paths, repository, digestHex); err != nil { + return err + } + return deleteDigest(m.paths, repository, digestHex) } - repository := ref.Repository() tag := ref.Tag() // Resolve the tag to get the digest before deleting @@ -441,12 +453,6 @@ func (m *manager) DeleteImage(ctx context.Context, name string) error { return err } - // Hold createMu during orphan check and delete to prevent race with CreateImage. - // Without this lock, a concurrent CreateImage could create a new tag pointing to - // the same digest between our count check and delete, leaving a dangling symlink. 
- m.createMu.Lock() - defer m.createMu.Unlock() - // Check if the digest is now orphaned (no other tags reference it) count, err := countTagsForDigest(m.paths, repository, digestHex) if err != nil { diff --git a/lib/images/manager_test.go b/lib/images/manager_test.go index 90d3c560..dbd3b2fb 100644 --- a/lib/images/manager_test.go +++ b/lib/images/manager_test.go @@ -2,6 +2,7 @@ package images import ( "context" + "encoding/json" "os" "path/filepath" "strings" @@ -165,6 +166,28 @@ func TestListImages(t *testing.T) { require.NotEmpty(t, images[0].Digest) } +func TestListImages_IncludesDigestOnlyImages(t *testing.T) { + dataDir := t.TempDir() + p := paths.New(dataDir) + mgr, err := NewManager(p, 1, nil) + require.NoError(t, err) + + const digestRef = "docker.io/library/alpine@sha256:029a752048e32e843bd6defe3841186fb8d19a28dae8ec287f433bb9d6d1ad85" + seedReadyDigestOnlyImageMetadata(t, p, digestRef, map[string]string{ + "qa": "pr43-qa-20260323134902", + "surface": "image", + }) + + images, err := mgr.ListImages(context.Background()) + require.NoError(t, err) + require.Len(t, images, 1) + require.Equal(t, digestRef, images[0].Name) + require.Equal(t, map[string]string{ + "qa": "pr43-qa-20260323134902", + "surface": "image", + }, map[string]string(images[0].Tags)) +} + func TestGetImage(t *testing.T) { dataDir := t.TempDir() mgr, err := NewManager(paths.New(dataDir), 1, nil) @@ -235,6 +258,27 @@ func TestDeleteImage(t *testing.T) { require.True(t, os.IsNotExist(err), "digest directory should be deleted when orphaned") } +func TestDeleteImageByDigest(t *testing.T) { + dataDir := t.TempDir() + p := paths.New(dataDir) + mgr, err := NewManager(p, 1, nil) + require.NoError(t, err) + + ctx := context.Background() + const digestRef = "docker.io/library/alpine@sha256:029a752048e32e843bd6defe3841186fb8d19a28dae8ec287f433bb9d6d1ad85" + seedReadyDigestOnlyImageMetadata(t, p, digestRef, map[string]string{ + "qa": "pr43-delete-20260323134902", + }) + + err = mgr.DeleteImage(ctx, digestRef) + require.NoError(t, err) + + ref, err := ParseNormalizedRef(digestRef) + require.NoError(t, err) + _, err = os.Stat(digestDir(p, ref.Repository(), ref.DigestHex())) + require.True(t, os.IsNotExist(err), "digest directory should be deleted when deleting by digest") +} + func TestDeleteImageNotFound(t *testing.T) { dataDir := t.TempDir() mgr, err := NewManager(paths.New(dataDir), 1, nil) @@ -402,6 +446,38 @@ func countFiles(dir string) (int, error) { return len(entries), nil } +func seedReadyDigestOnlyImageMetadata(t *testing.T, p *paths.Paths, imageRef string, imageTags map[string]string) { + t.Helper() + + ref, err := ParseNormalizedRef(imageRef) + require.NoError(t, err) + require.True(t, ref.IsDigest(), "test helper expects a digest reference") + + digestDirPath := p.ImageDigestDir(ref.Repository(), ref.DigestHex()) + require.NoError(t, os.MkdirAll(digestDirPath, 0o755)) + require.NoError(t, os.WriteFile(p.ImageDigestPath(ref.Repository(), ref.DigestHex()), []byte("rootfs"), 0o644)) + + meta := struct { + Name string `json:"name"` + Digest string `json:"digest"` + Status string `json:"status"` + SizeBytes int64 `json:"size_bytes"` + Tags map[string]string `json:"tags,omitempty"` + CreatedAt time.Time `json:"created_at"` + }{ + Name: imageRef, + Digest: ref.Digest(), + Status: StatusReady, + SizeBytes: int64(len("rootfs")), + Tags: imageTags, + CreatedAt: time.Now().UTC(), + } + + data, err := json.Marshal(meta) + require.NoError(t, err) + require.NoError(t, os.WriteFile(p.ImageMetadata(ref.Repository(), 
ref.DigestHex()), data, 0o644)) +} + // TestImportLocalImageFromOCICache is an integration test that simulates the full // builder image import flow used by buildBuilderFromDockerfile: // diff --git a/lib/images/metrics.go b/lib/images/metrics.go index 08334ef2..1e8d480a 100644 --- a/lib/images/metrics.go +++ b/lib/images/metrics.go @@ -56,7 +56,7 @@ func newMetrics(meter metric.Meter, m *manager) (*Metrics, error) { o.ObserveInt64(buildQueueLength, int64(m.queue.QueueLength())) // Count images by status - metas, err := listAllTags(m.paths) + metas, err := listAllMetadata(m.paths) if err != nil { return nil } diff --git a/lib/images/storage.go b/lib/images/storage.go index 01b2a2ea..accfa9cf 100644 --- a/lib/images/storage.go +++ b/lib/images/storage.go @@ -2,6 +2,7 @@ package images import ( "encoding/json" + "errors" "fmt" "os" "path/filepath" @@ -223,41 +224,43 @@ func listTags(p *paths.Paths, repository string) ([]string, error) { return tags, nil } -// listAllTags returns all tags across all repositories -func listAllTags(p *paths.Paths) ([]*imageMetadata, error) { +// listAllMetadata returns one metadata record per digest across all repositories. +// Tagged images are discovered through tag symlinks, and digest-only images are +// discovered directly from their metadata.json files. +func listAllMetadata(p *paths.Paths) ([]*imageMetadata, error) { imagesDir := p.ImagesDir() - var metas []*imageMetadata + seen := make(map[string]struct{}) + metas := make([]*imageMetadata, 0) - // Walk the images directory to find all repositories err := filepath.Walk(imagesDir, func(path string, info os.FileInfo, err error) error { if err != nil { return nil // Skip errors } - // Check if this is a symlink (tag) - if info.Mode()&os.ModeSymlink != 0 { - // Read the symlink to get digest hex + switch { + case info.Mode()&os.ModeSymlink != 0: digestHex, err := os.Readlink(path) if err != nil { return nil // Skip invalid symlinks } - // Get repository from path - relPath, err := filepath.Rel(imagesDir, filepath.Dir(path)) + repository, err := filepath.Rel(imagesDir, filepath.Dir(path)) if err != nil { return nil } - // Read metadata for this digest - meta, err := readMetadata(p, relPath, digestHex) + return appendMetadataIfNew(p, repository, digestHex, seen, &metas) + case !info.IsDir() && info.Name() == "metadata.json": + digestHex := filepath.Base(filepath.Dir(path)) + repository, err := filepath.Rel(imagesDir, filepath.Dir(filepath.Dir(path))) if err != nil { - return nil // Skip if metadata can't be read + return nil } - metas = append(metas, meta) + return appendMetadataIfNew(p, repository, digestHex, seen, &metas) + default: + return nil } - - return nil }) if err != nil && !os.IsNotExist(err) { @@ -267,6 +270,22 @@ func listAllTags(p *paths.Paths) ([]*imageMetadata, error) { return metas, nil } +func appendMetadataIfNew(p *paths.Paths, repository, digestHex string, seen map[string]struct{}, metas *[]*imageMetadata) error { + key := repository + "@" + digestHex + if _, ok := seen[key]; ok { + return nil + } + + meta, err := readMetadata(p, repository, digestHex) + if err != nil { + return nil // Skip if metadata can't be read + } + + seen[key] = struct{}{} + *metas = append(*metas, meta) + return nil +} + // digestExists checks if a digest directory exists func digestExists(p *paths.Paths, repository, digestHex string) bool { dir := digestDir(p, repository, digestHex) @@ -314,6 +333,28 @@ func countTagsForDigest(p *paths.Paths, repository, digestHex string) (int, erro return count, nil } +func 
deleteTagsForDigest(p *paths.Paths, repository, digestHex string) error { + tags, err := listTags(p, repository) + if err != nil { + return err + } + + for _, tag := range tags { + target, err := resolveTag(p, repository, tag) + if err != nil { + continue + } + if target != digestHex { + continue + } + if err := deleteTag(p, repository, tag); err != nil && !errors.Is(err, ErrNotFound) { + return err + } + } + + return nil +} + // deleteDigest removes a digest directory and all its contents func deleteDigest(p *paths.Paths, repository, digestHex string) error { dir := digestDir(p, repository, digestHex) From 84e2c2221e24f4a401be5e878a89c1fc5024e50d Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 23 Mar 2026 15:12:20 -0400 Subject: [PATCH 2/2] Add test agent notes --- skills/test-agent/agents/test-agent/NOTES.md | 66 ++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/skills/test-agent/agents/test-agent/NOTES.md b/skills/test-agent/agents/test-agent/NOTES.md index f83cf8c6..3b92fc5e 100644 --- a/skills/test-agent/agents/test-agent/NOTES.md +++ b/skills/test-agent/agents/test-agent/NOTES.md @@ -145,3 +145,69 @@ - Run 3: 96s (pass) - `lib/instances` runtime in those runs: - 97.618s, 117.392s, 71.886s + +## 2026-03-23 - Current branch verification (`codex/pr43-fix-image-and-reclaim`) + +### What initially looked flaky but was actually command-shape drift +- A fresh direct run without test-prewarm env exported failed in `lib/instances` with several image waits timing out at 60s: + - `TestEntrypointEnvVars` + - `TestQEMUEntrypointEnvVars` + - `TestQEMUForkFromRunningNetwork` + - `TestQEMUStandbyAndRestore` +- Symptom: + - image status still `pending` after 60 seconds +- Root cause: + - I ran `go run ./cmd/test-prewarm` but forgot to export: + - `HYPEMAN_TEST_PREWARM_DIR=/root/.cache/hypeman-ci/linux-x86_64` + - `HYPEMAN_TEST_REGISTRY=127.0.0.1:5001` + - Without those env vars, `integrationTestImageRef` in `lib/instances/test_prewarm_test.go` does not remap Docker Hub refs to the local registry mirror, so several tests pay cold remote-pull + queue time and the 60s readiness assumptions become invalid. +- Conclusion: + - This was not a repository flake on the current branch; it was an incorrect manual command shape. 
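For reference, a minimal Go sketch of the remap behavior described above. The real helper is `integrationTestImageRef` in `lib/instances/test_prewarm_test.go`; its exact signature and rewrite rules are not shown here, so everything below beyond the two env var names is an assumption:

```go
package instances_test

import (
	"os"
	"strings"
)

// integrationTestImageRefSketch is a hypothetical stand-in for the repo's
// integrationTestImageRef. When both prewarm env vars are exported, it rewrites
// Docker Hub refs to the local registry mirror; otherwise it returns the ref
// unchanged, which is why tests pay cold remote-pull time without the env.
func integrationTestImageRefSketch(ref string) string {
	registry := os.Getenv("HYPEMAN_TEST_REGISTRY")   // e.g. 127.0.0.1:5001
	prewarm := os.Getenv("HYPEMAN_TEST_PREWARM_DIR") // e.g. /root/.cache/hypeman-ci/linux-x86_64
	if registry == "" || prewarm == "" {
		return ref // no remap: pulls go to Docker Hub and may exceed the 60s window
	}
	// docker.io/library/alpine:3.20 -> 127.0.0.1:5001/library/alpine:3.20
	return strings.Replace(ref, "docker.io/", registry+"/", 1)
}
```

With both vars exported before `go run ./cmd/test-prewarm` and the test run, refs resolve against the mirror and the 60s readiness assumption holds.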
+ +### Fresh-cache CI-like runs on `deft-kernel-dev` +- Remote workspace: + - `~/hm43` +- Bootstrap: + - `make ensure-ch-binaries ensure-firecracker-binaries ensure-caddy-binaries build-embedded` +- Smoke runs with fresh Go caches and correct prewarm env: + - flow: + - `go mod download` + - `make oapi-generate` + - `make build` + - `go run ./cmd/test-prewarm` + - `make test TEST_TIMEOUT=20m` + - results: + - Run 1: 221s (pass) + - Run 2: 208s (pass) +- Strict no-cache runs with fresh Go caches and direct test execution: + - flow: + - `go mod download` + - `make oapi-generate` + - `make build` + - `go run ./cmd/test-prewarm` + - `go test -count=1 -tags containers_image_openpgp -timeout=20m ./...` + - results: + - Run 1: 211s (pass) + - Run 2: 210s (pass) + +### Current slowest individual tests (`lib/instances`, `go test -json`) +- `TestCloudHypervisorStandbyRestoreCompressionScenarios`: 51.34s +- `TestForkCloudHypervisorFromRunningNetwork`: 44.64s +- `TestFirecrackerForkFromRunningNetwork`: 34.00s +- `TestFirecrackerStopClearsStaleSnapshot`: 31.59s +- `TestCreateInstanceWithNetwork`: 30.80s +- `TestCompressSnapshotMemoryFileReturnsContextCanceledWhenNativeProcessIsKilled`: 30.00s +- `TestQEMUStandbyRestoreCompressionScenarios`: 29.74s +- `TestFirecrackerNetworkLifecycle`: 28.98s +- `TestFirecrackerStandbyRestoreCompressionScenarios`: 27.91s +- `TestQEMUForkFromRunningNetwork`: 27.02s + +### Assessment +- No current-branch flakes reproduced once the command shape matched the repo’s intended prewarm setup. +- The remaining cost is concentrated in `lib/instances` hypervisor integration coverage and looks mostly non-redundant: + - running-fork coverage is split across CH / Firecracker / QEMU + - standby/restore compression scenarios are split across hypervisors + - network lifecycle / create-instance / end-to-end tests each cover different surfaces +- One environmental contributor on `deft-kernel-dev`: + - `lz4` and `zstd` are not installed, so snapshot compression tests fall back to the Go implementation, which likely inflates compression-scenario timings. +- I did not make a test-quality code change in this pass because I did not find a low-risk redundancy or speed improvement that was clearly justified after the no-cache runs came back clean.
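For the `lz4`/`zstd` point above, a hypothetical sketch of the fallback pattern the note implies; the repo's actual compression code, names, and call sites are not shown here:

```go
package snapshot

import "os/exec"

// nativeCompressor reports whether a native codec binary ("lz4" or "zstd") is
// installed. In the pattern described above, callers shell out to the native
// binary when present and fall back to a slower pure-Go implementation when it
// is not; that fallback is what likely inflates the compression-scenario
// timings on deft-kernel-dev. Names here are illustrative, not the repo's API.
func nativeCompressor(codec string) (string, bool) {
	path, err := exec.LookPath(codec)
	if err != nil {
		return "", false // not on PATH: caller uses the Go fallback
	}
	return path, true
}
```

If that reading is right, installing `lz4` and `zstd` on the runner should exercise the native path and trim the standby/restore compression scenarios.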