Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions index/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,78 @@ func TestReindexModifiedFilePreservesOldHash(t *testing.T) {
}
}

// TestHashFileSizePairsWithDigest pins the stat-after-hash contract: the
// size hashFile returns is the count of bytes that produced the digest, read
// from the open handle's fstat rather than a path stat that could observe a
// different inode state. A regression to a pre-hash path stat would let the
// digest and the size describe different bytes; here they must agree.
func TestHashFileSizePairsWithDigest(t *testing.T) {
	body := "the quick brown fox"

	// Lay down a fixture with known bytes so the size and the digest can
	// both be checked against the same content.
	target := filepath.Join(t.TempDir(), "doc.txt")
	writeFile(t, target, body)

	result, err := hashFile(target, make([]byte, hashReadBufferSize))
	if err != nil {
		t.Fatalf("hashFile: %v", err)
	}

	// The reported size must equal the byte count of the fixture ...
	if wantSize := int64(len(body)); result.sizeBytes != wantSize {
		t.Fatalf("sizeBytes = %d, want %d (size of the hashed bytes)", result.sizeBytes, len(body))
	}
	// ... and the digest must be the BLAKE3 of those very same bytes.
	if wantDigest := blake3Of(t, body); !bytes.Equal(result.digest, wantDigest) {
		t.Fatalf("digest does not match BLAKE3 of the %d hashed bytes", len(body))
	}
}

// TestReindexAfterAppendSupersedesToConsistentRow exercises the residual the
// stat-after-hash fix documents: a file that grows between index runs cannot
// leave a row whose size disagrees with its hash. The grown file mints a new
// (digest, size) pair that supersedes the prior row, and each live row's size
// matches the content its hash covers.
func TestReindexAfterAppendSupersedesToConsistentRow(t *testing.T) {
	root := t.TempDir()
	path := filepath.Join(root, "doc.txt")
	writeFile(t, path, "abc")

	// Expected digests for the two generations of the file, computed once
	// and reused by every assertion below.
	oldHash := blake3Of(t, "abc")
	newHash := blake3Of(t, "abcdef")

	s := setupStore(t)
	ctx := context.Background()
	if _, err := Index(ctx, s, root, Options{}); err != nil {
		t.Fatalf("first Index: %v", err)
	}
	// Fail loudly rather than look up a volume under an empty root if the
	// path cannot be made absolute.
	absRoot, err := filepath.Abs(root)
	if err != nil {
		t.Fatalf("filepath.Abs(%q): %v", root, err)
	}
	vol := volumeFor(t, s, absRoot)
	first, err := s.GetByPath(ctx, vol.ID, "doc.txt")
	if err != nil {
		t.Fatalf("GetByPath after first: %v", err)
	}
	if first.SizeBytes != 3 || !bytes.Equal(first.Blake3, oldHash) {
		t.Fatalf("first row = (size=%d), want size=3 matching BLAKE3(abc)", first.SizeBytes)
	}

	// Grow the file and index again; the new content must supersede the
	// old row instead of rewriting it.
	writeFile(t, path, "abcdef")
	if _, err := Index(ctx, s, root, Options{}); err != nil {
		t.Fatalf("second Index: %v", err)
	}

	history, err := s.ListHistoryByPath(ctx, vol.ID, "doc.txt")
	if err != nil {
		t.Fatalf("ListHistoryByPath: %v", err)
	}
	if len(history) != 2 {
		t.Fatalf("history = %d rows, want 2", len(history))
	}

	// Every row must pair its size with the hash of the same content: the
	// present row describes the grown file, any other row the original.
	present := 0
	for _, row := range history {
		want, wantSize := oldHash, int64(3)
		if row.Status == store.StatusPresent {
			present++
			want, wantSize = newHash, 6
		}
		if row.SizeBytes != wantSize || !bytes.Equal(row.Blake3, want) {
			t.Fatalf("row (status=%s) size=%d does not match its hash", row.Status, row.SizeBytes)
		}
	}
	// Without this check a history of two non-present "abc" rows would
	// pass the loop above even though the grown content was never recorded.
	if present != 1 {
		t.Fatalf("present rows = %d, want exactly 1 (the grown content)", present)
	}
}

func TestDryRunDoesNotRecordRun(t *testing.T) {
root := t.TempDir()
writeFile(t, filepath.Join(root, "a.txt"), "hello")
Expand Down
33 changes: 33 additions & 0 deletions store/migrations.go
Original file line number Diff line number Diff line change
Expand Up @@ -1442,7 +1442,17 @@ func migrateV13ToV14(ctx context.Context, db *sql.DB) error {
// row per distinct blake3 in the old files table. The seed row per hash
// is chosen by (first_seen_run_id, rowid) ascending so the backfill is
// deterministic when several rows share the earliest run.
//
// The size guard runs first: one contents row per blake3 can carry one
// size, so two v13 rows sharing a hash with differing sizes (reachable
// only via prior corruption or an indexer stat/hash TOCTOU) would force
// the seed to silently keep the earliest observation's size and discard
// the rest. Refusing turns that into a loud pre-migration failure the
// operator can investigate against the pre-migration snapshot.
func createAndSeedContentsV14(ctx context.Context, tx *sql.Tx) error {
if err := refuseSameHashDifferentSizeV14(ctx, tx); err != nil {
return err
}
stmts := []string{
// origin_run_id is in the origin node's run space (NULL together
// with origin_node_id means "introduced locally"), so it is
Expand Down Expand Up @@ -1476,6 +1486,29 @@ func createAndSeedContentsV14(ctx context.Context, tx *sql.Tx) error {
return nil
}

// refuseSameHashDifferentSizeV14 is the guard that runs ahead of the v14
// contents seed. It counts the blake3 values in the old files table that are
// recorded with more than one distinct size_bytes and returns an error when
// that count is positive; a failure of the counting query itself is returned
// wrapped. A BLAKE3 digest is computed over the bytes, so one hash with
// several sizes cannot come from honest indexing — it points at prior
// corruption or a stat/hash TOCTOU — and collapsing it to a single size would
// hide the disagreement rather than report it.
func refuseSameHashDifferentSizeV14(ctx context.Context, tx *sql.Tx) error {
	// One output row per conflicting hash; the outer COUNT(*) folds them
	// into a single number so exactly one row is scanned.
	const conflictQuery = `
SELECT COUNT(*) FROM (
SELECT blake3 FROM files
GROUP BY blake3
HAVING COUNT(DISTINCT size_bytes) > 1
)`

	var n int
	row := tx.QueryRowContext(ctx, conflictQuery)
	if err := row.Scan(&n); err != nil {
		return fmt.Errorf("check same-hash-different-size: %w", err)
	}
	if n <= 0 {
		return nil
	}
	return fmt.Errorf("refusing v13→v14: %d blake3 hash(es) in files carry differing size_bytes; "+
		"the index is corrupt and one contents row cannot represent both sizes — "+
		"restore from the pre-migration snapshot and re-index", n)
}

// rebuildFilesV14 stages the reshaped files table, copies every old row
// with its blake3 resolved to the freshly seeded contents id, and swaps
// the new table into place. blake3↔content_id is one-to-one, so the PK
Expand Down
Loading
Loading