From 1ef578a0afb25bc156a049675f4f640a2c00cc41 Mon Sep 17 00:00:00 2001 From: justrach <54503978+justrach@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:14:00 +0800 Subject: [PATCH 1/2] =?UTF-8?q?test(#606):=20failing=20=E2=80=94=20word=20?= =?UTF-8?q?index=20never=20reuses=20doc=5Fid=20slots=20freed=20by=20remove?= =?UTF-8?q?File?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Fable 5 --- src/test_index.zig | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/test_index.zig b/src/test_index.zig index 6bacff5..fafa05b 100644 --- a/src/test_index.zig +++ b/src/test_index.zig @@ -3254,3 +3254,15 @@ test "issue-600: mmap_overlay writeToDisk persists overlay edits" { try testing.expectEqual(@as(usize, 0), g.len); } } + + +test "issue-606: word index reuses doc_id slots freed by removeFile" { + var wi = WordIndex.init(testing.allocator); + defer wi.deinit(); + + try wi.indexFile("a.zig", "const alpha = 1;\n"); + wi.removeFile("a.zig"); + try wi.indexFile("b.zig", "const beta = 2;\n"); + + try testing.expectEqual(@as(usize, 1), wi.id_to_path.items.len); +} From 508c41689dd14a52d16f15d6f57884ddf5a9b0e6 Mon Sep 17 00:00:00 2001 From: justrach <54503978+justrach@users.noreply.github.com> Date: Thu, 11 Jun 2026 15:26:53 +0800 Subject: [PATCH 2/2] fix(#606): word index reuses freed doc_id slots; persist/reload keeps attribution getOrCreateDocId pops from a free_ids pool fed by removeFile, mirroring TrigramIndex. Reuse is safe across persistence because writeToDisk compacts tombstones out of the id table and remaps postings to dense disk ids; the round-trip test pins that. Both branches of getOrCreateDocId now mutate state only after the fallible put, so a failed put can no longer leave a dangling id_to_path slot. Co-Authored-By: Claude Fable 5 --- src/index.zig | 19 +++++++++++++++- src/test_index.zig | 55 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/index.zig b/src/index.zig index 94d5d44..7c17870 100644 --- a/src/index.zig +++ b/src/index.zig @@ -21,6 +21,8 @@ pub const WordIndex = struct { enabled: bool = true, path_to_id: std.StringHashMap(u32), id_to_path: std.ArrayList([]const u8), + /// freed doc_id slots available for reuse by getOrCreateDocId + free_ids: std.ArrayList(u32) = .empty, /// doc_id → number of tokens indexed for that doc (BM25 length normalization). doc_lengths: std.AutoHashMap(u32, u32), /// Sum of all values in doc_lengths. @@ -41,9 +43,20 @@ pub const WordIndex = struct { fn getOrCreateDocId(self: *WordIndex, path: []const u8) !u32 { if (self.path_to_id.get(path)) |id| return id; + if (self.free_ids.items.len > 0) { + const freed = self.free_ids.items[self.free_ids.items.len - 1]; + // put first: a failed put must leave the free list and slot untouched + try self.path_to_id.put(path, freed); + _ = self.free_ids.pop(); + self.id_to_path.items[@as(usize, freed)] = path; + return freed; + } const id: u32 = @intCast(self.id_to_path.items.len); try self.id_to_path.append(self.allocator, path); - try self.path_to_id.put(path, id); + self.path_to_id.put(path, id) catch |err| { + _ = self.id_to_path.pop(); + return err; + }; return id; } @@ -71,6 +84,7 @@ pub const WordIndex = struct { if (path.len > 0) self.allocator.free(path); } self.id_to_path.deinit(self.allocator); + self.free_ids.deinit(self.allocator); self.doc_lengths.deinit(); self.allocator.free(self.word_dir); std.posix.munmap(m); @@ -100,6 +114,7 @@ pub const WordIndex = struct { self.path_to_id.deinit(); self.id_to_path.deinit(self.allocator); + self.free_ids.deinit(self.allocator); self.doc_lengths.deinit(); } @@ -136,6 +151,7 @@ pub const WordIndex = struct { // otherwise the slot aliases stable_path, freed below. if (self.skip_file_words and old_path.len > 0) self.allocator.free(old_path); self.id_to_path.items[doc_id] = ""; + self.free_ids.append(self.allocator, doc_id) catch {}; } if (self.doc_lengths.fetchRemove(doc_id)) |kv| { self.total_tokens -= kv.value; @@ -184,6 +200,7 @@ pub const WordIndex = struct { const old_path = self.id_to_path.items[doc_id]; if (self.skip_file_words and old_path.len > 0) self.allocator.free(old_path); self.id_to_path.items[doc_id] = ""; + self.free_ids.append(self.allocator, doc_id) catch {}; } var empty_words: std.ArrayList([]const u8) = .empty; defer empty_words.deinit(self.allocator); diff --git a/src/test_index.zig b/src/test_index.zig index fafa05b..8ae6ea8 100644 --- a/src/test_index.zig +++ b/src/test_index.zig @@ -3255,7 +3255,6 @@ test "issue-600: mmap_overlay writeToDisk persists overlay edits" { } } - test "issue-606: word index reuses doc_id slots freed by removeFile" { var wi = WordIndex.init(testing.allocator); defer wi.deinit(); @@ -3265,4 +3264,58 @@ test "issue-606: word index reuses doc_id slots freed by removeFile" { try wi.indexFile("b.zig", "const beta = 2;\n"); try testing.expectEqual(@as(usize, 1), wi.id_to_path.items.len); + + const beta_hits = try wi.searchDeduped("beta", testing.allocator); + defer testing.allocator.free(beta_hits); + try testing.expectEqual(@as(usize, 1), beta_hits.len); + try testing.expectEqualStrings("b.zig", wi.hitPath(beta_hits[0])); + + const alpha_hits = try wi.searchDeduped("alpha", testing.allocator); + defer testing.allocator.free(alpha_hits); + try testing.expectEqual(@as(usize, 0), alpha_hits.len); +} + +test "issue-606: doc_id reuse survives a persist/reload round-trip" { + const alloc = testing.allocator; + var wi = WordIndex.init(alloc); + defer wi.deinit(); + + try wi.indexFile("a.zig", "const alpha = 1;\n"); + try wi.indexFile("keep.zig", "const kappa = 3;\n"); + wi.removeFile("a.zig"); + // Reuses a.zig's freed slot; a stale posting resolving to the recycled id + // would attribute alpha hits to c.zig after reload. + try wi.indexFile("c.zig", "const gamma = 4;\n"); + + try testing.expectEqual(@as(usize, 2), wi.id_to_path.items.len); + + var tmp = testing.tmpDir(.{}); + defer tmp.cleanup(); + var path_buf: [std.fs.max_path_bytes]u8 = undefined; + const dir_path_len = try tmp.dir.realPathFile(io, ".", &path_buf); + const dir_path = path_buf[0..dir_path_len]; + + try wi.writeToDisk(io, dir_path, null); + + const maybe_loaded = WordIndex.readFromDisk(io, dir_path, alloc); + try testing.expect(maybe_loaded != null); + var loaded = maybe_loaded.?; + defer loaded.deinit(); + + try testing.expectEqual(@as(usize, 2), loaded.id_to_path.items.len); + try testing.expect(loaded.path_to_id.get("a.zig") == null); + + const gamma_hits = try loaded.searchDeduped("gamma", alloc); + defer alloc.free(gamma_hits); + try testing.expectEqual(@as(usize, 1), gamma_hits.len); + try testing.expectEqualStrings("c.zig", loaded.hitPath(gamma_hits[0])); + + const kappa_hits = try loaded.searchDeduped("kappa", alloc); + defer alloc.free(kappa_hits); + try testing.expectEqual(@as(usize, 1), kappa_hits.len); + try testing.expectEqualStrings("keep.zig", loaded.hitPath(kappa_hits[0])); + + const alpha_hits = try loaded.searchDeduped("alpha", alloc); + defer alloc.free(alpha_hits); + try testing.expectEqual(@as(usize, 0), alpha_hits.len); }