diff --git a/src/index.zig b/src/index.zig index 94d5d44..7c17870 100644 --- a/src/index.zig +++ b/src/index.zig @@ -21,6 +21,8 @@ pub const WordIndex = struct { enabled: bool = true, path_to_id: std.StringHashMap(u32), id_to_path: std.ArrayList([]const u8), + /// freed doc_id slots available for reuse by getOrCreateDocId + free_ids: std.ArrayList(u32) = .empty, /// doc_id → number of tokens indexed for that doc (BM25 length normalization). doc_lengths: std.AutoHashMap(u32, u32), /// Sum of all values in doc_lengths. @@ -41,9 +43,20 @@ pub const WordIndex = struct { fn getOrCreateDocId(self: *WordIndex, path: []const u8) !u32 { if (self.path_to_id.get(path)) |id| return id; + if (self.free_ids.items.len > 0) { + const freed = self.free_ids.items[self.free_ids.items.len - 1]; + // put first: a failed put must leave the free list and slot untouched + try self.path_to_id.put(path, freed); + _ = self.free_ids.pop(); + self.id_to_path.items[@as(usize, freed)] = path; + return freed; + } const id: u32 = @intCast(self.id_to_path.items.len); try self.id_to_path.append(self.allocator, path); - try self.path_to_id.put(path, id); + self.path_to_id.put(path, id) catch |err| { + _ = self.id_to_path.pop(); + return err; + }; return id; } @@ -71,6 +84,7 @@ pub const WordIndex = struct { if (path.len > 0) self.allocator.free(path); } self.id_to_path.deinit(self.allocator); + self.free_ids.deinit(self.allocator); self.doc_lengths.deinit(); self.allocator.free(self.word_dir); std.posix.munmap(m); @@ -100,6 +114,7 @@ pub const WordIndex = struct { self.path_to_id.deinit(); self.id_to_path.deinit(self.allocator); + self.free_ids.deinit(self.allocator); self.doc_lengths.deinit(); } @@ -136,6 +151,7 @@ pub const WordIndex = struct { // otherwise the slot aliases stable_path, freed below. if (self.skip_file_words and old_path.len > 0) self.allocator.free(old_path); self.id_to_path.items[doc_id] = ""; + self.free_ids.append(self.allocator, doc_id) catch {}; } if (self.doc_lengths.fetchRemove(doc_id)) |kv| { self.total_tokens -= kv.value; @@ -184,6 +200,7 @@ pub const WordIndex = struct { const old_path = self.id_to_path.items[doc_id]; if (self.skip_file_words and old_path.len > 0) self.allocator.free(old_path); self.id_to_path.items[doc_id] = ""; + self.free_ids.append(self.allocator, doc_id) catch {}; } var empty_words: std.ArrayList([]const u8) = .empty; defer empty_words.deinit(self.allocator); diff --git a/src/test_index.zig b/src/test_index.zig index 6bacff5..8ae6ea8 100644 --- a/src/test_index.zig +++ b/src/test_index.zig @@ -3254,3 +3254,68 @@ test "issue-600: mmap_overlay writeToDisk persists overlay edits" { try testing.expectEqual(@as(usize, 0), g.len); } } + +test "issue-606: word index reuses doc_id slots freed by removeFile" { + var wi = WordIndex.init(testing.allocator); + defer wi.deinit(); + + try wi.indexFile("a.zig", "const alpha = 1;\n"); + wi.removeFile("a.zig"); + try wi.indexFile("b.zig", "const beta = 2;\n"); + + try testing.expectEqual(@as(usize, 1), wi.id_to_path.items.len); + + const beta_hits = try wi.searchDeduped("beta", testing.allocator); + defer testing.allocator.free(beta_hits); + try testing.expectEqual(@as(usize, 1), beta_hits.len); + try testing.expectEqualStrings("b.zig", wi.hitPath(beta_hits[0])); + + const alpha_hits = try wi.searchDeduped("alpha", testing.allocator); + defer testing.allocator.free(alpha_hits); + try testing.expectEqual(@as(usize, 0), alpha_hits.len); +} + +test "issue-606: doc_id reuse survives a persist/reload round-trip" { + const alloc = testing.allocator; + var wi = WordIndex.init(alloc); + defer wi.deinit(); + + try wi.indexFile("a.zig", "const alpha = 1;\n"); + try wi.indexFile("keep.zig", "const kappa = 3;\n"); + wi.removeFile("a.zig"); + // Reuses a.zig's freed slot; a stale posting resolving to the recycled id + // would attribute alpha hits to c.zig after reload. + try wi.indexFile("c.zig", "const gamma = 4;\n"); + + try testing.expectEqual(@as(usize, 2), wi.id_to_path.items.len); + + var tmp = testing.tmpDir(.{}); + defer tmp.cleanup(); + var path_buf: [std.fs.max_path_bytes]u8 = undefined; + const dir_path_len = try tmp.dir.realPathFile(io, ".", &path_buf); + const dir_path = path_buf[0..dir_path_len]; + + try wi.writeToDisk(io, dir_path, null); + + const maybe_loaded = WordIndex.readFromDisk(io, dir_path, alloc); + try testing.expect(maybe_loaded != null); + var loaded = maybe_loaded.?; + defer loaded.deinit(); + + try testing.expectEqual(@as(usize, 2), loaded.id_to_path.items.len); + try testing.expect(loaded.path_to_id.get("a.zig") == null); + + const gamma_hits = try loaded.searchDeduped("gamma", alloc); + defer alloc.free(gamma_hits); + try testing.expectEqual(@as(usize, 1), gamma_hits.len); + try testing.expectEqualStrings("c.zig", loaded.hitPath(gamma_hits[0])); + + const kappa_hits = try loaded.searchDeduped("kappa", alloc); + defer alloc.free(kappa_hits); + try testing.expectEqual(@as(usize, 1), kappa_hits.len); + try testing.expectEqualStrings("keep.zig", loaded.hitPath(kappa_hits[0])); + + const alpha_hits = try loaded.searchDeduped("alpha", alloc); + defer alloc.free(alpha_hits); + try testing.expectEqual(@as(usize, 0), alpha_hits.len); +}