Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions src/explore.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2009,22 +2009,91 @@ pub const Explorer = struct {
fn bumpSearchGen(self: *Explorer) void {
_ = self.search_gen.fetchAdd(1, .release);
}
/// Remove every skip_trigram_files entry the current trigram index
/// covers. Caller must hold the EXCLUSIVE lock. Snapshot restore parks
/// every file in the skip set (#507/#537 — it cannot know what a disk
/// trigram index covers), so without this reconciliation tier 3
/// content-scans the ENTIRE project on each fall-through query even
/// though the loaded index already answers for those files. Keys are
/// borrowed from `outlines`, so removal never frees.
fn pruneSkipTrigramLocked(self: *Explorer) void {
var to_remove: std.ArrayList([]const u8) = .empty;
defer to_remove.deinit(self.allocator);
var it = self.skip_trigram_files.keyIterator();
while (it.next()) |k| {
if (self.trigram_index.containsFile(k.*)) to_remove.append(self.allocator, k.*) catch break;
}
for (to_remove.items) |k| _ = self.skip_trigram_files.remove(k);
}

/// Swap in a trigram index (disk mmap load / post-scan build) and
/// reconcile the skip set with what it covers. Every external
/// trigram_index replacement must go through here — a bare swap leaves
/// stale skip_trigram_files entries (see pruneSkipTrigramLocked).
pub fn adoptTrigramIndex(self: *Explorer, new_index: AnyTrigramIndex) void {
self.mu.lock();
defer self.mu.unlock();
self.bumpSearchGen();
self.trigram_index.deinit();
self.trigram_index = new_index;
self.pruneSkipTrigramLocked();
}

/// Adopt a disk-loaded mmap trigram as the BASE index, keeping any files
/// already trigram-indexed in the current heap index (snapshot freshness
/// reindex touches changed files BEFORE the disk load runs) as a masked
/// overlay so their newer content wins. Replaces the `fileCount() > 0`
/// early-return that let a single reindexed file block the disk load
/// entirely — leaving tier 3 to content-scan the whole project on every
/// fall-through query.
pub fn adoptTrigramBase(self: *Explorer, base: idx.MmapTrigramIndex) void {
self.mu.lock();
defer self.mu.unlock();
self.bumpSearchGen();
switch (self.trigram_index) {
.heap => |heap_copy| {
if (heap_copy.fileCount() == 0) {
var old = heap_copy;
old.deinit();
self.trigram_index = .{ .mmap = base };
} else {
self.trigram_index = .{ .mmap_overlay = .{
.base = base,
.overlay = heap_copy,
.masked = std.StringHashMap(void).init(self.allocator),
} };
var it = self.trigram_index.mmap_overlay.overlay.file_trigrams.keyIterator();
while (it.next()) |k| self.trigram_index.mmap_overlay.mask(k.*);
}
},
.mmap, .mmap_overlay => {
// Already disk-backed — nothing to gain from a second base.
var dupe_base = base;
dupe_base.deinit();
},
}
self.pruneSkipTrigramLocked();
}

/// Rebuild trigram index from the stored file contents.
/// Used after a cache hit to populate trigrams when they were skipped during the fast scan.
pub fn rebuildTrigrams(self: *Explorer) !void {
self.mu.lock();
defer self.mu.unlock();
self.bumpSearchGen();
var iter = self.contents.iterator();
while (iter.next()) |entry| {
// Skip large files to prevent OOM on large repos
if (entry.value_ptr.*.len > 1024 * 1024) continue;
self.trigram_index.indexFile(entry.key_ptr.*, entry.value_ptr.*) catch |err| switch (err) {
error.OutOfMemory => {
std.log.warn("trigram OOM, skipping remaining files", .{});
self.pruneSkipTrigramLocked();
return;
},
};
}
self.pruneSkipTrigramLocked();
}

/// Rebuild the inverted word index from cached contents when complete, or
Expand Down
2 changes: 1 addition & 1 deletion src/index.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2443,7 +2443,7 @@ pub const AnyTrigramIndex = union(enum) {
masked: std.StringHashMap(void),
masked_in_base: u32 = 0,

fn mask(self: *MmapOverlay, path: []const u8) void {
pub fn mask(self: *MmapOverlay, path: []const u8) void {
if (self.masked.contains(path)) return;
const key = self.overlay.allocator.dupe(u8, path) catch return;
self.masked.put(key, {}) catch {
Expand Down
64 changes: 21 additions & 43 deletions src/main.zig
Original file line number Diff line number Diff line change
Expand Up @@ -1335,10 +1335,7 @@ fn mainImpl() !void {
tri.deinit();
std.heap.c_allocator.destroy(tri);
if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .mmap = loaded };
explorer.mu.unlock();
explorer.adoptTrigramIndex(.{ .mmap = loaded });
}
}
} else {
Expand All @@ -1359,15 +1356,9 @@ fn mainImpl() !void {
const current_count = @as(u32, @intCast(explorer.outlines.count()));
if (disk_hdr != null and current_count == disk_hdr.?.file_count) {
if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .mmap = loaded };
explorer.mu.unlock();
explorer.adoptTrigramIndex(.{ .mmap = loaded });
} else if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .heap = loaded };
explorer.mu.unlock();
explorer.adoptTrigramIndex(.{ .heap = loaded });
} else {
explorer.rebuildTrigrams() catch {};
explorer.trigram_index.writeToDisk(io, data_dir, git_head) catch |err| {
Expand Down Expand Up @@ -1403,10 +1394,7 @@ fn mainImpl() !void {
// Load trigrams as mmap (zero heap cost); then we can safely
// release file contents since mmap serves future searches.
if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .mmap = loaded };
explorer.mu.unlock();
explorer.adoptTrigramIndex(.{ .mmap = loaded });
}
release_contents_after_cache = true;
}
Expand Down Expand Up @@ -2108,20 +2096,22 @@ fn getDataDir(io: std.Io, allocator: std.mem.Allocator, abs_root: []const u8) ![

fn loadTrigramFromDiskIfPresent(io: std.Io, explorer: *Explorer, data_dir: []const u8, allocator: std.mem.Allocator) void {
explorer.mu.lockShared();
const already_loaded = explorer.trigram_index.fileCount() > 0;
const disk_backed = explorer.trigram_index != .heap;
const heap_files = explorer.trigram_index.fileCount();
const total_files = explorer.outlines.count();
explorer.mu.unlockShared();
if (already_loaded) return;
// Skip only when a disk index is already adopted or the heap index
// covers the whole project. A PARTIAL heap (snapshot freshness reindex
// touches a few changed files before this runs) must not block the
// load — adoptTrigramBase keeps those files as a masking overlay.
if (disk_backed or (heap_files > 0 and heap_files >= total_files)) return;

if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
defer explorer.mu.unlock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .mmap = loaded };
} else if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
defer explorer.mu.unlock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .heap = loaded };
explorer.adoptTrigramBase(loaded);
} else if (heap_files == 0) {
if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| {
explorer.adoptTrigramIndex(.{ .heap = loaded });
}
}
}

Expand Down Expand Up @@ -2498,10 +2488,7 @@ fn scanBg(io: std.Io, store: *Store, explorer: *Explorer, root: []const u8, allo
const current_count = @as(u32, @intCast(explorer.outlines.count()));
if (disk_hdr != null and current_count == disk_hdr.?.file_count) {
if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .mmap = loaded };
explorer.mu.unlock();
explorer.adoptTrigramIndex(.{ .mmap = loaded });
scan_done.store(true, .release);
mcp_server.setScanState(.ready);
if (shutdown.load(.acquire)) return;
Expand All @@ -2522,10 +2509,7 @@ fn scanBg(io: std.Io, store: *Store, explorer: *Explorer, root: []const u8, allo
return;
}
if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .heap = loaded };
explorer.mu.unlock();
explorer.adoptTrigramIndex(.{ .heap = loaded });
scan_done.store(true, .release);
mcp_server.setScanState(.ready);
if (shutdown.load(.acquire)) return;
Expand Down Expand Up @@ -2566,15 +2550,9 @@ fn scanBg(io: std.Io, store: *Store, explorer: *Explorer, root: []const u8, allo

// Compact: swap heap index for mmap — zero RSS, data lives in OS page cache.
if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .mmap = loaded };
explorer.mu.unlock();
explorer.adoptTrigramIndex(.{ .mmap = loaded });
} else if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .heap = loaded };
explorer.mu.unlock();
explorer.adoptTrigramIndex(.{ .heap = loaded });
}

scan_done.store(true, .release);
Expand Down
24 changes: 12 additions & 12 deletions src/mcp.zig
Original file line number Diff line number Diff line change
Expand Up @@ -217,23 +217,23 @@ fn getProjectDataDir(allocator: std.mem.Allocator, project_path: []const u8) ?[]

fn loadProjectTrigramFromDiskIfPresent(io: std.Io, explorer: *Explorer, project_path: []const u8, allocator: std.mem.Allocator) void {
explorer.mu.lockShared();
const already_loaded = explorer.trigram_index.fileCount() > 0;
const disk_backed = explorer.trigram_index != .heap;
const heap_files = explorer.trigram_index.fileCount();
const total_files = explorer.outlines.count();
explorer.mu.unlockShared();
if (already_loaded) return;
// Mirror main.zig's loadTrigramFromDiskIfPresent: a PARTIAL heap index
// (freshness reindex of changed files) must not block the disk load.
if (disk_backed or (heap_files > 0 and heap_files >= total_files)) return;

const data_dir = getProjectDataDir(allocator, project_path) orelse return;
defer allocator.free(data_dir);

if (idx.MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
defer explorer.mu.unlock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .mmap = loaded };
} else if (idx.TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| {
explorer.mu.lock();
defer explorer.mu.unlock();
explorer.trigram_index.deinit();
explorer.trigram_index = .{ .heap = loaded };
explorer.adoptTrigramBase(loaded);
} else if (heap_files == 0) {
if (idx.TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| {
explorer.adoptTrigramIndex(.{ .heap = loaded });
}
}
}

Expand Down Expand Up @@ -5176,7 +5176,7 @@ fn appendSearchProvenanceMeta(out: *std.ArrayList(u8), alloc: std.mem.Allocator,
out.appendSlice(alloc, ",") catch {};
appendJsonKeyUsize(out, alloc, "skip_trigram_files", skip);
out.appendSlice(alloc, ",") catch {};
appendJsonKeyUsize(out, alloc, "trigram_cap", 15_000);
appendJsonKeyUsize(out, alloc, "trigram_cap", watcher.trigramFileCap());
out.appendSlice(alloc, ",") catch {};
appendJsonKeyBool(out, alloc, "recall_complete", recall_complete);
out.append(alloc, '}') catch {};
Expand Down
88 changes: 88 additions & 0 deletions src/test_search.zig
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const Explorer = @import("explore.zig").Explorer;
const SearchResult = @import("explore.zig").SearchResult;
const WordIndex = @import("index.zig").WordIndex;
const TrigramIndex = @import("index.zig").TrigramIndex;
const MmapTrigramIndex = @import("index.zig").MmapTrigramIndex;
const SparseNgramIndex = @import("index.zig").SparseNgramIndex;
const explore = @import("explore.zig");
const Language = explore.Language;
Expand Down Expand Up @@ -2552,6 +2553,93 @@ test "search-cache: ranked cache is invalidated by indexFile" {
try testing.expect(saw_new);
}

test "skip-trigram: adoptTrigramIndex prunes covered files from the tier-3 scan set" {
var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY);
defer explorer.deinit();
// Outline-only indexing mirrors snapshot restore: files land in
// skip_trigram_files with no trigram coverage.
try explorer.indexFileSkipTrigram("src/cov.zig", "const a = 1; // ocelottok\n");
try explorer.indexFileSkipTrigram("src/uncov.zig", "const b = 2; // ocelottok\n");
try testing.expectEqual(@as(usize, 2), explorer.skipTrigramFileCount());

// Build a trigram index covering only ONE of the two files and adopt it
// (stands in for the mmap disk load all server modes perform).
var tri = TrigramIndex.init(testing.allocator);
try tri.indexFile("src/cov.zig", "const a = 1; // ocelottok\n");
explorer.adoptTrigramIndex(.{ .heap = tri });

// The covered file must leave the scan set; the uncovered one must stay.
try testing.expectEqual(@as(usize, 1), explorer.skipTrigramFileCount());
try testing.expect(explorer.skip_trigram_files.contains("src/uncov.zig"));
try testing.expect(!explorer.skip_trigram_files.contains("src/cov.zig"));

// Recall must be intact: tier 1 serves the covered file, tier 3 the rest.
const r = try explorer.searchContent("ocelottok", testing.allocator, 10);
defer freeSearchResults(r);
try testing.expectEqual(@as(usize, 2), r.len);
}

test "skip-trigram: adoptTrigramBase keeps freshness-reindexed files as a masking overlay" {
var tmp_dir = testing.tmpDir(.{});
defer tmp_dir.cleanup();
var path_buf: [std.fs.max_path_bytes]u8 = undefined;
const tmp_path_len = try tmp_dir.dir.realPathFile(io, ".", &path_buf);
const tmp_path = path_buf[0..tmp_path_len];

// On-disk index: stale content for changed.zig, current for stable.zig.
{
var seed = TrigramIndex.init(testing.allocator);
defer seed.deinit();
try seed.indexFile("src/changed.zig", "const old = 1; // staletok\n");
try seed.indexFile("src/stable.zig", "const s = 2; // stabletok\n");
try seed.writeToDisk(io, tmp_path, null);
}

var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY);
defer explorer.deinit();
// Mirror snapshot load: the unchanged file restores outline-only (skip
// set), the changed file gets a full freshness reindex into the heap
// trigram BEFORE the disk index is adopted.
try explorer.indexFileSkipTrigram("src/stable.zig", "const s = 2; // stabletok\n");
try explorer.indexFile("src/changed.zig", "const new = 1; // freshtok\n");

const base = MmapTrigramIndex.initFromDisk(io, tmp_path, testing.allocator) orelse
return error.MmapInitFailed;
explorer.adoptTrigramBase(base);

// Both files are now trigram-covered: the scan set must be empty.
try testing.expectEqual(@as(usize, 0), explorer.skipTrigramFileCount());

// The overlay's NEW content wins for the reindexed file...
const fresh = try explorer.searchContent("freshtok", testing.allocator, 10);
defer freeSearchResults(fresh);
try testing.expectEqual(@as(usize, 1), fresh.len);
try testing.expectEqualStrings("src/changed.zig", fresh[0].path);
// ...its stale base entry is masked...
const stale = try explorer.searchContent("staletok", testing.allocator, 10);
defer freeSearchResults(stale);
try testing.expectEqual(@as(usize, 0), stale.len);
// ...and base-only files still resolve through the adopted mmap.
const stable = try explorer.searchContent("stabletok", testing.allocator, 10);
defer freeSearchResults(stable);
try testing.expectEqual(@as(usize, 1), stable.len);
try testing.expectEqualStrings("src/stable.zig", stable[0].path);
}

test "skip-trigram: rebuildTrigrams prunes the files it just covered" {
var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY);
defer explorer.deinit();
try explorer.indexFileSkipTrigram("src/rt.zig", "const a = 1; // marmottok\n");
try testing.expectEqual(@as(usize, 1), explorer.skipTrigramFileCount());

try explorer.rebuildTrigrams();
try testing.expectEqual(@as(usize, 0), explorer.skipTrigramFileCount());

const r = try explorer.searchContent("marmottok", testing.allocator, 10);
defer freeSearchResults(r);
try testing.expectEqual(@as(usize, 1), r.len);
}

// ── warmup: queries.log replay ───────────────────────────────────────────────

const warmup = @import("warmup.zig");
Expand Down
Loading
Loading