From a8a379742fe1ecb1db0c55d9371a05ce334c9b28 Mon Sep 17 00:00:00 2001 From: justrach <54503978+justrach@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:50:16 +0800 Subject: [PATCH] perf(search): reconcile skip_trigram_files with disk-loaded trigram indexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Snapshot restore parks EVERY file in skip_trigram_files (it cannot know what a disk trigram index covers), and nothing ever removed entries when the index was later mmap-loaded. Worse, loadTrigramFromDiskIfPresent early-returned whenever trigram fileCount() > 0 — and the snapshot freshness pass reindexes changed files into the heap trigram BEFORE that check runs, so one dirty file blocked the disk load entirely. Net effect on the dominant serve/mcp/cli-daemon startup path: tier 3 content-scanned the ENTIRE project on every fall-through query, with recall_complete=false. Measured live on this repo: 613/616 files in the scan set, 9.2ms negative searches -> 0 files, 0.5-0.9ms, recall_complete =true. This matches the production search tail (p90 30ms). - adoptTrigramIndex: single funnel for trigram replacement — swaps, bumps the search generation, and prunes covered files from the skip set - adoptTrigramBase: mmap disk load keeps freshness-reindexed files as a masking overlay (their newer content wins; stale base entries masked) - rebuildTrigrams: prunes what it covers + bumps the generation (it did neither) - both disk-load gates now skip only when already disk-backed or the heap covers the whole project - watcher: trigram cap env override CODEDB_TRIGRAM_CAP (measured on a 20k-file corpus: uncapped = zero-hit queries 7.1->1.4ms for +110MB peak RSS); provenance meta reports the effective cap Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- src/explore.zig | 69 +++++++++++++++++++++++++++++++++++ src/index.zig | 2 +- src/main.zig | 64 +++++++++++---------------------- src/mcp.zig | 24 ++++++------- src/test_search.zig | 88 +++++++++++++++++++++++++++++++++++++++++++++ src/watcher.zig | 15 ++++++-- 6 files changed, 204 insertions(+), 58 deletions(-) diff --git a/src/explore.zig b/src/explore.zig index ebbe4d7..9816922 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -2009,11 +2009,78 @@ pub const Explorer = struct { fn bumpSearchGen(self: *Explorer) void { _ = self.search_gen.fetchAdd(1, .release); } + /// Remove every skip_trigram_files entry the current trigram index + /// covers. Caller must hold the EXCLUSIVE lock. Snapshot restore parks + /// every file in the skip set (#507/#537 — it cannot know what a disk + /// trigram index covers), so without this reconciliation tier 3 + /// content-scans the ENTIRE project on each fall-through query even + /// though the loaded index already answers for those files. Keys are + /// borrowed from `outlines`, so removal never frees. + fn pruneSkipTrigramLocked(self: *Explorer) void { + var to_remove: std.ArrayList([]const u8) = .empty; + defer to_remove.deinit(self.allocator); + var it = self.skip_trigram_files.keyIterator(); + while (it.next()) |k| { + if (self.trigram_index.containsFile(k.*)) to_remove.append(self.allocator, k.*) catch break; + } + for (to_remove.items) |k| _ = self.skip_trigram_files.remove(k); + } + + /// Swap in a trigram index (disk mmap load / post-scan build) and + /// reconcile the skip set with what it covers. Every external + /// trigram_index replacement must go through here — a bare swap leaves + /// stale skip_trigram_files entries (see pruneSkipTrigramLocked). + pub fn adoptTrigramIndex(self: *Explorer, new_index: AnyTrigramIndex) void { + self.mu.lock(); + defer self.mu.unlock(); + self.bumpSearchGen(); + self.trigram_index.deinit(); + self.trigram_index = new_index; + self.pruneSkipTrigramLocked(); + } + + /// Adopt a disk-loaded mmap trigram as the BASE index, keeping any files + /// already trigram-indexed in the current heap index (snapshot freshness + /// reindex touches changed files BEFORE the disk load runs) as a masked + /// overlay so their newer content wins. Replaces the `fileCount() > 0` + /// early-return that let a single reindexed file block the disk load + /// entirely — leaving tier 3 to content-scan the whole project on every + /// fall-through query. + pub fn adoptTrigramBase(self: *Explorer, base: idx.MmapTrigramIndex) void { + self.mu.lock(); + defer self.mu.unlock(); + self.bumpSearchGen(); + switch (self.trigram_index) { + .heap => |heap_copy| { + if (heap_copy.fileCount() == 0) { + var old = heap_copy; + old.deinit(); + self.trigram_index = .{ .mmap = base }; + } else { + self.trigram_index = .{ .mmap_overlay = .{ + .base = base, + .overlay = heap_copy, + .masked = std.StringHashMap(void).init(self.allocator), + } }; + var it = self.trigram_index.mmap_overlay.overlay.file_trigrams.keyIterator(); + while (it.next()) |k| self.trigram_index.mmap_overlay.mask(k.*); + } + }, + .mmap, .mmap_overlay => { + // Already disk-backed — nothing to gain from a second base. + var dupe_base = base; + dupe_base.deinit(); + }, + } + self.pruneSkipTrigramLocked(); + } + /// Rebuild trigram index from the stored file contents. /// Used after a cache hit to populate trigrams when they were skipped during the fast scan. pub fn rebuildTrigrams(self: *Explorer) !void { self.mu.lock(); defer self.mu.unlock(); + self.bumpSearchGen(); var iter = self.contents.iterator(); while (iter.next()) |entry| { // Skip large files to prevent OOM on large repos @@ -2021,10 +2088,12 @@ pub const Explorer = struct { self.trigram_index.indexFile(entry.key_ptr.*, entry.value_ptr.*) catch |err| switch (err) { error.OutOfMemory => { std.log.warn("trigram OOM, skipping remaining files", .{}); + self.pruneSkipTrigramLocked(); return; }, }; } + self.pruneSkipTrigramLocked(); } /// Rebuild the inverted word index from cached contents when complete, or diff --git a/src/index.zig b/src/index.zig index 7c17870..330dfdc 100644 --- a/src/index.zig +++ b/src/index.zig @@ -2443,7 +2443,7 @@ pub const AnyTrigramIndex = union(enum) { masked: std.StringHashMap(void), masked_in_base: u32 = 0, - fn mask(self: *MmapOverlay, path: []const u8) void { + pub fn mask(self: *MmapOverlay, path: []const u8) void { if (self.masked.contains(path)) return; const key = self.overlay.allocator.dupe(u8, path) catch return; self.masked.put(key, {}) catch { diff --git a/src/main.zig b/src/main.zig index c6c21a3..85b00d0 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1335,10 +1335,7 @@ fn mainImpl() !void { tri.deinit(); std.heap.c_allocator.destroy(tri); if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .mmap = loaded }; - explorer.mu.unlock(); + explorer.adoptTrigramIndex(.{ .mmap = loaded }); } } } else { @@ -1359,15 +1356,9 @@ fn mainImpl() !void { const current_count = @as(u32, @intCast(explorer.outlines.count())); if (disk_hdr != null and current_count == disk_hdr.?.file_count) { if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .mmap = loaded }; - explorer.mu.unlock(); + explorer.adoptTrigramIndex(.{ .mmap = loaded }); } else if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .heap = loaded }; - explorer.mu.unlock(); + explorer.adoptTrigramIndex(.{ .heap = loaded }); } else { explorer.rebuildTrigrams() catch {}; explorer.trigram_index.writeToDisk(io, data_dir, git_head) catch |err| { @@ -1403,10 +1394,7 @@ fn mainImpl() !void { // Load trigrams as mmap (zero heap cost); then we can safely // release file contents since mmap serves future searches. if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .mmap = loaded }; - explorer.mu.unlock(); + explorer.adoptTrigramIndex(.{ .mmap = loaded }); } release_contents_after_cache = true; } @@ -2108,20 +2096,22 @@ fn getDataDir(io: std.Io, allocator: std.mem.Allocator, abs_root: []const u8) ![ fn loadTrigramFromDiskIfPresent(io: std.Io, explorer: *Explorer, data_dir: []const u8, allocator: std.mem.Allocator) void { explorer.mu.lockShared(); - const already_loaded = explorer.trigram_index.fileCount() > 0; + const disk_backed = explorer.trigram_index != .heap; + const heap_files = explorer.trigram_index.fileCount(); + const total_files = explorer.outlines.count(); explorer.mu.unlockShared(); - if (already_loaded) return; + // Skip only when a disk index is already adopted or the heap index + // covers the whole project. A PARTIAL heap (snapshot freshness reindex + // touches a few changed files before this runs) must not block the + // load — adoptTrigramBase keeps those files as a masking overlay. + if (disk_backed or (heap_files > 0 and heap_files >= total_files)) return; if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - defer explorer.mu.unlock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .mmap = loaded }; - } else if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - defer explorer.mu.unlock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .heap = loaded }; + explorer.adoptTrigramBase(loaded); + } else if (heap_files == 0) { + if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| { + explorer.adoptTrigramIndex(.{ .heap = loaded }); + } } } @@ -2498,10 +2488,7 @@ fn scanBg(io: std.Io, store: *Store, explorer: *Explorer, root: []const u8, allo const current_count = @as(u32, @intCast(explorer.outlines.count())); if (disk_hdr != null and current_count == disk_hdr.?.file_count) { if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .mmap = loaded }; - explorer.mu.unlock(); + explorer.adoptTrigramIndex(.{ .mmap = loaded }); scan_done.store(true, .release); mcp_server.setScanState(.ready); if (shutdown.load(.acquire)) return; @@ -2522,10 +2509,7 @@ fn scanBg(io: std.Io, store: *Store, explorer: *Explorer, root: []const u8, allo return; } if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .heap = loaded }; - explorer.mu.unlock(); + explorer.adoptTrigramIndex(.{ .heap = loaded }); scan_done.store(true, .release); mcp_server.setScanState(.ready); if (shutdown.load(.acquire)) return; @@ -2566,15 +2550,9 @@ fn scanBg(io: std.Io, store: *Store, explorer: *Explorer, root: []const u8, allo // Compact: swap heap index for mmap — zero RSS, data lives in OS page cache. if (MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .mmap = loaded }; - explorer.mu.unlock(); + explorer.adoptTrigramIndex(.{ .mmap = loaded }); } else if (TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .heap = loaded }; - explorer.mu.unlock(); + explorer.adoptTrigramIndex(.{ .heap = loaded }); } scan_done.store(true, .release); diff --git a/src/mcp.zig b/src/mcp.zig index 7d9fab7..8a9c81b 100644 --- a/src/mcp.zig +++ b/src/mcp.zig @@ -217,23 +217,23 @@ fn getProjectDataDir(allocator: std.mem.Allocator, project_path: []const u8) ?[] fn loadProjectTrigramFromDiskIfPresent(io: std.Io, explorer: *Explorer, project_path: []const u8, allocator: std.mem.Allocator) void { explorer.mu.lockShared(); - const already_loaded = explorer.trigram_index.fileCount() > 0; + const disk_backed = explorer.trigram_index != .heap; + const heap_files = explorer.trigram_index.fileCount(); + const total_files = explorer.outlines.count(); explorer.mu.unlockShared(); - if (already_loaded) return; + // Mirror main.zig's loadTrigramFromDiskIfPresent: a PARTIAL heap index + // (freshness reindex of changed files) must not block the disk load. + if (disk_backed or (heap_files > 0 and heap_files >= total_files)) return; const data_dir = getProjectDataDir(allocator, project_path) orelse return; defer allocator.free(data_dir); if (idx.MmapTrigramIndex.initFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - defer explorer.mu.unlock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .mmap = loaded }; - } else if (idx.TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| { - explorer.mu.lock(); - defer explorer.mu.unlock(); - explorer.trigram_index.deinit(); - explorer.trigram_index = .{ .heap = loaded }; + explorer.adoptTrigramBase(loaded); + } else if (heap_files == 0) { + if (idx.TrigramIndex.readFromDisk(io, data_dir, allocator)) |loaded| { + explorer.adoptTrigramIndex(.{ .heap = loaded }); + } } } @@ -5176,7 +5176,7 @@ fn appendSearchProvenanceMeta(out: *std.ArrayList(u8), alloc: std.mem.Allocator, out.appendSlice(alloc, ",") catch {}; appendJsonKeyUsize(out, alloc, "skip_trigram_files", skip); out.appendSlice(alloc, ",") catch {}; - appendJsonKeyUsize(out, alloc, "trigram_cap", 15_000); + appendJsonKeyUsize(out, alloc, "trigram_cap", watcher.trigramFileCap()); out.appendSlice(alloc, ",") catch {}; appendJsonKeyBool(out, alloc, "recall_complete", recall_complete); out.append(alloc, '}') catch {}; diff --git a/src/test_search.zig b/src/test_search.zig index 833894e..3405e64 100644 --- a/src/test_search.zig +++ b/src/test_search.zig @@ -7,6 +7,7 @@ const Explorer = @import("explore.zig").Explorer; const SearchResult = @import("explore.zig").SearchResult; const WordIndex = @import("index.zig").WordIndex; const TrigramIndex = @import("index.zig").TrigramIndex; +const MmapTrigramIndex = @import("index.zig").MmapTrigramIndex; const SparseNgramIndex = @import("index.zig").SparseNgramIndex; const explore = @import("explore.zig"); const Language = explore.Language; @@ -2552,6 +2553,93 @@ test "search-cache: ranked cache is invalidated by indexFile" { try testing.expect(saw_new); } +test "skip-trigram: adoptTrigramIndex prunes covered files from the tier-3 scan set" { + var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); + defer explorer.deinit(); + // Outline-only indexing mirrors snapshot restore: files land in + // skip_trigram_files with no trigram coverage. + try explorer.indexFileSkipTrigram("src/cov.zig", "const a = 1; // ocelottok\n"); + try explorer.indexFileSkipTrigram("src/uncov.zig", "const b = 2; // ocelottok\n"); + try testing.expectEqual(@as(usize, 2), explorer.skipTrigramFileCount()); + + // Build a trigram index covering only ONE of the two files and adopt it + // (stands in for the mmap disk load all server modes perform). + var tri = TrigramIndex.init(testing.allocator); + try tri.indexFile("src/cov.zig", "const a = 1; // ocelottok\n"); + explorer.adoptTrigramIndex(.{ .heap = tri }); + + // The covered file must leave the scan set; the uncovered one must stay. + try testing.expectEqual(@as(usize, 1), explorer.skipTrigramFileCount()); + try testing.expect(explorer.skip_trigram_files.contains("src/uncov.zig")); + try testing.expect(!explorer.skip_trigram_files.contains("src/cov.zig")); + + // Recall must be intact: tier 1 serves the covered file, tier 3 the rest. + const r = try explorer.searchContent("ocelottok", testing.allocator, 10); + defer freeSearchResults(r); + try testing.expectEqual(@as(usize, 2), r.len); +} + +test "skip-trigram: adoptTrigramBase keeps freshness-reindexed files as a masking overlay" { + var tmp_dir = testing.tmpDir(.{}); + defer tmp_dir.cleanup(); + var path_buf: [std.fs.max_path_bytes]u8 = undefined; + const tmp_path_len = try tmp_dir.dir.realPathFile(io, ".", &path_buf); + const tmp_path = path_buf[0..tmp_path_len]; + + // On-disk index: stale content for changed.zig, current for stable.zig. + { + var seed = TrigramIndex.init(testing.allocator); + defer seed.deinit(); + try seed.indexFile("src/changed.zig", "const old = 1; // staletok\n"); + try seed.indexFile("src/stable.zig", "const s = 2; // stabletok\n"); + try seed.writeToDisk(io, tmp_path, null); + } + + var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); + defer explorer.deinit(); + // Mirror snapshot load: the unchanged file restores outline-only (skip + // set), the changed file gets a full freshness reindex into the heap + // trigram BEFORE the disk index is adopted. + try explorer.indexFileSkipTrigram("src/stable.zig", "const s = 2; // stabletok\n"); + try explorer.indexFile("src/changed.zig", "const new = 1; // freshtok\n"); + + const base = MmapTrigramIndex.initFromDisk(io, tmp_path, testing.allocator) orelse + return error.MmapInitFailed; + explorer.adoptTrigramBase(base); + + // Both files are now trigram-covered: the scan set must be empty. + try testing.expectEqual(@as(usize, 0), explorer.skipTrigramFileCount()); + + // The overlay's NEW content wins for the reindexed file... + const fresh = try explorer.searchContent("freshtok", testing.allocator, 10); + defer freeSearchResults(fresh); + try testing.expectEqual(@as(usize, 1), fresh.len); + try testing.expectEqualStrings("src/changed.zig", fresh[0].path); + // ...its stale base entry is masked... + const stale = try explorer.searchContent("staletok", testing.allocator, 10); + defer freeSearchResults(stale); + try testing.expectEqual(@as(usize, 0), stale.len); + // ...and base-only files still resolve through the adopted mmap. + const stable = try explorer.searchContent("stabletok", testing.allocator, 10); + defer freeSearchResults(stable); + try testing.expectEqual(@as(usize, 1), stable.len); + try testing.expectEqualStrings("src/stable.zig", stable[0].path); +} + +test "skip-trigram: rebuildTrigrams prunes the files it just covered" { + var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); + defer explorer.deinit(); + try explorer.indexFileSkipTrigram("src/rt.zig", "const a = 1; // marmottok\n"); + try testing.expectEqual(@as(usize, 1), explorer.skipTrigramFileCount()); + + try explorer.rebuildTrigrams(); + try testing.expectEqual(@as(usize, 0), explorer.skipTrigramFileCount()); + + const r = try explorer.searchContent("marmottok", testing.allocator, 10); + defer freeSearchResults(r); + try testing.expectEqual(@as(usize, 1), r.len); +} + // ── warmup: queries.log replay ─────────────────────────────────────────────── const warmup = @import("warmup.zig"); diff --git a/src/watcher.zig b/src/watcher.zig index 226da1e..c57102a 100644 --- a/src/watcher.zig +++ b/src/watcher.zig @@ -408,6 +408,17 @@ const FilteredWalker = struct { } }; +/// Files beyond this count are indexed without trigrams and land in +/// skip_trigram_files, which tier 3 of searchContent linearly content-scans +/// on every query that falls through the earlier tiers — on a corpus well +/// past the cap that scan dominates zero-hit query latency. The cap bounds +/// trigram-index RSS; CODEDB_TRIGRAM_CAP overrides it for corpora where the +/// memory trade is worth it. +pub fn trigramFileCap() usize { + const raw = cio.posixGetenv("CODEDB_TRIGRAM_CAP") orelse return 15_000; + return std.fmt.parseInt(usize, raw, 10) catch 15_000; +} + fn collectInitialScanEntries(io: std.Io, store: *Store, dir: std.Io.Dir, allocator: std.mem.Allocator, skip_trigram: bool) !std.ArrayList(InitialScanEntry) { var walker = try FilteredWalker.init(io, dir, allocator); defer walker.deinit(); @@ -418,7 +429,7 @@ fn collectInitialScanEntries(io: std.Io, store: *Store, dir: std.Io.Dir, allocat entries.deinit(allocator); } - const max_trigram_files: usize = 15_000; + const max_trigram_files = trigramFileCap(); var file_count: usize = 0; while (try walker.next()) |entry| { const stat = dir.statFile(io, entry.path, .{}) catch continue; @@ -1022,7 +1033,7 @@ pub fn incrementalLoop(io: std.Io, store: *Store, explorer: *Explorer, queue: *E defer dir.close(io); var walker = FilteredWalker.init(io, dir, tmp) catch continue; defer walker.deinit(); - const max_trigram_files: usize = 15_000; + const max_trigram_files = trigramFileCap(); var file_count: usize = 0; while (walker.next() catch null) |entry| { const stat = dir.statFile(io, entry.path, .{}) catch continue;