From f97286eef86741e189ccf5ddf198ce1befd7b4a5 Mon Sep 17 00:00:00 2001
From: justrach <54503978+justrach@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:04:38 +0800
Subject: [PATCH 1/4] =?UTF-8?q?perf(search):=20cut=20searchContent=20hot?=
 =?UTF-8?q?=20path=20~2-4x=20=E2=80=94=20line-offset=20cache,=20doc=5Fid?=
 =?UTF-8?q?=20grouping,=20packed-key=20sorts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tier 0: per-file newline-offset cache (LineOffsetCache) replaces per-query
line rescans; grouping keys postings by doc_id with a contiguous-run fast
path (one map op per unique file, not per hit); candidate sort packs the
(is_doc, defines, count, first_seen) comparator into one u64 key over
(key, idx) pairs. renderPlainSearch precomputes path priors once per file
instead of twice per comparison. Rerank memoizes per-path facts across
consecutive results and switches both final sorts block -> pdq (total-order
comparators, identical permutation). Tier 1 hits_per_file gets the same
contiguity fast path.

Benchmark harness gains opt-in CODEDB_BENCH_CALLOC=1 (production c_allocator
instead of DebugAllocator) and CODEDB_BENCH_BREAKDOWN=1 (per-tier ns).

codedb repo, 300 iters, c_allocator: middleware 14us, database 15us,
error 28us (was 88/65/107us pre-round-1). 814/814 tests, e2e MCP 20/20.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 src/benchmark.zig |   8 +-
 src/explore.zig   | 525 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 434 insertions(+), 99 deletions(-)

diff --git a/src/benchmark.zig b/src/benchmark.zig
index 1dda8631..2729dcd9 100644
--- a/src/benchmark.zig
+++ b/src/benchmark.zig
@@ -88,6 +88,12 @@ fn benchSearch(explorer: *Explorer, query: []const u8, n: usize, alloc: std.mem.
         for (r) |e| alloc.free(e.line_text);
         alloc.free(r);
     }
+    if (cio.posixGetenv("CODEDB_BENCH_BREAKDOWN") != null) {
+        const b = explorer.last_search_breakdown;
+        var buf: [512]u8 = undefined;
+        const msg = std.fmt.bufPrint(&buf, "  breakdown[{s}]: t0={d}ns t05={d}ns t1={d}ns t2={d}ns rerank={d}ns tier_reached={d} cands={d} results={d}\n", .{ query, b.tier0_ns, b.tier05_ns, b.tier1_ns, b.tier2_ns, b.rerank_ns, b.tier_reached, b.candidate_count, b.result_count }) catch "";
+        cio.File.stderr().writeAll(msg) catch {};
+    }
     return .{ .name = query, .kind = "search", .hits = hits, .avg_ns = total / n };
 }
 
@@ -274,7 +280,7 @@ pub fn main(init: std.process.Init.Minimal) !void {
     cio.setProcessArgs(init.args.vector);
     var gpa: std.heap.DebugAllocator(.{}) = .init;
     defer _ = gpa.deinit();
-    const alloc = gpa.allocator();
+    const alloc = if (cio.posixGetenv("CODEDB_BENCH_CALLOC") != null) std.heap.c_allocator else gpa.allocator();
 
     var threaded: std.Io.Threaded = .init(alloc, .{});
     defer threaded.deinit();
diff --git a/src/explore.zig b/src/explore.zig
index 30a20b82..9046bbdc 100644
--- a/src/explore.zig
+++ b/src/explore.zig
@@ -743,10 +743,142 @@ const LexFreqPenalty = struct {
         return 1.0 - self.amp * norm;
     }
 };
+
+/// Per-file newline-offset tables so Tier 0's line-number → line-text lookups
+/// skip rescanning file bytes on every query. Entries self-validate against
+/// the content slice (ptr+len) they were built from and are invalidated on
+/// reindex/remove. Guarded by its own mutex because searchContent runs under
+/// the Explorer's SHARED lock — concurrent readers may build entries.
+const LineOffsetCache = struct {
+    const Entry = struct {
+        content_ptr: usize,
+        content_len: usize,
+        offsets: []u32,
+    };
+    pub const Span = struct {
+        line: u32,
+        start: usize,
+        end: usize,
+    };
+
+    map: std.StringHashMap(Entry),
+    mu: cio.Mutex = .{},
+    total_bytes: usize = 0,
+
+    const MAX_BYTES: usize = 16 * 1024 * 1024;
+
+    fn init(allocator: std.mem.Allocator) LineOffsetCache {
+        return .{ .map = std.StringHashMap(Entry).init(allocator) };
+    }
+
+    fn deinit(self: *LineOffsetCache) void {
+        var iter = self.map.iterator();
+        while (iter.next()) |e| {
+            self.map.allocator.free(e.key_ptr.*);
+            self.map.allocator.free(e.value_ptr.offsets);
+        }
+        self.map.deinit();
+    }
+
+    fn clearLocked(self: *LineOffsetCache) void {
+        var iter = self.map.iterator();
+        while (iter.next()) |e| {
+            self.map.allocator.free(e.key_ptr.*);
+            self.map.allocator.free(e.value_ptr.offsets);
+        }
+        self.map.clearRetainingCapacity();
+        self.total_bytes = 0;
+    }
+
+    fn clear(self: *LineOffsetCache) void {
+        self.mu.lock();
+        defer self.mu.unlock();
+        self.clearLocked();
+    }
+
+    fn invalidate(self: *LineOffsetCache, path: []const u8) void {
+        self.mu.lock();
+        defer self.mu.unlock();
+        if (self.map.fetchRemove(path)) |kv| {
+            self.total_bytes -= kv.value.offsets.len * @sizeOf(u32);
+            self.map.allocator.free(kv.key);
+            self.map.allocator.free(kv.value.offsets);
+        }
+    }
+
+    fn buildOffsets(allocator: std.mem.Allocator, content: []const u8) ?[]u32 {
+        var offsets: std.ArrayList(u32) = .empty;
+        offsets.ensureTotalCapacity(allocator, @max(16, content.len / 32)) catch return null;
+        offsets.appendAssumeCapacity(0);
+        var pos: usize = 0;
+        while (std.mem.indexOfScalarPos(u8, content, pos, '\n')) |nl| {
+            pos = nl + 1;
+            offsets.append(allocator, @intCast(pos)) catch {
+                offsets.deinit(allocator);
+                return null;
+            };
+        }
+        return offsets.toOwnedSlice(allocator) catch {
+            offsets.deinit(allocator);
+            return null;
+        };
+    }
+
+    /// Resolve ascending 1-based `target_lines` to byte spans in `content`,
+    /// building (and caching) the offset table for `path` on first touch.
+    /// Span semantics match std.mem.splitScalar(content, '\n'): a line ends
+    /// before its '\n'; the final line ends at content.len. Line 0 and lines
+    /// past the last line are skipped, and output stops at spans.len. Returns
+    /// the span count, or null on OOM so the caller can take a fallback path.
+    fn lineSpans(self: *LineOffsetCache, path: []const u8, content: []const u8, target_lines: []const u32, spans: []Span) ?usize {
+        self.mu.lock();
+        defer self.mu.unlock();
+        var offsets: []const u32 = undefined;
+        if (self.map.getPtr(path)) |e| {
+            if (e.content_ptr == @intFromPtr(content.ptr) and e.content_len == content.len) {
+                offsets = e.offsets;
+            } else {
+                const fresh = buildOffsets(self.map.allocator, content) orelse return null;
+                self.total_bytes -= e.offsets.len * @sizeOf(u32);
+                self.map.allocator.free(e.offsets);
+                e.* = .{ .content_ptr = @intFromPtr(content.ptr), .content_len = content.len, .offsets = fresh };
+                self.total_bytes += fresh.len * @sizeOf(u32);
+                offsets = fresh;
+            }
+        } else {
+            const fresh = buildOffsets(self.map.allocator, content) orelse return null;
+            const key = self.map.allocator.dupe(u8, path) catch {
+                self.map.allocator.free(fresh);
+                return null;
+            };
+            self.map.put(key, .{ .content_ptr = @intFromPtr(content.ptr), .content_len = content.len, .offsets = fresh }) catch {
+                self.map.allocator.free(fresh);
+                self.map.allocator.free(key);
+                return null;
+            };
+            self.total_bytes += fresh.len * @sizeOf(u32);
+            offsets = fresh;
+        }
+
+        var n: usize = 0;
+        for (target_lines) |ln| {
+            if (n >= spans.len) break;
+            if (ln == 0 or ln > offsets.len) continue;
+            const start: usize = offsets[ln - 1];
+            const end: usize = if (ln < offsets.len) offsets[ln] - 1 else content.len;
+            spans[n] = .{ .line = ln, .start = start, .end = end };
+            n += 1;
+        }
+
+        if (self.total_bytes > MAX_BYTES) self.clearLocked();
+        return n;
+    }
+};
 pub const Explorer = struct {
     outlines: std.StringHashMap(FileOutline),
     dep_graph: DependencyGraph,
     contents: ContentCache,
+    line_offsets: LineOffsetCache,
     symbol_index: std.StringHashMap(std.ArrayList(SymbolLocation)),
     /// False after a snapshot fast-load until ensureSymbolIndex runs (#564).
     symbol_index_complete: bool,
@@ -832,6 +964,7 @@ pub const Explorer = struct {
             .outlines = std.StringHashMap(FileOutline).init(allocator),
             .dep_graph = DependencyGraph.init(allocator),
             .contents = try ContentCache.initAlloc(allocator, content_cache_capacity),
+            .line_offsets = LineOffsetCache.init(allocator),
             .symbol_index = std.StringHashMap(std.ArrayList(SymbolLocation)).init(allocator),
             .symbol_index_complete = true,
             .word_index = WordIndex.init(allocator),
@@ -859,6 +992,7 @@ pub const Explorer = struct {
         self.symbol_index.deinit();
 
         self.contents.deinit();
+        self.line_offsets.deinit();
         if (self.call_centrality) |*c| c.deinit();
         if (self.call_graph) |*cg| cg.deinit(self.allocator);
         if (self.co_change) |*cc| git.freeCoChange(cc, self.allocator);
@@ -911,6 +1045,7 @@ pub const Explorer = struct {
         self.mu.lock();
         defer self.mu.unlock();
         self.contents.clear();
+        self.line_offsets.clear();
     }
 
     pub fn releaseSecondaryIndexes(self: *Explorer) void {
@@ -1028,6 +1163,7 @@ pub const Explorer = struct {
         // Last fallible step: put frees the prior cache value in place, so it
         // must run only once nothing after it can still need prior_content.
         try self.contents.put(stable_path, content);
+        self.line_offsets.invalidate(stable_path);
 
         outline_gop.value_ptr.* = persistent_outline;
         if (prior_outline) |*old_outline| old_outline.deinit();
@@ -1627,6 +1763,7 @@ pub const Explorer = struct {
         self.removeSymbolIndexFor(path);
         _ = self.skip_trigram_files.remove(path);
         self.contents.remove(path);
+        self.line_offsets.invalidate(path);
         self.word_index.removeFile(path);
         self.trigram_index.removeFile(path);
 
@@ -2520,77 +2657,140 @@ pub const Explorer = struct {
         if (word_hits.len > 0) {
             const Tier0File = struct {
                 path: []const u8,
+                doc_id: u32,
                 count: u32,
                 first_seen: usize,
+                // One past the ordinal of this file's last hit. Together with
+                // first_seen it bounds the file's posting run, so per-file
+                // target-line collection slices word_hits[first_seen..hits_end]
+                // instead of filtering the whole hit list per file. The
+                // doc_id filter stays, so the bounds are correct even if a
+                // file's hits were ever non-contiguous.
+                hits_end: usize,
                 is_doc: bool,
                 defines: bool,
             };
 
-            var tier0_files_by_path = std.StringHashMap(Tier0File).init(allocator);
-            defer tier0_files_by_path.deinit();
-
+            // Keyed by doc_id, not path: with the contiguity fast path below
+            // the map sees one getOrPut per UNIQUE file, and a u32 hash is
+            // several times cheaper than re-hashing a ~40-byte path string.
+            // hitPath only runs on first sight of a doc_id. Invalid postings
+            // (hitPath == "") stay in the map as empty-path tombstones and
+            // are skipped when the candidate list is built.
+            var tier0_files_by_doc = std.AutoHashMap(u32, Tier0File).init(allocator);
+            defer tier0_files_by_doc.deinit();
+            // Pre-size by hit count (an upper bound on unique files, capped at
+            // 1024) so high-frequency words don't pay a rehash cascade.
+            tier0_files_by_doc.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {};
+
+            // Postings for one file are appended contiguously (indexFile
+            // processes whole files), so consecutive hits almost always share
+            // a doc_id — the cached-entry fast path turns one hash getOrPut
+            // PER HIT (~30µs on 1800-hit words) into one per unique file.
+            // last_entry is only dereferenced immediately after it was
+            // (re)set with no intervening map mutation, so it cannot dangle
+            // across a rehash.
+            var last_doc_id: u32 = 0;
+            var last_entry: ?*Tier0File = null;
             for (word_hits, 0..) |hit, ordinal| {
-                const hit_path = self.word_index.hitPath(hit);
-                if (hit_path.len == 0) continue;
-                const gop = tier0_files_by_path.getOrPut(hit_path) catch continue;
+                if (last_entry) |entry| {
+                    if (hit.doc_id == last_doc_id) {
+                        entry.count +|= 1;
+                        entry.hits_end = ordinal + 1;
+                        continue;
+                    }
+                }
+                const gop = tier0_files_by_doc.getOrPut(hit.doc_id) catch continue;
                 if (!gop.found_existing) {
-                    const is_doc = isDocLanguage(detectLanguage(hit_path));
-                    const defines = !is_doc and self.fileDefinesSymbol(hit_path, query);
+                    const hit_path = self.word_index.hitPath(hit);
+                    const is_doc = hit_path.len > 0 and isDocLanguage(detectLanguage(hit_path));
+                    const defines = hit_path.len > 0 and !is_doc and self.fileDefinesSymbol(hit_path, query);
                     gop.value_ptr.* = .{
                         .path = hit_path,
+                        .doc_id = hit.doc_id,
                         .count = 0,
                         .first_seen = ordinal,
+                        .hits_end = ordinal + 1,
                         .is_doc = is_doc,
                         .defines = defines,
                     };
                 }
                 gop.value_ptr.count +|= 1;
+                gop.value_ptr.hits_end = ordinal + 1;
+                last_doc_id = hit.doc_id;
+                last_entry = gop.value_ptr;
             }
-
             var tier0_files: std.ArrayList(Tier0File) = .empty;
             defer tier0_files.deinit(allocator);
-            try tier0_files.ensureTotalCapacity(allocator, tier0_files_by_path.count());
-            var tier0_iter = tier0_files_by_path.valueIterator();
+            try tier0_files.ensureTotalCapacity(allocator, tier0_files_by_doc.count());
+            var tier0_iter = tier0_files_by_doc.valueIterator();
             while (tier0_iter.next()) |stats| {
+                if (stats.path.len == 0) continue;
                 tier0_files.appendAssumeCapacity(stats.*);
             }
 
-            if (tier0_files.items.len > 1) {
-                std.sort.block(Tier0File, tier0_files.items, {}, struct {
-                    pub fn lessThan(_: void, a: Tier0File, b: Tier0File) bool {
-                        if (a.is_doc != b.is_doc) return !a.is_doc;
-                        if (a.defines != b.defines) return a.defines;
-                        if (a.count != b.count) return a.count > b.count;
-                        if (a.first_seen != b.first_seen) return a.first_seen < b.first_seen;
-                        return std.mem.lessThan(u8, a.path, b.path);
+            // Sort 16-byte (u64 key + u32 idx, padded) pairs, not the 48-byte structs.
+            // The old comparator (is_doc asc, defines desc, count desc,
+            // first_seen asc, path asc) packs into one u64: first_seen is the
+            // ordinal of a file's first hit, unique per file, so the path
+            // tiebreak was unreachable. Exact while count < 2^30 and ordinals
+            // fit in 32 bits; larger counts saturate and tie on first_seen.
+            const Tier0Order = struct { key: u64, idx: u32 };
+            var tier0_order: std.ArrayList(Tier0Order) = .empty;
+            defer tier0_order.deinit(allocator);
+            try tier0_order.ensureTotalCapacity(allocator, tier0_files.items.len);
+            for (tier0_files.items, 0..) |stats, i| {
+                const cnt: u64 = @min(stats.count, (1 << 30) - 1);
+                const key = (@as(u64, @intFromBool(stats.is_doc)) << 63) |
+                    (@as(u64, @intFromBool(!stats.defines)) << 62) |
+                    ((((1 << 30) - 1) - cnt) << 32) |
+                    @as(u64, @as(u32, @truncate(stats.first_seen)));
+                tier0_order.appendAssumeCapacity(.{ .key = key, .idx = @intCast(i) });
+            }
+            if (tier0_order.items.len > 1) {
+                std.sort.pdq(Tier0Order, tier0_order.items, {}, struct {
+                    pub fn lessThan(_: void, a: Tier0Order, b: Tier0Order) bool {
+                        return a.key < b.key;
                     }
                 }.lessThan);
             }
-
             const tier0_per_file_cap: usize = if (tier0_files.items.len <= 1) max_results else @max(1, max_results / 5);
             var tier0_exact_capacity: usize = 0;
-            for (tier0_files.items) |stats| {
-                tier0_exact_capacity += @min(@as(usize, stats.count), tier0_per_file_cap);
+            for (tier0_order.items) |ord| {
+                tier0_exact_capacity += @min(@as(usize, tier0_files.items[ord.idx].count), tier0_per_file_cap);
                 if (tier0_exact_capacity >= max_results) break;
             }
             const use_line_hits = tier0_exact_capacity >= max_results and tier0_per_file_cap <= 256;
-            for (tier0_files.items) |stats| {
+            for (tier0_order.items) |ord| {
+                const stats = tier0_files.items[ord.idx];
                 if (result_list.items.len >= max_results) break;
                 const ref = self.readContentForSearch(stats.path, allocator) orelse continue;
                 defer ref.deinit();
                 if (use_line_hits) {
                     var target_lines: [256]u32 = undefined;
                     var target_count: usize = 0;
-                    for (word_hits) |hit| {
+                    for (word_hits[stats.first_seen..stats.hits_end]) |hit| {
                         if (target_count >= tier0_per_file_cap) break;
-                        const hit_path = self.word_index.hitPath(hit);
-                        if (!std.mem.eql(u8, hit_path, stats.path)) continue;
+                        if (hit.doc_id != stats.doc_id) continue;
                         if (target_count == 0 or target_lines[target_count - 1] != hit.line_num) {
                             target_lines[target_count] = hit.line_num;
                             target_count += 1;
                         }
                     }
-                    try appendTargetLineHits(stats.path, ref.data, allocator, target_lines[0..target_count], max_results, &result_list);
+                    var spans: [256]LineOffsetCache.Span = undefined;
+                    if (self.line_offsets.lineSpans(stats.path, ref.data, target_lines[0..target_count], &spans)) |n_spans| {
+                        result_list.ensureUnusedCapacity(allocator, @min(n_spans, max_results - result_list.items.len)) catch {};
+                        for (spans[0..n_spans]) |sp| {
+                            if (result_list.items.len >= max_results) break;
+                            const line_text = try allocator.dupe(u8, ref.data[sp.start..sp.end]);
+                            errdefer allocator.free(line_text);
+                            const path_copy = try allocator.dupe(u8, stats.path);
+                            errdefer allocator.free(path_copy);
+                            try result_list.append(allocator, .{ .path = path_copy, .line_num = sp.line, .line_text = line_text });
+                        }
+                    } else {
+                        try appendTargetLineHits(stats.path, ref.data, allocator, target_lines[0..target_count], max_results, &result_list);
+                    }
                     if (result_list.items.len < max_results) searched.put(stats.path, {}) catch {};
                 } else {
                     searched.put(stats.path, {}) catch {};
@@ -2659,12 +2859,28 @@ pub const Explorer = struct {
                 // file behind unrelated short files when max_per_file was 1.
                 var hits_per_file = std.StringHashMap(u32).init(allocator);
                 defer hits_per_file.deinit();
+                hits_per_file.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {};
+                // Same contiguous-posting fast path as Tier 0's grouping:
+                // consecutive hits share a doc_id, so the per-hit hitPath +
+                // string getOrPut collapses to once per unique file. The
+                // cached pointer is only dereferenced immediately after being
+                // (re)set, so it cannot dangle across a rehash.
+                var hpf_last_doc: u32 = 0;
+                var hpf_last: ?*u32 = null;
                 for (word_hits) |hit| {
+                    if (hpf_last) |cnt| {
+                        if (hit.doc_id == hpf_last_doc) {
+                            cnt.* += 1;
+                            continue;
+                        }
+                    }
                     const hp = self.word_index.hitPath(hit);
                     if (hp.len == 0) continue;
                     const gop_h = try hits_per_file.getOrPut(hp);
                     if (!gop_h.found_existing) gop_h.value_ptr.* = 0;
                     gop_h.value_ptr.* += 1;
+                    hpf_last_doc = hit.doc_id;
+                    hpf_last = gop_h.value_ptr;
                 }
                 const SortCtx = struct {
                     contents: *ContentCache,
@@ -2790,12 +3006,26 @@ pub const Explorer = struct {
             path: []const u8,
             count: u32,
             first_seen: usize,
+            // One past the ordinal of this file's last hit — bounds the
+            // file's posting run for the per-file target-line collection
+            // below (the doc_id filter stays, so the bounds are correct even
+            // if a file's hits were ever non-contiguous).
+            hits_end: usize,
             is_doc: bool,
         };
 
         var tier0_files_buf: [512]Tier0File = undefined;
         var tier0_files_len: usize = 0;
         for (word_hits, 0..) |hit, ordinal| {
+            // Postings for one file are appended contiguously (indexFile
+            // processes whole files), so consecutive hits almost always share
+            // a doc_id — checking the newest entry first turns the O(hits ×
+            // files) linear rescan below into one pass per unique file.
+            if (tier0_files_len > 0 and tier0_files_buf[tier0_files_len - 1].doc_id == hit.doc_id) {
+                tier0_files_buf[tier0_files_len - 1].count +|= 1;
+                tier0_files_buf[tier0_files_len - 1].hits_end = ordinal + 1;
+                continue;
+            }
             const hit_path = self.word_index.hitPath(hit);
             if (hit_path.len == 0) continue;
 
@@ -2808,6 +3038,7 @@ pub const Explorer = struct {
             }
             if (found_i) |i| {
                 tier0_files_buf[i].count +|= 1;
+                tier0_files_buf[i].hits_end = ordinal + 1;
             } else {
                 if (tier0_files_len >= tier0_files_buf.len) return false;
                 tier0_files_buf[tier0_files_len] = .{
@@ -2815,6 +3046,7 @@ pub const Explorer = struct {
                     .path = hit_path,
                     .count = 1,
                     .first_seen = ordinal,
+                    .hits_end = ordinal + 1,
                     .is_doc = isDocLanguage(detectLanguage(hit_path)),
                 };
                 tier0_files_len += 1;
@@ -2825,11 +3057,15 @@ pub const Explorer = struct {
         const tier0_files = tier0_files_buf[0..tier0_files_len];
         if (tier0_files.len > 1) {
             const RankCtx = struct {
-                query: []const u8,
+                priors: []const f32,
+                files: []const Tier0File,
                 // Path-prior portion of rerankSignalScore: the canonical-file signals
                 // (basename-stem match, path segment) and demotion penalties. Without it
                 // this fast-path rendered in raw hit-count order, so a high-frequency
                 // non-canonical file outranked the canonical basename match.
+                // Computed ONCE per file into `priors` — the old shape
+                // recomputed both priors (basename + ~10 path-segment scans)
+                // inside the comparator on every comparison.
                 fn prior(path: []const u8, q: []const u8) f32 {
                     const base = std.fs.path.basename(path);
                     const stem_end = std.mem.indexOfScalar(u8, base, '.') orelse base.len;
@@ -2848,17 +3084,31 @@ pub const Explorer = struct {
                         pathHasSegment(path, "third_party")) s *= 0.4;
                     return s;
                 }
-                pub fn lessThan(ctx: @This(), a: Tier0File, b: Tier0File) bool {
-                    const pa = prior(a.path, ctx.query);
-                    const pb = prior(b.path, ctx.query);
+                pub fn lessThan(ctx: @This(), ai: u32, bi: u32) bool {
+                    const pa = ctx.priors[ai];
+                    const pb = ctx.priors[bi];
                     if (pa != pb) return pa > pb;
+                    const a = ctx.files[ai];
+                    const b = ctx.files[bi];
                     if (a.is_doc != b.is_doc) return !a.is_doc;
                     if (a.count != b.count) return a.count > b.count;
                     if (a.first_seen != b.first_seen) return a.first_seen < b.first_seen;
                     return std.mem.lessThan(u8, a.path, b.path);
                 }
             };
-            std.sort.block(Tier0File, tier0_files, RankCtx{ .query = query }, RankCtx.lessThan);
+            var priors_buf: [512]f32 = undefined;
+            var order_buf: [512]u32 = undefined;
+            for (tier0_files, 0..) |stats, i| {
+                priors_buf[i] = RankCtx.prior(stats.path, query);
+                order_buf[i] = @intCast(i);
+            }
+            const order = order_buf[0..tier0_files.len];
+            std.sort.pdq(u32, order, RankCtx{ .priors = priors_buf[0..tier0_files.len], .files = tier0_files }, RankCtx.lessThan);
+            // Apply the permutation back into the value buffer so the render
+            // loops below keep iterating tier0_files directly.
+            var sorted_buf: [512]Tier0File = undefined;
+            for (order, 0..) |src, dst| sorted_buf[dst] = tier0_files[src];
+            @memcpy(tier0_files, sorted_buf[0..tier0_files.len]);
         }
 
         const tier0_per_file_cap: usize = if (tier0_files.len <= 1) max_results else @max(1, max_results / 5);
@@ -2895,7 +3145,7 @@ pub const Explorer = struct {
 
             var target_lines: [256]u32 = undefined;
             var target_count: usize = 0;
-            for (word_hits) |hit| {
+            for (word_hits[stats.first_seen..stats.hits_end]) |hit| {
                 if (target_count >= tier0_per_file_cap) break;
                 if (hit.doc_id != stats.doc_id) continue;
                 if (target_count == 0 or target_lines[target_count - 1] != hit.line_num) {
@@ -2915,17 +3165,11 @@ pub const Explorer = struct {
             }
 
             const content = self.contents.get(stats.path) orelse return false;
-            var target_i: usize = 0;
-            var line_num: u32 = 0;
-            var lines = std.mem.splitScalar(u8, content, '\n');
-            while (lines.next()) |line| {
-                line_num += 1;
-                while (target_i < target_count and target_lines[target_i] < line_num) {
-                    target_i += 1;
-                }
-                if (target_i >= target_count) break;
-                if (target_lines[target_i] != line_num) continue;
-                target_i += 1;
+            var spans: [256]LineOffsetCache.Span = undefined;
+            // OOM building the offset table → bail to the full searchContent
+            // path (caller falls through), which renders the same results.
+            const n_spans = self.line_offsets.lineSpans(stats.path, content, target_lines[0..target_count], &spans) orelse return false;
+            for (spans[0..n_spans]) |line_span| {
                 rendered += 1;
 
                 var count_idx: ?usize = null;
@@ -2948,7 +3192,7 @@ pub const Explorer = struct {
                     }
                 } else {
                     shown += 1;
-                    try w.print("  {s}:{d}: {s}\n", .{ stats.path, line_num, line });
+                    try w.print("  {s}:{d}: {s}\n", .{ stats.path, line_span.line, content[line_span.start..line_span.end] });
                 }
                 if (rendered >= max_results) break;
             }
@@ -2993,7 +3237,6 @@ pub const Explorer = struct {
                 if (gop.value_ptr.* > max_file_hits) max_file_hits = gop.value_ptr.*;
             }
         }
-
         // #550: a single-token query that exactly names a known symbol gets the
         // call-graph distance boost here too (the multi-word BM25 path applies
         // it in searchContentRanked). The symbol_index gate keeps plain word
@@ -3012,28 +3255,99 @@ pub const Explorer = struct {
             graph_dist = self.queryGraphDistances(&gd_terms, ga);
         }
 
-        // #550 signal 2: git co-change. Seeds are the result files that
-        // DEFINE the queried symbol, so plain word queries never trigger the
-        // one-time `git log` shell-out.
+        // Per-unique-path rerank facts, computed ONCE per path instead of per
+        // result — every path-level signal (outline definition scan, path
+        // priors, boost multipliers) is identical for all hits in the same
+        // file. `defines` feeds the co-change seeds (#550 signal 2: seeds are
+        // the result files that DEFINE the queried symbol, so plain word
+        // queries never trigger the one-time `git log` shell-out).
+        // Results arrive grouped by file (every tier emits per-file), so a
+        // consecutive-path fast path resolves most results with one
+        // std.mem.eql instead of a string hash + probe.
+        var facts_by_path = std.StringHashMap(PathRerankFacts).init(allocator);
+        defer facts_by_path.deinit();
+        {
+            var last_path: []const u8 = "";
+            for (result_list.items) |r| {
+                if (last_path.len > 0 and std.mem.eql(u8, r.path, last_path)) continue;
+                last_path = r.path;
+                const gop = facts_by_path.getOrPut(r.path) catch continue;
+                if (gop.found_existing) continue;
+                gop.value_ptr.* = self.pathRerankFacts(r.path, query);
+            }
+        }
+
         var cc_seeds = std.StringHashMap(void).init(allocator);
         defer cc_seeds.deinit();
         if (cio.posixGetenv("CODEDB_NO_COCHANGE") == null) {
-            for (result_list.items) |r| {
-                if (cc_seeds.contains(r.path)) continue;
-                if (self.fileDefinesSymbol(r.path, query)) cc_seeds.put(r.path, {}) catch {};
+            var facts_iter = facts_by_path.iterator();
+            while (facts_iter.next()) |entry| {
+                if (entry.value_ptr.defines) cc_seeds.put(entry.key_ptr.*, {}) catch {};
             }
             if (cc_seeds.count() > 0) self.ensureCoChange();
         }
 
+        // The boost multipliers depend on graph_dist / cc_seeds / the hit
+        // tally, so they fill in a second pass over the deduped path set.
+        var boosts_iter = facts_by_path.iterator();
+        while (boosts_iter.next()) |entry| {
+            const path = entry.key_ptr.*;
+            entry.value_ptr.gd = graphDistanceBoost(graph_dist, path);
+            entry.value_ptr.cc = self.coChangeBoost(&cc_seeds, path);
+            if (lfp.enabled) entry.value_ptr.lfp_mult = lfp.multiplier(file_hit_counts.get(path) orelse 1, max_file_hits);
+            if (sp.enabled) entry.value_ptr.sp_mult = sp.multiplier(self, path);
+        }
+
+        // Same consecutive-path memoization as the facts pass. The cached
+        // facts are copied by VALUE rather than held as a map pointer, so the
+        // copy cannot dangle (the map is no longer mutated here anyway).
+        var score_last_path: []const u8 = "";
+        var score_last_facts: PathRerankFacts = .{};
         for (result_list.items) |*r| {
-            r.score = self.rerankSignalScore(r.*, query);
-            r.score *= graphDistanceBoost(graph_dist, r.path);
-            r.score *= self.coChangeBoost(&cc_seeds, r.path);
-            if (lfp.enabled) r.score *= lfp.multiplier(file_hit_counts.get(r.path) orelse 1, max_file_hits);
-            if (sp.enabled) r.score *= sp.multiplier(self, r.path);
+            if (score_last_path.len == 0 or !std.mem.eql(u8, r.path, score_last_path)) {
+                score_last_facts = facts_by_path.get(r.path) orelse PathRerankFacts{};
+                score_last_path = r.path;
+            }
+            const facts = score_last_facts;
+            const def_line_match = blk: {
+                for (facts.def_lines[0..facts.def_count]) |ln| {
+                    if (ln == r.line_num) break :blk true;
+                }
+                if (facts.def_overflow) {
+                    if (self.outlines.get(r.path)) |outline| {
+                        for (outline.symbols.items) |sym| {
+                            if (sym.line_start == r.line_num and asciiEqlIgnoreCase(sym.name, query)) break :blk true;
+                        }
+                    }
+                }
+                break :blk false;
+            };
+            var score: f32 = countOccurrences(r.line_text, query);
+            if (facts.is_tooling) score = @min(score, 2.0);
+            if (def_line_match) score += 5.0;
+            score += facts.add_boost;
+            if (facts.is_test) score *= 0.6;
+            if (facts.is_example) score *= 0.6;
+            if (facts.is_tooling) score *= 0.5;
+            if (facts.is_vendor) score *= 0.4;
+            // Doc-language penalty: markdown / data files (CHANGELOG.md, design
+            // docs, benchmark logs) often mention an identifier many times in a
+            // single line, which lets per-line frequency dwarf code call sites.
+            // For doc files, more mentions don't reflect more code-relevance —
+            // they reflect prose density. Cap at 1.0 then halve so any code hit
+            // (score >= 1) outranks any doc hit. Symmetric with path-prior.
+            if (facts.is_doc) score = @min(score, 1.0) * 0.5;
+            score *= facts.gd;
+            score *= facts.cc;
+            score *= facts.lfp_mult;
+            score *= facts.sp_mult;
+            r.score = score;
         }
         if (result_list.items.len > 1) {
-            std.sort.block(SearchResult, result_list.items, {}, struct {
+            // pdq, not block: (score, path, line_num) is a total order, so an
+            // unstable sort yields the identical permutation while moving the
+            // fat SearchResult structs far less.
+            std.sort.pdq(SearchResult, result_list.items, {}, struct {
                 pub fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
                     const sa = if (a.score == a.score) a.score else 0;
                     const sb = if (b.score == b.score) b.score else 0;
@@ -3048,71 +3362,86 @@ pub const Explorer = struct {
         return result_list.toOwnedSlice(allocator);
     }
 
-    /// Compose the rerank signals for one search hit (issue #429).
-    fn rerankSignalScore(self: *const Explorer, r: SearchResult, query: []const u8) f32 {
-        var score: f32 = countOccurrences(r.line_text, query);
+    /// Path-level rerank signals (issue #429), computed once per UNIQUE path
+    /// by rerankAndFinalize — every field here is identical for all hits in
+    /// the same file, so recomputing per result (the pre-fix shape) only
+    /// burned time. Per-result composition stays in rerankAndFinalize.
+    const PathRerankFacts = struct {
+        defines: bool = false,
+        def_lines: [16]u32 = undefined,
+        def_count: u8 = 0,
+        def_overflow: bool = false,
+        is_tooling: bool = false,
+        is_test: bool = false,
+        is_example: bool = false,
+        is_vendor: bool = false,
+        is_doc: bool = false,
+        add_boost: f32 = 0,
+        gd: f32 = 1,
+        cc: f32 = 1,
+        lfp_mult: f32 = 1,
+        sp_mult: f32 = 1,
+    };
 
-        // #598: mention-dense tooling files (a bench script repeating the term
-        // six times per line) saturate the per-line count and shrug off the
-        // ×0.5 path prior below. Cap the occurrence BASE for tooling paths
-        // before the stem/symbol boosts so density cannot dominate, while an
-        // eponymous lookup (query 'install' → install/install.sh) still wins
-        // through its +15 stem boost.
-        const is_tooling_path = pathHasSegment(r.path, "bench") or pathHasSegment(r.path, "benchmarks") or
-            pathHasSegment(r.path, "scripts") or pathHasSegment(r.path, "website") or
-            pathHasSegment(r.path, "install");
-        if (is_tooling_path) score = @min(score, 2.0);
-
-        if (self.outlines.get(r.path)) |outline| {
+    fn pathRerankFacts(self: *const Explorer, path: []const u8, query: []const u8) PathRerankFacts {
+        var facts: PathRerankFacts = .{};
+
+        // Symbol-definition facts from the outline: which lines start a
+        // symbol named exactly like the query (case-insensitive). Feeds the
+        // +5 definition-line boost and the co-change seed set (#550).
+        if (self.outlines.get(path)) |outline| {
             for (outline.symbols.items) |sym| {
-                if (sym.line_start == r.line_num and asciiEqlIgnoreCase(sym.name, query)) {
-                    score += 5.0;
-                    break;
+                if (!asciiEqlIgnoreCase(sym.name, query)) continue;
+                facts.defines = true;
+                if (facts.def_count < facts.def_lines.len) {
+                    facts.def_lines[facts.def_count] = sym.line_start;
+                    facts.def_count += 1;
+                } else {
+                    facts.def_overflow = true;
                 }
             }
         }
 
-        const basename = std.fs.path.basename(r.path);
+        // #598: mention-dense tooling files (a bench script repeating the term
+        // six times per line) saturate the per-line count and shrug off the
+        // ×0.5 path prior below. The occurrence BASE is capped for tooling
+        // paths before the stem/symbol boosts so density cannot dominate,
+        // while an eponymous lookup (query 'install' → install/install.sh)
+        // still wins through its +15 stem boost.
+        facts.is_tooling = pathHasSegment(path, "bench") or pathHasSegment(path, "benchmarks") or
+            pathHasSegment(path, "scripts") or pathHasSegment(path, "website") or
+            pathHasSegment(path, "install");
+
+        const basename = std.fs.path.basename(path);
         const stem_end = std.mem.indexOfScalar(u8, basename, '.') orelse basename.len;
         const stem = basename[0..stem_end];
         const stem_contains_query = asciiContainsIgnoreCase(stem, query);
         const query_contains_stem = asciiContainsIgnoreCase(query, stem);
         const stem_related_to_query = stem_contains_query or query_contains_stem;
         if (asciiEqlIgnoreCase(stem, query)) {
-            score += 15.0;
+            facts.add_boost += 15.0;
         } else if (stem_related_to_query) {
-            score += 8.0;
+            facts.add_boost += 8.0;
         }
         // Path-segment match boost: query matches a directory segment in
         // the path (e.g. query="parser" boosts src/parser/foo.zig). Weaker
         // than basename match because the file's own name is a stronger
         // intent signal than the directory it lives in. Skip when basename
         // already matched to avoid double-counting.
-        if (!stem_related_to_query and pathHasSegmentIgnoreCase(r.path, query)) {
-            score += 6.0;
+        if (!stem_related_to_query and pathHasSegmentIgnoreCase(path, query)) {
+            facts.add_boost += 6.0;
         }
 
         // #580: match BM25's pathRelevanceMultiplier — test files identified by
         // BASENAME (tests.zig, test_*.zig, *_tests.zig) are tests even without
         // a test/ directory segment.
-        const is_test_file = pathHasSegment(r.path, "tests") or pathHasSegment(r.path, "test") or
+        facts.is_test = pathHasSegment(path, "tests") or pathHasSegment(path, "test") or
             std.mem.startsWith(u8, basename, "test") or std.mem.indexOf(u8, basename, "_test") != null;
-        if (is_test_file) score *= 0.6;
-        if (pathHasSegment(r.path, "examples") or pathHasSegment(r.path, "example")) score *= 0.6;
-        if (is_tooling_path) score *= 0.5;
-        if (pathHasSegment(r.path, "vendor") or pathHasSegment(r.path, "node_modules") or
-            pathHasSegment(r.path, "third_party")) score *= 0.4;
-        // Doc-language penalty: markdown / data files (CHANGELOG.md, design
-        // docs, benchmark logs) often mention an identifier many times in a
-        // single line, which lets per-line frequency dwarf code call sites.
-        // For doc files, more mentions don't reflect more code-relevance —
-        // they reflect prose density. Cap at 1.0 then halve so any code hit
-        // (score >= 1) outranks any doc hit. Symmetric with path-prior.
-        if (isDocLanguage(detectLanguage(r.path))) {
-            score = @min(score, 1.0) * 0.5;
-        }
-
-        return score;
+        facts.is_example = pathHasSegment(path, "examples") or pathHasSegment(path, "example");
+        facts.is_vendor = pathHasSegment(path, "vendor") or pathHasSegment(path, "node_modules") or
+            pathHasSegment(path, "third_party");
+        facts.is_doc = isDocLanguage(detectLanguage(path));
+        return facts;
     }
 
     /// Append one JSON line per searchContent invocation. v0 logger for the

From 38325f0f755697c6cfc8af0aaed8567c39008f68 Mon Sep 17 00:00:00 2001
From: justrach <54503978+justrach@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:17:45 +0800
Subject: [PATCH 2/4] perf(search): direct-address doc slots, symbol-length
 masks, init-time path classification

Tier 0 grouping (searchContent + renderPlainSearch) dedupes doc_ids through
a direct-address slot array when the doc table is small (or the query heavy
enough to amortize the memset), falling back to the previous map/rescan path
otherwise; candidates now append straight into the array, dropping the
map-to-array copy pass. The candidate order sorts plain u64 keys - entries
land in first-seen order, so the array index doubles as the unique tiebreak
and the post-sort address.

FileOutline gains name_len_mask (bitmask of symbol-name lengths, a
conservative superset maintained at all three append sites including
snapshot load) so fileDefinesSymbol / pathRerankFacts skip whole symbol
scans when no name can match the query length, and path_class
(query-independent #598/#580 tooling/test/example/vendor priors) computed
once at init instead of ~10 path tokenizations per unique path per rerank.

codedb repo, 300-500 iters, c_allocator: middleware 12.4us, database 13us,
error 27us. 814/814 tests, e2e MCP 20/20.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 src/explore.zig  | 287 +++++++++++++++++++++++++++++++----------------
 src/snapshot.zig |   1 +
 2 files changed, 190 insertions(+), 98 deletions(-)

diff --git a/src/explore.zig b/src/explore.zig
index 9046bbdc..f74948cd 100644
--- a/src/explore.zig
+++ b/src/explore.zig
@@ -55,6 +55,41 @@ pub const Symbol = struct {
     detail: ?[]const u8 = null,
 };
 
+/// Query-independent path-prior flags shared by the rerankers — a pure
+/// function of the path, see classifyPath.
+pub const PathClass = struct {
+    is_tooling: bool = false,
+    is_test: bool = false,
+    is_example: bool = false,
+    is_vendor: bool = false,
+};
+
+/// Classify `path` for the rerank path priors. Pure — the result is cached
+/// on FileOutline at init; callers without an outline compute it directly.
+pub fn classifyPath(path: []const u8) PathClass {
+    var c: PathClass = .{};
+    // One tokenize pass over the path instead of one per segment keyword.
+    var iter = std.mem.tokenizeAny(u8, path, "/\\");
+    while (iter.next()) |seg| {
+        // #598: mention-dense tooling files (a bench script repeating the
+        // term six times per line) saturate the per-line count and shrug off
+        // the ×0.5 path prior — the occurrence BASE is capped for tooling.
+        if (std.mem.eql(u8, seg, "bench") or std.mem.eql(u8, seg, "benchmarks") or
+            std.mem.eql(u8, seg, "scripts") or std.mem.eql(u8, seg, "website") or
+            std.mem.eql(u8, seg, "install")) c.is_tooling = true;
+        if (std.mem.eql(u8, seg, "tests") or std.mem.eql(u8, seg, "test")) c.is_test = true;
+        if (std.mem.eql(u8, seg, "examples") or std.mem.eql(u8, seg, "example")) c.is_example = true;
+        if (std.mem.eql(u8, seg, "vendor") or std.mem.eql(u8, seg, "node_modules") or
+            std.mem.eql(u8, seg, "third_party")) c.is_vendor = true;
+    }
+    // #580: match BM25's pathRelevanceMultiplier — test files identified by
+    // BASENAME (tests.zig, test_*.zig, *_tests.zig) are tests even without
+    // a test/ directory segment.
+    const basename = std.fs.path.basename(path);
+    if (std.mem.startsWith(u8, basename, "test") or std.mem.indexOf(u8, basename, "_test") != null) c.is_test = true;
+    return c;
+}
+
 pub const FileOutline = struct {
     path: []const u8,
     language: Language,
@@ -69,6 +104,20 @@ pub const FileOutline = struct {
     /// section, retained by the Explorer) rather than individual allocations,
     /// so deinit must not free them. The ArrayLists themselves are still owned.
     borrows_strings: bool = false,
+    /// Bitmask of symbol-name lengths present (bit min(len, 63)). A
+    /// conservative superset — never cleared on symbol removal — that lets
+    /// per-query definition scans (fileDefinesSymbol, pathRerankFacts) skip
+    /// the whole symbol list when no name could match the query's length.
+    /// Every site that appends to `symbols` must OR in the new name's bit.
+    name_len_mask: u64 = 0,
+    /// Query-independent path-prior classification (issue #429 signals),
+    /// computed once at init — pure function of `path`, so reranks can read
+    /// it instead of re-tokenizing the path on every query.
+    path_class: PathClass = .{},
+
+    pub fn nameLenBit(len: usize) u64 {
+        return @as(u64, 1) << @as(u6, @intCast(@min(len, 63)));
+    }
 
     pub fn init(allocator: std.mem.Allocator, path: []const u8) FileOutline {
         return .{
@@ -77,6 +126,7 @@ pub const FileOutline = struct {
             .line_count = 0,
             .byte_size = 0,
             .allocator = allocator,
+            .path_class = classifyPath(path),
         };
     }
     pub fn deinit(self: *FileOutline) void {
@@ -2028,6 +2078,7 @@ pub const Explorer = struct {
                 .line_end = sym.line_end,
                 .detail = copied_detail,
             });
+            dst.name_len_mask |= FileOutline.nameLenBit(copied_name.len);
         }
         for (src.imports.items) |imp| {
             const copied_import = try allocator.dupe(u8, imp);
@@ -2671,72 +2722,97 @@ pub const Explorer = struct {
                 defines: bool,
             };
 
-            // Keyed by doc_id, not path: with the contiguity fast path below
-            // the map sees one getOrPut per UNIQUE file, and a u32 hash is
-            // several times cheaper than re-hashing a ~40-byte path string.
-            // hitPath only runs on first sight of a doc_id. Invalid postings
-            // (hitPath == "") stay in the map as empty-path tombstones and
-            // are skipped when the candidate list is built.
-            var tier0_files_by_doc = std.AutoHashMap(u32, Tier0File).init(allocator);
-            defer tier0_files_by_doc.deinit();
-            // Pre-size for the unique-file count so high-frequency words
-            // (hundreds of files) don't pay a rehash cascade while inserting.
-            tier0_files_by_doc.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {};
+            // Candidates append straight into this array; deduplication maps
+            // doc_id → array index. Small doc tables get a direct-address
+            // slot array (one indexed load per lookup, no hashing at all);
+            // big tables — or queries too small to amortize the memset —
+            // fall back to a u32-keyed hash map. SLOT_NONE = unseen,
+            // SLOT_INVALID = skipped (freed/out-of-range doc_id, append OOM).
+            var tier0_files: std.ArrayList(Tier0File) = .empty;
+            defer tier0_files.deinit(allocator);
+            try tier0_files.ensureTotalCapacity(allocator, @min(word_hits.len, 1024));
+
+            const SLOT_NONE = std.math.maxInt(u32);
+            const SLOT_INVALID = SLOT_NONE - 1;
+            const ndocs = self.word_index.id_to_path.items.len;
+            const use_slots = ndocs > 0 and (ndocs <= 4096 or (ndocs <= 65536 and word_hits.len >= 512));
+            var slots: []u32 = &.{};
+            defer if (slots.len > 0) allocator.free(slots);
+            var idx_by_doc = std.AutoHashMap(u32, u32).init(allocator);
+            defer idx_by_doc.deinit();
+            if (use_slots) {
+                slots = try allocator.alloc(u32, ndocs);
+                @memset(slots, SLOT_NONE);
+            } else {
+                idx_by_doc.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {};
+            }
 
             // Postings for one file are appended contiguously (indexFile
             // processes whole files), so consecutive hits almost always share
-            // a doc_id — the cached-entry fast path turns one hash getOrPut
-            // PER HIT (~30µs on 1800-hit words) into one per unique file.
-            // last_entry is only dereferenced immediately after it was
-            // (re)set with no intervening map mutation, so it cannot dangle
-            // across a rehash.
+            // a doc_id — the cached-index fast path resolves them without
+            // touching the slot table at all. Indices (not pointers) into
+            // tier0_files stay valid across array growth.
             var last_doc_id: u32 = 0;
-            var last_entry: ?*Tier0File = null;
+            var last_cur: u32 = SLOT_NONE;
             for (word_hits, 0..) |hit, ordinal| {
-                if (last_entry) |entry| {
-                    if (hit.doc_id == last_doc_id) {
-                        entry.count +|= 1;
-                        entry.hits_end = ordinal + 1;
-                        continue;
+                if (last_cur != SLOT_NONE and hit.doc_id == last_doc_id) {
+                    if (last_cur != SLOT_INVALID) {
+                        const e = &tier0_files.items[last_cur];
+                        e.count +|= 1;
+                        e.hits_end = ordinal + 1;
                     }
+                    continue;
                 }
-                const gop = tier0_files_by_doc.getOrPut(hit.doc_id) catch continue;
-                if (!gop.found_existing) {
+                var cur: u32 = blk: {
+                    if (use_slots) {
+                        if (hit.doc_id >= ndocs) break :blk SLOT_INVALID;
+                        break :blk slots[hit.doc_id];
+                    }
+                    break :blk idx_by_doc.get(hit.doc_id) orelse SLOT_NONE;
+                };
+                if (cur == SLOT_NONE) {
                     const hit_path = self.word_index.hitPath(hit);
-                    const is_doc = hit_path.len > 0 and isDocLanguage(detectLanguage(hit_path));
-                    const defines = hit_path.len > 0 and !is_doc and self.fileDefinesSymbol(hit_path, query);
-                    gop.value_ptr.* = .{
-                        .path = hit_path,
-                        .doc_id = hit.doc_id,
-                        .count = 0,
-                        .first_seen = ordinal,
-                        .hits_end = ordinal + 1,
-                        .is_doc = is_doc,
-                        .defines = defines,
-                    };
+                    if (hit_path.len == 0) {
+                        cur = SLOT_INVALID;
+                    } else {
+                        const is_doc = isDocLanguage(detectLanguage(hit_path));
+                        const defines = !is_doc and self.fileDefinesSymbol(hit_path, query);
+                        cur = @intCast(tier0_files.items.len);
+                        tier0_files.append(allocator, .{
+                            .path = hit_path,
+                            .doc_id = hit.doc_id,
+                            .count = 0,
+                            .first_seen = ordinal,
+                            .hits_end = ordinal + 1,
+                            .is_doc = is_doc,
+                            .defines = defines,
+                        }) catch {
+                            cur = SLOT_INVALID;
+                        };
+                    }
+                    if (use_slots) {
+                        if (hit.doc_id < ndocs) slots[hit.doc_id] = cur;
+                    } else {
+                        idx_by_doc.put(hit.doc_id, cur) catch {};
+                    }
+                }
+                if (cur != SLOT_INVALID) {
+                    const e = &tier0_files.items[cur];
+                    e.count +|= 1;
+                    e.hits_end = ordinal + 1;
                 }
-                gop.value_ptr.count +|= 1;
-                gop.value_ptr.hits_end = ordinal + 1;
                 last_doc_id = hit.doc_id;
-                last_entry = gop.value_ptr;
+                last_cur = cur;
             }
-            var tier0_files: std.ArrayList(Tier0File) = .empty;
-            defer tier0_files.deinit(allocator);
-            try tier0_files.ensureTotalCapacity(allocator, tier0_files_by_doc.count());
-            var tier0_iter = tier0_files_by_doc.valueIterator();
-            while (tier0_iter.next()) |stats| {
-                if (stats.path.len == 0) continue;
-                tier0_files.appendAssumeCapacity(stats.*);
-            }
-
-            // Sort 12-byte (key, index) pairs instead of the 48-byte structs.
-            // The old comparator (is_doc asc, defines desc, count desc,
-            // first_seen asc, path asc) packs losslessly into one u64:
-            // first_seen is the ordinal of a file's first hit, unique per
-            // file, so the path tiebreak was unreachable. count saturates at
+
+            // Sort plain u64 keys instead of the 48-byte structs. The old
+            // comparator (is_doc asc, defines desc, count desc, first_seen
+            // asc, path asc) packs losslessly: entries were appended in
+            // first-seen order, so the array index doubles as the first_seen
+            // tiebreak (unique per file — the path tiebreak was unreachable)
+            // AND addresses the entry after sorting. count saturates at
             // 2^30−1; beyond that ties fall to first_seen, same as before.
-            const Tier0Order = struct { key: u64, idx: u32 };
-            var tier0_order: std.ArrayList(Tier0Order) = .empty;
+            var tier0_order: std.ArrayList(u64) = .empty;
             defer tier0_order.deinit(allocator);
             try tier0_order.ensureTotalCapacity(allocator, tier0_files.items.len);
             for (tier0_files.items, 0..) |stats, i| {
@@ -2744,25 +2820,21 @@ pub const Explorer = struct {
                 const key = (@as(u64, @intFromBool(stats.is_doc)) << 63) |
                     (@as(u64, @intFromBool(!stats.defines)) << 62) |
                     ((((1 << 30) - 1) - cnt) << 32) |
-                    @as(u64, @as(u32, @truncate(stats.first_seen)));
-                tier0_order.appendAssumeCapacity(.{ .key = key, .idx = @intCast(i) });
+                    @as(u64, @as(u32, @intCast(i)));
+                tier0_order.appendAssumeCapacity(key);
             }
             if (tier0_order.items.len > 1) {
-                std.sort.pdq(Tier0Order, tier0_order.items, {}, struct {
-                    pub fn lessThan(_: void, a: Tier0Order, b: Tier0Order) bool {
-                        return a.key < b.key;
-                    }
-                }.lessThan);
+                std.sort.pdq(u64, tier0_order.items, {}, std.sort.asc(u64));
             }
             const tier0_per_file_cap: usize = if (tier0_files.items.len <= 1) max_results else @max(1, max_results / 5);
             var tier0_exact_capacity: usize = 0;
-            for (tier0_order.items) |ord| {
-                tier0_exact_capacity += @min(@as(usize, tier0_files.items[ord.idx].count), tier0_per_file_cap);
+            for (tier0_order.items) |key| {
+                tier0_exact_capacity += @min(@as(usize, tier0_files.items[@as(u32, @truncate(key))].count), tier0_per_file_cap);
                 if (tier0_exact_capacity >= max_results) break;
             }
             const use_line_hits = tier0_exact_capacity >= max_results and tier0_per_file_cap <= 256;
-            for (tier0_order.items) |ord| {
-                const stats = tier0_files.items[ord.idx];
+            for (tier0_order.items) |key| {
+                const stats = tier0_files.items[@as(u32, @truncate(key))];
                 if (result_list.items.len >= max_results) break;
                 const ref = self.readContentForSearch(stats.path, allocator) orelse continue;
                 defer ref.deinit();
@@ -3016,11 +3088,24 @@ pub const Explorer = struct {
 
         var tier0_files_buf: [512]Tier0File = undefined;
         var tier0_files_len: usize = 0;
+        // Direct-address doc_id → entry-index slots replace the linear rescan
+        // (O(unique files²) on high-frequency words). Same gating as
+        // searchContent's grouping; when the table is too big to amortize the
+        // memset (or the slot alloc fails), the rescan below handles dedup.
+        const SLOT_NONE = std.math.maxInt(u32);
+        const ndocs = self.word_index.id_to_path.items.len;
+        const use_slots = ndocs > 0 and (ndocs <= 4096 or (ndocs <= 65536 and word_hits.len >= 512));
+        var slots: []u32 = &.{};
+        defer if (slots.len > 0) allocator.free(slots);
+        if (use_slots) {
+            slots = allocator.alloc(u32, ndocs) catch &.{};
+            if (slots.len > 0) @memset(slots, SLOT_NONE);
+        }
         for (word_hits, 0..) |hit, ordinal| {
             // Postings for one file are appended contiguously (indexFile
             // processes whole files), so consecutive hits almost always share
-            // a doc_id — checking the newest entry first turns the O(hits ×
-            // files) linear rescan below into one pass per unique file.
+            // a doc_id — checking the newest entry first resolves them
+            // without touching the slots or the rescan at all.
             if (tier0_files_len > 0 and tier0_files_buf[tier0_files_len - 1].doc_id == hit.doc_id) {
                 tier0_files_buf[tier0_files_len - 1].count +|= 1;
                 tier0_files_buf[tier0_files_len - 1].hits_end = ordinal + 1;
@@ -3030,10 +3115,14 @@ pub const Explorer = struct {
             if (hit_path.len == 0) continue;
 
             var found_i: ?usize = null;
-            for (tier0_files_buf[0..tier0_files_len], 0..) |stats, i| {
-                if (stats.doc_id == hit.doc_id) {
-                    found_i = i;
-                    break;
+            if (slots.len > 0) {
+                if (hit.doc_id < slots.len and slots[hit.doc_id] != SLOT_NONE) found_i = slots[hit.doc_id];
+            } else {
+                for (tier0_files_buf[0..tier0_files_len], 0..) |stats, i| {
+                    if (stats.doc_id == hit.doc_id) {
+                        found_i = i;
+                        break;
+                    }
                 }
             }
             if (found_i) |i| {
@@ -3041,6 +3130,7 @@ pub const Explorer = struct {
                 tier0_files_buf[i].hits_end = ordinal + 1;
             } else {
                 if (tier0_files_len >= tier0_files_buf.len) return false;
+                if (slots.len > 0 and hit.doc_id < slots.len) slots[hit.doc_id] = @intCast(tier0_files_len);
                 tier0_files_buf[tier0_files_len] = .{
                     .doc_id = hit.doc_id,
                     .path = hit_path,
@@ -3389,28 +3479,36 @@ pub const Explorer = struct {
         // Symbol-definition facts from the outline: which lines start a
         // symbol named exactly like the query (case-insensitive). Feeds the
         // +5 definition-line boost and the co-change seed set (#550).
+        // Path-prior classification (#598 tooling cap, #580 test detection,
+        // example/vendor demotion) is query-independent — read it from the
+        // outline, where it was computed once at init; only paths with no
+        // outline (not indexed) classify on the fly.
+        var class: PathClass = .{};
+        var is_doc_lang: bool = undefined;
         if (self.outlines.get(path)) |outline| {
-            for (outline.symbols.items) |sym| {
-                if (!asciiEqlIgnoreCase(sym.name, query)) continue;
-                facts.defines = true;
-                if (facts.def_count < facts.def_lines.len) {
-                    facts.def_lines[facts.def_count] = sym.line_start;
-                    facts.def_count += 1;
-                } else {
-                    facts.def_overflow = true;
+            class = outline.path_class;
+            is_doc_lang = isDocLanguage(outline.language);
+            if (outline.name_len_mask & FileOutline.nameLenBit(query.len) != 0) {
+                for (outline.symbols.items) |sym| {
+                    if (!asciiEqlIgnoreCase(sym.name, query)) continue;
+                    facts.defines = true;
+                    if (facts.def_count < facts.def_lines.len) {
+                        facts.def_lines[facts.def_count] = sym.line_start;
+                        facts.def_count += 1;
+                    } else {
+                        facts.def_overflow = true;
+                    }
                 }
             }
+        } else {
+            class = classifyPath(path);
+            is_doc_lang = isDocLanguage(detectLanguage(path));
         }
-
-        // #598: mention-dense tooling files (a bench script repeating the term
-        // six times per line) saturate the per-line count and shrug off the
-        // ×0.5 path prior below. The occurrence BASE is capped for tooling
-        // paths before the stem/symbol boosts so density cannot dominate,
-        // while an eponymous lookup (query 'install' → install/install.sh)
-        // still wins through its +15 stem boost.
-        facts.is_tooling = pathHasSegment(path, "bench") or pathHasSegment(path, "benchmarks") or
-            pathHasSegment(path, "scripts") or pathHasSegment(path, "website") or
-            pathHasSegment(path, "install");
+        facts.is_tooling = class.is_tooling;
+        facts.is_test = class.is_test;
+        facts.is_example = class.is_example;
+        facts.is_vendor = class.is_vendor;
+        facts.is_doc = is_doc_lang;
 
         const basename = std.fs.path.basename(path);
         const stem_end = std.mem.indexOfScalar(u8, basename, '.') orelse basename.len;
@@ -3432,15 +3530,6 @@ pub const Explorer = struct {
             facts.add_boost += 6.0;
         }
 
-        // #580: match BM25's pathRelevanceMultiplier — test files identified by
-        // BASENAME (tests.zig, test_*.zig, *_tests.zig) are tests even without
-        // a test/ directory segment.
-        facts.is_test = pathHasSegment(path, "tests") or pathHasSegment(path, "test") or
-            std.mem.startsWith(u8, basename, "test") or std.mem.indexOf(u8, basename, "_test") != null;
-        facts.is_example = pathHasSegment(path, "examples") or pathHasSegment(path, "example");
-        facts.is_vendor = pathHasSegment(path, "vendor") or pathHasSegment(path, "node_modules") or
-            pathHasSegment(path, "third_party");
-        facts.is_doc = isDocLanguage(detectLanguage(path));
         return facts;
     }
 
@@ -3661,6 +3750,7 @@ pub const Explorer = struct {
     /// where symbol_index is deferred (#564).
     fn fileDefinesSymbol(self: *const Explorer, path: []const u8, name: []const u8) bool {
         const outline = self.outlines.get(path) orelse return false;
+        if (outline.name_len_mask & FileOutline.nameLenBit(name.len) == 0) return false;
         for (outline.symbols.items) |sym| {
             if (asciiEqlIgnoreCase(sym.name, name)) return true;
         }
@@ -6567,6 +6657,7 @@ fn appendOutlineSymbol(
         .line_end = line_num,
         .detail = detail_copy,
     });
+    outline.name_len_mask |= FileOutline.nameLenBit(name_copy.len);
 }
 
 inline fn resIsIdentStart(c: u8) bool {
diff --git a/src/snapshot.zig b/src/snapshot.zig
index 40d93338..eb45782f 100644
--- a/src/snapshot.zig
+++ b/src/snapshot.zig
@@ -754,6 +754,7 @@ fn loadOutlineStateMap(io: std.Io, snapshot_path: []const u8, allocator: std.mem
                 .line_end = line_end,
                 .detail = detail,
             });
+            outline.name_len_mask |= explore_mod.FileOutline.nameLenBit(name.len);
         }
 
         try result.put(path, outline);

From 3332a8103eb9f91726e3d68244c793910c35007f Mon Sep 17 00:00:00 2001
From: justrach <54503978+justrach@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:46:15 +0800
Subject: [PATCH 3/4] perf(search): rare-byte scan anchors, keyed final sort,
 pointer facts memoization

Sampling profile (10s @ 200k iters) showed the rerank final sort, the
per-result ~120-byte facts copy, and case-insensitive content scanning as
the remaining hot spots.

- indexOfCaseInsensitive and searchInContent's SIMD loop now anchor on the
  needle's RAREST byte (static code-frequency table) instead of byte 0, so
  common-first-letter words (authentication, error) stop verifying at every
  'a'/'e'; searchInContent also widens to 32-byte vectors. Anchor choice
  never affects which matches are found, only the candidate rate.
- The rerank final sort compares one precomputed u64 key per result (score
  as order-isomorphic descending bits, path as its lexicographic rank among
  unique result paths, line_num as in-comparator tiebreak) over u32 indices,
  then applies the permutation in one scratch pass - no string compares or
  40-byte struct moves inside the sort. NaN and -0.0 collapse exactly like
  the float comparator it replaces.
- The score loop holds a pointer to the memoized facts instead of copying
  the struct per result; the lfp file-hit tally gets the same
  consecutive-path fast path as the other per-result maps.

codedb repo, c_allocator, min of 6 runs under load: authentication 31us
(was 50), database 8.4us (was 13), middleware 11.8us, error 27us.
814/814 tests, e2e MCP 20/20.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 src/explore.zig | 273 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 194 insertions(+), 79 deletions(-)

diff --git a/src/explore.zig b/src/explore.zig
index f74948cd..30b4b3fa 100644
--- a/src/explore.zig
+++ b/src/explore.zig
@@ -3321,10 +3321,25 @@ pub const Explorer = struct {
         defer file_hit_counts.deinit();
         var max_file_hits: u32 = 0;
         if (lfp.enabled) {
+            // Results arrive grouped by file, so the consecutive-path fast
+            // path turns one string getOrPut per result into one per unique
+            // file. The cached pointer is refreshed by every getOrPut (the only
+            // map mutation in this loop), so it cannot dangle across a rehash.
+            var last_path: []const u8 = "";
+            var last_count: ?*u32 = null;
             for (result_list.items) |r| {
+                if (last_count) |cnt| {
+                    if (std.mem.eql(u8, r.path, last_path)) {
+                        cnt.* += 1;
+                        if (cnt.* > max_file_hits) max_file_hits = cnt.*;
+                        continue;
+                    }
+                }
                 const gop = try file_hit_counts.getOrPut(r.path);
                 gop.value_ptr.* = if (gop.found_existing) gop.value_ptr.* + 1 else 1;
                 if (gop.value_ptr.* > max_file_hits) max_file_hits = gop.value_ptr.*;
+                last_path = r.path;
+                last_count = gop.value_ptr;
             }
         }
         // #550: a single-token query that exactly names a known symbol gets the
@@ -3388,14 +3403,15 @@ pub const Explorer = struct {
             if (sp.enabled) entry.value_ptr.sp_mult = sp.multiplier(self, path);
         }
 
-        // Same consecutive-path memoization as the facts pass: the facts are
-        // copied by VALUE, so later map lookups can never be invalidated (the
-        // map is no longer mutated here anyway).
+        // Same consecutive-path memoization as the facts pass. Holding a
+        // pointer (instead of copying the ~120-byte facts struct per result)
+        // is safe: the map is not mutated anywhere in this loop.
+        const no_facts = PathRerankFacts{};
         var score_last_path: []const u8 = "";
-        var score_last_facts: PathRerankFacts = .{};
+        var score_last_facts: *const PathRerankFacts = &no_facts;
         for (result_list.items) |*r| {
             if (score_last_path.len == 0 or !std.mem.eql(u8, r.path, score_last_path)) {
-                score_last_facts = facts_by_path.get(r.path) orelse PathRerankFacts{};
+                score_last_facts = facts_by_path.getPtr(r.path) orelse &no_facts;
                 score_last_path = r.path;
             }
             const facts = score_last_facts;
@@ -3434,19 +3450,55 @@ pub const Explorer = struct {
             r.score = score;
         }
         if (result_list.items.len > 1) {
-            // pdq, not block: (score, path, line_num) is a total order, so an
-            // unstable sort yields the identical permutation while moving the
-            // fat SearchResult structs far less.
-            std.sort.pdq(SearchResult, result_list.items, {}, struct {
-                pub fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
-                    const sa = if (a.score == a.score) a.score else 0;
-                    const sb = if (b.score == b.score) b.score else 0;
-                    if (sa != sb) return sa > sb;
-                    const ord = std.mem.order(u8, a.path, b.path);
-                    if (ord != .eq) return ord == .lt;
-                    return a.line_num < b.line_num;
+            // The (score desc, path asc, line asc) order sorts via one
+            // precomputed u64 key per result — score as order-isomorphic
+            // descending bits, path as its lexicographic rank among the
+            // unique result paths — with line_num as the in-comparator
+            // tiebreak. No string compares or 40-byte struct moves inside
+            // the sort loop; the permutation applies in one scratch pass.
+            var unique_paths: std.ArrayList([]const u8) = .empty;
+            defer unique_paths.deinit(allocator);
+            try unique_paths.ensureTotalCapacity(allocator, facts_by_path.count());
+            var path_iter = facts_by_path.keyIterator();
+            while (path_iter.next()) |k| unique_paths.appendAssumeCapacity(k.*);
+            std.sort.pdq([]const u8, unique_paths.items, {}, struct {
+                pub fn lessThan(_: void, a: []const u8, b: []const u8) bool {
+                    return std.mem.lessThan(u8, a, b);
                 }
             }.lessThan);
+            for (unique_paths.items, 0..) |p, rank| {
+                if (facts_by_path.getPtr(p)) |f| f.path_rank = @intCast(rank);
+            }
+
+            const keys = try allocator.alloc(u64, result_list.items.len);
+            defer allocator.free(keys);
+            const order = try allocator.alloc(u32, result_list.items.len);
+            defer allocator.free(order);
+            {
+                var lp: []const u8 = "";
+                var lrank: u32 = std.math.maxInt(u32);
+                for (result_list.items, 0..) |r, ri| {
+                    if (lp.len == 0 or !std.mem.eql(u8, r.path, lp)) {
+                        lrank = if (facts_by_path.getPtr(r.path)) |f| f.path_rank else std.math.maxInt(u32);
+                        lp = r.path;
+                    }
+                    keys[ri] = (@as(u64, scoreDescBits(r.score)) << 32) | lrank;
+                    order[ri] = @intCast(ri);
+                }
+            }
+            const SortCtx = struct {
+                keys: []const u64,
+                items: []const SearchResult,
+                pub fn lessThan(ctx: @This(), a: u32, b: u32) bool {
+                    if (ctx.keys[a] != ctx.keys[b]) return ctx.keys[a] < ctx.keys[b];
+                    return ctx.items[a].line_num < ctx.items[b].line_num;
+                }
+            };
+            std.sort.pdq(u32, order, SortCtx{ .keys = keys, .items = result_list.items }, SortCtx.lessThan);
+            const scratch = try allocator.alloc(SearchResult, result_list.items.len);
+            defer allocator.free(scratch);
+            @memcpy(scratch, result_list.items);
+            for (order, 0..) |src, dst| result_list.items[dst] = scratch[src];
         }
         self.appendRerankTrace(query, result_list.items);
         return result_list.toOwnedSlice(allocator);
@@ -3471,8 +3523,24 @@ pub const Explorer = struct {
         cc: f32 = 1,
         lfp_mult: f32 = 1,
         sp_mult: f32 = 1,
+        /// Lexicographic rank of this path among the result set's unique
+        /// paths — assigned by rerankAndFinalize just before the final sort
+        /// so the sort key replaces per-comparison string compares.
+        path_rank: u32 = 0,
     };
 
+    /// Map a score to bits whose UNSIGNED ascending order equals descending
+    /// float order — the standard sign-flip trick, with NaN collapsed to 0
+    /// and -0.0 to +0.0 so it ties exactly like the float comparator it
+    /// replaces (`if (sa != sb) return sa > sb` with NaN already mapped).
+    fn scoreDescBits(score: f32) u32 {
+        var v: f32 = if (score == score) score else 0;
+        if (v == 0) v = 0;
+        const b: u32 = @bitCast(v);
+        const asc: u32 = if (b & 0x8000_0000 != 0) ~b else b | 0x8000_0000;
+        return ~asc;
+    }
+
     fn pathRerankFacts(self: *const Explorer, path: []const u8, query: []const u8) PathRerankFacts {
         var facts: PathRerankFacts = .{};
 
@@ -6334,53 +6402,68 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
         query_lower_buf[i] = if (c >= 'A' and c <= 'Z') c + 32 else c;
     }
     const query_lower = query_lower_buf[0..query.len];
-    const first_lower: u8 = query_lower[0];
-    const first_upper: u8 = if (first_lower >= 'a' and first_lower <= 'z') first_lower - 32 else first_lower;
+    // Anchor the scan on the needle's RAREST byte (see code_char_freq) — for
+    // common-first-letter words this cuts verify calls by an order of
+    // magnitude versus always anchoring on byte 0. A match starting at s has
+    // its anchor at s + anchor, so candidate positions live in
+    // [anchor, end + anchor).
+    var anchor: usize = 0;
+    var anchor_rarity: u8 = std.math.maxInt(u8);
+    for (query_lower, 0..) |c, j| {
+        if (code_char_freq[c] < anchor_rarity) {
+            anchor_rarity = code_char_freq[c];
+            anchor = j;
+        }
+    }
+    const anchor_lower: u8 = query_lower[anchor];
+    const anchor_upper: u8 = if (anchor_lower >= 'a' and anchor_lower <= 'z') anchor_lower - 32 else anchor_lower;
     var file_hits: usize = 0;
-    var pos: usize = 0;
     const end = content.len - query.len + 1;
+    const scan_end = end + anchor;
+    var pos: usize = anchor;
 
     // Track line number incrementally.
     var current_line: u32 = 1;
     var current_line_start: usize = 0;
 
-    // SIMD constants — 16-byte NEON/SSE vectors.
-    const VW = 16;
+    // SIMD constants — 32-byte vectors (2x NEON / 1x AVX2 per compare).
+    const VW = 32;
     const Vec = @Vector(VW, u8);
-    const splat_lo: Vec = @splat(first_lower);
-    const splat_hi: Vec = @splat(first_upper);
+    const splat_lo: Vec = @splat(anchor_lower);
+    const splat_hi: Vec = @splat(anchor_upper);
 
-    scan: while (pos < end) {
-        // ── SIMD path: process full 16-byte chunks ──
-        if (pos + VW <= end) {
+    scan: while (pos < scan_end) {
+        // ── SIMD path: process full chunks ──
+        if (pos + VW <= scan_end) {
             const chunk: Vec = content[pos..][0..VW].*;
             const eq_lo: @Vector(VW, u1) = @bitCast(chunk == splat_lo);
             const eq_hi: @Vector(VW, u1) = @bitCast(chunk == splat_hi);
-            var mask: u16 = @bitCast(eq_lo | eq_hi);
+            var mask: u32 = @bitCast(eq_lo | eq_hi);
 
             if (mask == 0) {
                 pos += VW;
                 continue;
             }
 
-            // Process ALL first-byte candidates in this chunk without reloading.
+            // Process ALL anchor candidates in this chunk without reloading.
             while (mask != 0) {
                 const offset: usize = @ctz(mask);
                 const cand = pos + offset;
-                if (cand >= end) break;
+                if (cand >= scan_end) break;
+                const start = cand - anchor;
 
-                if (matchAtCaseInsensitive(content, cand, query_lower)) {
+                if (matchAtCaseInsensitive(content, start, query_lower)) {
                     // ── Match found ──
-                    while (current_line_start < cand) {
+                    while (current_line_start < start) {
                         if (simdIndexOfNewline(content, current_line_start)) |nl| {
-                            if (nl < cand) {
+                            if (nl < start) {
                                 current_line += 1;
                                 current_line_start = nl + 1;
                             } else break;
                         } else break;
                     }
                     const line_start = current_line_start;
-                    const line_end = simdIndexOfNewline(content, cand) orelse content.len;
+                    const line_end = simdIndexOfNewline(content, start) orelse content.len;
 
                     const line_text = try allocator.dupe(u8, content[line_start..line_end]);
                     errdefer allocator.free(line_text);
@@ -6392,8 +6475,10 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
 
                     current_line += 1;
                     current_line_start = line_end + 1;
-                    pos = line_end + 1;
-                    if (pos >= end) return;
+                    // One result per line: the next match must START after
+                    // the line, so its anchor sits at least `anchor` later.
+                    pos = line_end + 1 + anchor;
+                    if (pos >= scan_end) return;
                     continue :scan;
                 }
                 mask &= mask - 1; // clear lowest bit, try next candidate in chunk
@@ -6402,32 +6487,35 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all
             continue;
         }
 
-        // ── Scalar tail for last <16 bytes ──
+        // ── Scalar tail for the last <VW bytes ──
         const c = content[pos];
-        if ((c == first_lower or c == first_upper) and matchAtCaseInsensitive(content, pos, query_lower)) {
-            while (current_line_start < pos) {
-                if (simdIndexOfNewline(content, current_line_start)) |nl| {
-                    if (nl < pos) {
-                        current_line += 1;
-                        current_line_start = nl + 1;
+        if (c == anchor_lower or c == anchor_upper) {
+            const start = pos - anchor;
+            if (matchAtCaseInsensitive(content, start, query_lower)) {
+                while (current_line_start < start) {
+                    if (simdIndexOfNewline(content, current_line_start)) |nl| {
+                        if (nl < start) {
+                            current_line += 1;
+                            current_line_start = nl + 1;
+                        } else break;
                     } else break;
-                } else break;
-            }
-            const line_start = current_line_start;
-            const line_end = simdIndexOfNewline(content, pos) orelse content.len;
+                }
+                const line_start = current_line_start;
+                const line_end = simdIndexOfNewline(content, start) orelse content.len;
 
-            const line_text = try allocator.dupe(u8, content[line_start..line_end]);
-            errdefer allocator.free(line_text);
-            const path_copy = try allocator.dupe(u8, path);
-            errdefer allocator.free(path_copy);
-            try result_list.append(allocator, .{ .path = path_copy, .line_num = current_line, .line_text = line_text });
-            file_hits += 1;
-            if (file_hits >= max_per_file or result_list.items.len >= max_results) return;
+                const line_text = try allocator.dupe(u8, content[line_start..line_end]);
+                errdefer allocator.free(line_text);
+                const path_copy = try allocator.dupe(u8, path);
+                errdefer allocator.free(path_copy);
+                try result_list.append(allocator, .{ .path = path_copy, .line_num = current_line, .line_text = line_text });
+                file_hits += 1;
+                if (file_hits >= max_per_file or result_list.items.len >= max_results) return;
 
-            current_line += 1;
-            current_line_start = line_end + 1;
-            pos = line_end + 1;
-            continue;
+                current_line += 1;
+                current_line_start = line_end + 1;
+                pos = line_end + 1 + anchor;
+                continue;
+            }
         }
         pos += 1;
     }
@@ -6512,43 +6600,70 @@ pub fn regexMatch(haystack: []const u8, pattern: []const u8) bool {
     return false;
 }
 
+/// Rough frequency of each lowercase byte in source code, used only to pick
+/// the scan anchor in indexOfCaseInsensitive and searchInContent — lower is
+/// rarer. Anchor choice never affects correctness, only the verify rate.
+const code_char_freq: [256]u8 = blk: {
+    var t = [_]u8{3} ** 256;
+    const ranks = "zqjxkvbywgpfmucdlhrsnioate";
+    for (ranks, 0..) |c, i| t[c] = @intCast(i + 4);
+    for ('0'..'9' + 1) |c| t[c] = 5;
+    t['_'] = 20;
+    t['.'] = 14;
+    break :blk t;
+};
+
 fn indexOfCaseInsensitive(haystack: []const u8, needle: []const u8) ?usize {
     if (needle.len == 0) return 0;
     if (needle.len > haystack.len) return null;
 
-    // Pre-compute lowered first byte + second byte for fast skip.
-    const first_lower: u8 = if (needle[0] >= 'A' and needle[0] <= 'Z') needle[0] + 32 else needle[0];
-    const first_upper: u8 = if (needle[0] >= 'a' and needle[0] <= 'z') needle[0] - 32 else needle[0];
     const end = haystack.len - needle.len + 1;
 
     if (needle.len == 1) {
-        // Single-char: use std.mem.indexOfAny for speed.
-        const chars = [2]u8{ first_lower, first_upper };
+        const c = needle[0];
+        const lower: u8 = if (c >= 'A' and c <= 'Z') c + 32 else c;
+        const upper: u8 = if (lower >= 'a' and lower <= 'z') lower - 32 else lower;
+        const chars = [2]u8{ lower, upper };
         return std.mem.indexOfAny(u8, haystack, &chars);
     }
 
-    const second_lower: u8 = if (needle[1] >= 'A' and needle[1] <= 'Z') needle[1] + 32 else needle[1];
-
-    var i: usize = 0;
-    while (i < end) : (i += 1) {
-        // Fast reject: check first byte, then second byte before full compare.
-        const c0 = haystack[i];
-        if (c0 != first_lower and c0 != first_upper) continue;
-        const c1 = haystack[i + 1];
-        const c1_lower = if (c1 >= 'A' and c1 <= 'Z') c1 + 32 else c1;
-        if (c1_lower != second_lower) continue;
-
-        // First two bytes match — verify the rest.
+    // Jump between candidates of the needle's RAREST byte with
+    // std.mem.indexOfAnyPos instead of a hand-rolled per-byte loop — content
+    // scans (searchInContent, Tier 1 candidate verification) spend most of
+    // their time here, and anchoring on a rare letter (a 'k' or 'x') rather
+    // than position 0 keeps the verify rate low for common-first-letter words.
+    var anchor: usize = 0;
+    var anchor_freq: u8 = std.math.maxInt(u8);
+    for (needle, 0..) |c, j| {
+        const cl: u8 = if (c >= 'A' and c <= 'Z') c + 32 else c;
+        if (code_char_freq[cl] < anchor_freq) {
+            anchor_freq = code_char_freq[cl];
+            anchor = j;
+        }
+    }
+    const ac = needle[anchor];
+    const anchor_lower: u8 = if (ac >= 'A' and ac <= 'Z') ac + 32 else ac;
+    const anchor_upper: u8 = if (anchor_lower >= 'a' and anchor_lower <= 'z') anchor_lower - 32 else anchor_lower;
+    const anchor_chars = [2]u8{ anchor_lower, anchor_upper };
+
+    // A match starting at s puts the anchor at s + anchor, so anchor
+    // candidates live in [anchor, end + anchor).
+    const scan = haystack[0 .. end - 1 + anchor + 1];
+    var i: usize = anchor;
+    while (std.mem.indexOfAnyPos(u8, scan, i, &anchor_chars)) |pos| {
+        i = pos + 1;
+        const start = pos - anchor;
         var match = true;
-        for (2..needle.len) |j| {
-            const hc = if (haystack[i + j] >= 'A' and haystack[i + j] <= 'Z') haystack[i + j] + 32 else haystack[i + j];
-            const nc = if (needle[j] >= 'A' and needle[j] <= 'Z') needle[j] + 32 else needle[j];
-            if (hc != nc) {
+        for (needle, 0..) |nc0, j| {
+            const hc = haystack[start + j];
+            const hl: u8 = if (hc >= 'A' and hc <= 'Z') hc + 32 else hc;
+            const nl: u8 = if (nc0 >= 'A' and nc0 <= 'Z') nc0 + 32 else nc0;
+            if (hl != nl) {
                 match = false;
                 break;
             }
         }
-        if (match) return i;
+        if (match) return start;
     }
     return null;
 }

From 6b17e4ee5edc6f482a9a2932fcb92fc17836831f Mon Sep 17 00:00:00 2001
From: justrach <54503978+justrach@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:55:38 +0800
Subject: [PATCH 4/4] perf(search): run-at-a-time posting grouping, single
 outline fetch per candidate

The Tier 0 hit list decomposes into contiguous runs of one doc_id each, so
all three grouping passes (searchContent, renderPlainSearch, Tier 1's
hits_per_file tally) now scan to each run boundary first and touch the slot
table / hash map / entry ONCE per run instead of once per hit - the per-hit
work drops to a doc_id compare.

searchContent's candidate metadata also collapses to a single outlines.get
per unique file: language comes from the outline (computed at init via the
same detectLanguage) and the defines scan runs inline behind the
name_len_mask gate, where the old shape hashed the path twice
(detectLanguage + fileDefinesSymbol's own lookup).

codedb repo, c_allocator, min of 5 runs: error 19.6us (was 27),
middleware 10.2us, database 7.4us, authentication 28.7us, webhook 16.6us.
814/814 tests, e2e MCP 20/20.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 src/explore.zig | 145 ++++++++++++++++++++++++++----------------------
 1 file changed, 78 insertions(+), 67 deletions(-)

diff --git a/src/explore.zig b/src/explore.zig
index 30b4b3fa..ee1226b5 100644
--- a/src/explore.zig
+++ b/src/explore.zig
@@ -2748,42 +2748,59 @@ pub const Explorer = struct {
             }
 
             // Postings for one file are appended contiguously (indexFile
-            // processes whole files), so consecutive hits almost always share
-            // a doc_id — the cached-index fast path resolves them without
-            // touching the slot table at all. Indices (not pointers) into
+            // processes whole files), so the hit list decomposes into runs of
+            // one doc_id each — group run-at-a-time: scan to the run boundary
+            // first, then touch the slot table and the entry ONCE per run
+            // instead of once per hit. Indices (not pointers) into
             // tier0_files stay valid across array growth.
-            var last_doc_id: u32 = 0;
-            var last_cur: u32 = SLOT_NONE;
-            for (word_hits, 0..) |hit, ordinal| {
-                if (last_cur != SLOT_NONE and hit.doc_id == last_doc_id) {
-                    if (last_cur != SLOT_INVALID) {
-                        const e = &tier0_files.items[last_cur];
-                        e.count +|= 1;
-                        e.hits_end = ordinal + 1;
-                    }
-                    continue;
-                }
+            var run_start: usize = 0;
+            while (run_start < word_hits.len) {
+                const doc_id = word_hits[run_start].doc_id;
+                var run_end = run_start + 1;
+                while (run_end < word_hits.len and word_hits[run_end].doc_id == doc_id) run_end += 1;
+                defer run_start = run_end;
+
                 var cur: u32 = blk: {
                     if (use_slots) {
-                        if (hit.doc_id >= ndocs) break :blk SLOT_INVALID;
-                        break :blk slots[hit.doc_id];
+                        if (doc_id >= ndocs) break :blk SLOT_INVALID;
+                        break :blk slots[doc_id];
                     }
-                    break :blk idx_by_doc.get(hit.doc_id) orelse SLOT_NONE;
+                    break :blk idx_by_doc.get(doc_id) orelse SLOT_NONE;
                 };
                 if (cur == SLOT_NONE) {
-                    const hit_path = self.word_index.hitPath(hit);
+                    const hit_path = self.word_index.hitPath(word_hits[run_start]);
                     if (hit_path.len == 0) {
                         cur = SLOT_INVALID;
                     } else {
-                        const is_doc = isDocLanguage(detectLanguage(hit_path));
-                        const defines = !is_doc and self.fileDefinesSymbol(hit_path, query);
+                        // One outline fetch serves both signals: language is
+                        // detectLanguage(path) computed at outline init, and
+                        // the defines scan is gated by the symbol-name-length
+                        // mask — the old shape hashed the path twice
+                        // (detectLanguage + fileDefinesSymbol's own get).
+                        // Files with no outline never define (same as
+                        // fileDefinesSymbol's `orelse return false`).
+                        var is_doc = false;
+                        var defines = false;
+                        if (self.outlines.get(hit_path)) |o| {
+                            is_doc = isDocLanguage(o.language);
+                            if (!is_doc and o.name_len_mask & FileOutline.nameLenBit(query.len) != 0) {
+                                for (o.symbols.items) |sym| {
+                                    if (asciiEqlIgnoreCase(sym.name, query)) {
+                                        defines = true;
+                                        break;
+                                    }
+                                }
+                            }
+                        } else {
+                            is_doc = isDocLanguage(detectLanguage(hit_path));
+                        }
                         cur = @intCast(tier0_files.items.len);
                         tier0_files.append(allocator, .{
                             .path = hit_path,
-                            .doc_id = hit.doc_id,
+                            .doc_id = doc_id,
                             .count = 0,
-                            .first_seen = ordinal,
-                            .hits_end = ordinal + 1,
+                            .first_seen = run_start,
+                            .hits_end = run_end,
                             .is_doc = is_doc,
                             .defines = defines,
                         }) catch {
@@ -2791,18 +2808,16 @@ pub const Explorer = struct {
                         };
                     }
                     if (use_slots) {
-                        if (hit.doc_id < ndocs) slots[hit.doc_id] = cur;
+                        if (doc_id < ndocs) slots[doc_id] = cur;
                     } else {
-                        idx_by_doc.put(hit.doc_id, cur) catch {};
+                        idx_by_doc.put(doc_id, cur) catch {};
                     }
                 }
                 if (cur != SLOT_INVALID) {
                     const e = &tier0_files.items[cur];
-                    e.count +|= 1;
-                    e.hits_end = ordinal + 1;
+                    e.count +|= @intCast(@min(run_end - run_start, std.math.maxInt(u32)));
+                    e.hits_end = run_end;
                 }
-                last_doc_id = hit.doc_id;
-                last_cur = cur;
             }
 
             // Sort plain u64 keys instead of the 48-byte structs. The old
@@ -2932,27 +2947,21 @@ pub const Explorer = struct {
                 var hits_per_file = std.StringHashMap(u32).init(allocator);
                 defer hits_per_file.deinit();
                 hits_per_file.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {};
-                // Same contiguous-posting fast path as Tier 0's grouping:
-                // consecutive hits share a doc_id, so the per-hit hitPath +
-                // string getOrPut collapses to once per unique file. The
-                // cached pointer is only dereferenced immediately after being
-                // (re)set, so it cannot dangle across a rehash.
-                var hpf_last_doc: u32 = 0;
-                var hpf_last: ?*u32 = null;
-                for (word_hits) |hit| {
-                    if (hpf_last) |cnt| {
-                        if (hit.doc_id == hpf_last_doc) {
-                            cnt.* += 1;
-                            continue;
-                        }
-                    }
-                    const hp = self.word_index.hitPath(hit);
+                // Same contiguous-posting decomposition as Tier 0's grouping:
+                // the hit list is runs of one doc_id each, so hitPath + the
+                // string getOrPut run once per file run, not per hit.
+                var hpf_run_start: usize = 0;
+                while (hpf_run_start < word_hits.len) {
+                    const hpf_doc = word_hits[hpf_run_start].doc_id;
+                    var hpf_run_end = hpf_run_start + 1;
+                    while (hpf_run_end < word_hits.len and word_hits[hpf_run_end].doc_id == hpf_doc) hpf_run_end += 1;
+                    defer hpf_run_start = hpf_run_end;
+
+                    const hp = self.word_index.hitPath(word_hits[hpf_run_start]);
                     if (hp.len == 0) continue;
                     const gop_h = try hits_per_file.getOrPut(hp);
                     if (!gop_h.found_existing) gop_h.value_ptr.* = 0;
-                    gop_h.value_ptr.* += 1;
-                    hpf_last_doc = hit.doc_id;
-                    hpf_last = gop_h.value_ptr;
+                    gop_h.value_ptr.* += @intCast(@min(hpf_run_end - hpf_run_start, std.math.maxInt(u32)));
                 }
                 const SortCtx = struct {
                     contents: *ContentCache,
@@ -3101,42 +3110,44 @@ pub const Explorer = struct {
             slots = allocator.alloc(u32, ndocs) catch &.{};
             if (slots.len > 0) @memset(slots, SLOT_NONE);
         }
-        for (word_hits, 0..) |hit, ordinal| {
-            // Postings for one file are appended contiguously (indexFile
-            // processes whole files), so consecutive hits almost always share
-            // a doc_id — checking the newest entry first resolves them
-            // without touching the slots or the rescan at all.
-            if (tier0_files_len > 0 and tier0_files_buf[tier0_files_len - 1].doc_id == hit.doc_id) {
-                tier0_files_buf[tier0_files_len - 1].count +|= 1;
-                tier0_files_buf[tier0_files_len - 1].hits_end = ordinal + 1;
-                continue;
-            }
-            const hit_path = self.word_index.hitPath(hit);
+        // Postings for one file are appended contiguously (indexFile
+        // processes whole files), so the hit list decomposes into runs of one
+        // doc_id each — group run-at-a-time: scan to the run boundary first,
+        // then touch the slots/rescan and the entry ONCE per run.
+        var run_start: usize = 0;
+        while (run_start < word_hits.len) {
+            const doc_id = word_hits[run_start].doc_id;
+            var run_end = run_start + 1;
+            while (run_end < word_hits.len and word_hits[run_end].doc_id == doc_id) run_end += 1;
+            defer run_start = run_end;
+
+            const hit_path = self.word_index.hitPath(word_hits[run_start]);
             if (hit_path.len == 0) continue;
 
             var found_i: ?usize = null;
             if (slots.len > 0) {
-                if (hit.doc_id < slots.len and slots[hit.doc_id] != SLOT_NONE) found_i = slots[hit.doc_id];
+                if (doc_id < slots.len and slots[doc_id] != SLOT_NONE) found_i = slots[doc_id];
             } else {
                 for (tier0_files_buf[0..tier0_files_len], 0..) |stats, i| {
-                    if (stats.doc_id == hit.doc_id) {
+                    if (stats.doc_id == doc_id) {
                         found_i = i;
                         break;
                     }
                 }
             }
+            const run_count: u32 = @intCast(@min(run_end - run_start, std.math.maxInt(u32)));
             if (found_i) |i| {
-                tier0_files_buf[i].count +|= 1;
-                tier0_files_buf[i].hits_end = ordinal + 1;
+                tier0_files_buf[i].count +|= run_count;
+                tier0_files_buf[i].hits_end = run_end;
             } else {
                 if (tier0_files_len >= tier0_files_buf.len) return false;
-                if (slots.len > 0 and hit.doc_id < slots.len) slots[hit.doc_id] = @intCast(tier0_files_len);
+                if (slots.len > 0 and doc_id < slots.len) slots[doc_id] = @intCast(tier0_files_len);
                 tier0_files_buf[tier0_files_len] = .{
-                    .doc_id = hit.doc_id,
+                    .doc_id = doc_id,
                     .path = hit_path,
-                    .count = 1,
-                    .first_seen = ordinal,
-                    .hits_end = ordinal + 1,
+                    .count = run_count,
+                    .first_seen = run_start,
+                    .hits_end = run_end,
                     .is_doc = isDocLanguage(detectLanguage(hit_path)),
                 };
                 tier0_files_len += 1;